From 959d00e9dbe4cfcf4a63bb655ac2c29a5e579246 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 5 Apr 2019 19:20:30 -0400 Subject: [PATCH] Use Append rather than MergeAppend for scanning ordered partitions. If we need ordered output from a scan of a partitioned table, but the ordering matches the partition ordering, then we don't need to use a MergeAppend to combine the pre-ordered per-partition scan results: a plain Append will produce the same results. This both saves useless comparison work inside the MergeAppend proper, and allows us to start returning tuples after istarting up just the first child node not all of them. However, all is not peaches and cream, because if some of the child nodes have high startup costs then there will be big discontinuities in the tuples-returned-versus-elapsed-time curve. The planner's cost model cannot handle that (yet, anyway). If we model the Append's startup cost as being just the first child's startup cost, we may drastically underestimate the cost of fetching slightly more tuples than are available from the first child. Since we've had bad experiences with over-optimistic choices of "fast start" plans for ORDER BY LIMIT queries, that seems scary. As a klugy workaround, set the startup cost estimate for an ordered Append to be the sum of its children's startup costs (as MergeAppend would). This doesn't really describe reality, but it's less likely to cause a bad plan choice than an underestimated startup cost would. In practice, the cases where we really care about this optimization will have child plans that are IndexScans with zero startup cost, so that the overly conservative estimate is still just zero. David Rowley, reviewed by Julien Rouhaud and Antonin Houska Discussion: https://postgr.es/m/CAKJS1f-hAqhPLRk_RaSFTgYxd=Tz5hA7kQ2h4-DhJufQk8TGuw@mail.gmail.com --- src/backend/executor/execProcnode.c | 13 + src/backend/nodes/outfuncs.c | 1 + src/backend/optimizer/path/allpaths.c | 237 +++++++++++++--- src/backend/optimizer/path/costsize.c | 81 +++++- src/backend/optimizer/path/joinrels.c | 2 +- src/backend/optimizer/path/pathkeys.c | 164 ++++++++++++ src/backend/optimizer/plan/createplan.c | 126 +++++++-- src/backend/optimizer/plan/planner.c | 4 +- src/backend/optimizer/prep/prepunion.c | 7 +- src/backend/optimizer/util/pathnode.c | 30 ++- src/backend/partitioning/partbounds.c | 64 +++++ src/include/nodes/pathnodes.h | 9 +- src/include/optimizer/pathnode.h | 2 +- src/include/optimizer/paths.h | 2 + src/include/partitioning/partbounds.h | 1 + src/test/regress/expected/inherit.out | 253 +++++++++++++++++- src/test/regress/expected/partition_prune.out | 64 +++-- src/test/regress/sql/inherit.sql | 109 +++++++- src/test/regress/sql/partition_prune.sql | 10 +- 19 files changed, 1044 insertions(+), 135 deletions(-) diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index 4ab290360a..c227282975 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -840,6 +840,19 @@ ExecSetTupleBound(int64 tuples_needed, PlanState *child_node) sortState->bound = tuples_needed; } } + else if (IsA(child_node, AppendState)) + { + /* + * If it is an Append, we can apply the bound to any nodes that are + * children of the Append, since the Append surely need read no more + * than that many tuples from any one input. + */ + AppendState *aState = (AppendState *) child_node; + int i; + + for (i = 0; i < aState->as_nplans; i++) + ExecSetTupleBound(tuples_needed, aState->appendplans[i]); + } else if (IsA(child_node, MergeAppendState)) { /* diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 3282be0e4b..82ca6826ab 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -1847,6 +1847,7 @@ _outAppendPath(StringInfo str, const AppendPath *node) WRITE_NODE_FIELD(partitioned_rels); WRITE_NODE_FIELD(subpaths); WRITE_INT_FIELD(first_partial_path); + WRITE_FLOAT_FIELD(limit_tuples, "%.0f"); } static void diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 727da33338..af05bb7511 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -44,6 +44,7 @@ #include "optimizer/tlist.h" #include "parser/parse_clause.h" #include "parser/parsetree.h" +#include "partitioning/partbounds.h" #include "partitioning/partprune.h" #include "rewrite/rewriteManip.h" #include "utils/lsyscache.h" @@ -96,15 +97,16 @@ static void set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTblEntry *rte); static void set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTblEntry *rte); -static void generate_mergeappend_paths(PlannerInfo *root, RelOptInfo *rel, - List *live_childrels, - List *all_child_pathkeys, - List *partitioned_rels); +static void generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel, + List *live_childrels, + List *all_child_pathkeys, + List *partitioned_rels); static Path *get_cheapest_parameterized_child_path(PlannerInfo *root, RelOptInfo *rel, Relids required_outer); static void accumulate_append_subpath(Path *path, List **subpaths, List **special_subpaths); +static Path *get_singleton_append_subpath(Path *path); static void set_dummy_rel_pathlist(RelOptInfo *rel); static void set_subquery_pathlist(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTblEntry *rte); @@ -1520,7 +1522,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, */ if (subpaths_valid) add_path(rel, (Path *) create_append_path(root, rel, subpaths, NIL, - NULL, 0, false, + NIL, NULL, 0, false, partitioned_rels, -1)); /* @@ -1562,7 +1564,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, /* Generate a partial append path. */ appendpath = create_append_path(root, rel, NIL, partial_subpaths, - NULL, parallel_workers, + NIL, NULL, parallel_workers, enable_parallel_append, partitioned_rels, -1); @@ -1612,19 +1614,19 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, appendpath = create_append_path(root, rel, pa_nonpartial_subpaths, pa_partial_subpaths, - NULL, parallel_workers, true, + NIL, NULL, parallel_workers, true, partitioned_rels, partial_rows); add_partial_path(rel, (Path *) appendpath); } /* - * Also build unparameterized MergeAppend paths based on the collected + * Also build unparameterized ordered append paths based on the collected * list of child pathkeys. */ if (subpaths_valid) - generate_mergeappend_paths(root, rel, live_childrels, - all_child_pathkeys, - partitioned_rels); + generate_orderedappend_paths(root, rel, live_childrels, + all_child_pathkeys, + partitioned_rels); /* * Build Append paths for each parameterization seen among the child rels. @@ -1674,7 +1676,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, if (subpaths_valid) add_path(rel, (Path *) create_append_path(root, rel, subpaths, NIL, - required_outer, 0, false, + NIL, required_outer, 0, false, partitioned_rels, -1)); } @@ -1703,8 +1705,8 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, continue; appendpath = create_append_path(root, rel, NIL, list_make1(path), - NULL, path->parallel_workers, - true, + NIL, NULL, + path->parallel_workers, true, partitioned_rels, partial_rows); add_partial_path(rel, (Path *) appendpath); } @@ -1712,17 +1714,21 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, } /* - * generate_mergeappend_paths - * Generate MergeAppend paths for an append relation + * generate_orderedappend_paths + * Generate ordered append paths for an append relation * - * Generate a path for each ordering (pathkey list) appearing in + * Usually we generate MergeAppend paths here, but there are some special + * cases where we can generate simple Append paths, because the subpaths + * can provide tuples in the required order already. + * + * We generate a path for each ordering (pathkey list) appearing in * all_child_pathkeys. * * We consider both cheapest-startup and cheapest-total cases, ie, for each * interesting ordering, collect all the cheapest startup subpaths and all the - * cheapest total paths, and build a MergeAppend path for each case. + * cheapest total paths, and build a suitable path for each case. * - * We don't currently generate any parameterized MergeAppend paths. While + * We don't currently generate any parameterized ordered paths here. While * it would not take much more code here to do so, it's very unclear that it * is worth the planning cycles to investigate such paths: there's little * use for an ordered path on the inside of a nestloop. In fact, it's likely @@ -1731,17 +1737,52 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, * and a parameterized MergeAppend is going to be more expensive than the * corresponding parameterized Append path. If we ever try harder to support * parameterized mergejoin plans, it might be worth adding support for - * parameterized MergeAppends to feed such joins. (See notes in + * parameterized paths here to feed such joins. (See notes in * optimizer/README for why that might not ever happen, though.) */ static void -generate_mergeappend_paths(PlannerInfo *root, RelOptInfo *rel, - List *live_childrels, - List *all_child_pathkeys, - List *partitioned_rels) +generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel, + List *live_childrels, + List *all_child_pathkeys, + List *partitioned_rels) { ListCell *lcp; + List *partition_pathkeys = NIL; + List *partition_pathkeys_desc = NIL; + bool partition_pathkeys_partial = true; + bool partition_pathkeys_desc_partial = true; + + /* + * Some partitioned table setups may allow us to use an Append node + * instead of a MergeAppend. This is possible in cases such as RANGE + * partitioned tables where it's guaranteed that an earlier partition must + * contain rows which come earlier in the sort order. To detect whether + * this is relevant, build pathkey descriptions of the partition ordering, + * for both forward and reverse scans. + */ + if (rel->part_scheme != NULL && IS_SIMPLE_REL(rel) && + partitions_are_ordered(rel->boundinfo, rel->nparts)) + { + partition_pathkeys = build_partition_pathkeys(root, rel, + ForwardScanDirection, + &partition_pathkeys_partial); + + partition_pathkeys_desc = build_partition_pathkeys(root, rel, + BackwardScanDirection, + &partition_pathkeys_desc_partial); + + /* + * You might think we should truncate_useless_pathkeys here, but + * allowing partition keys which are a subset of the query's pathkeys + * can often be useful. For example, consider a table partitioned by + * RANGE (a, b), and a query with ORDER BY a, b, c. If we have child + * paths that can produce the a, b, c ordering (perhaps via indexes on + * (a, b, c)) then it works to consider the appendrel output as + * ordered by a, b, c. + */ + } + /* Now consider each interesting sort ordering */ foreach(lcp, all_child_pathkeys) { List *pathkeys = (List *) lfirst(lcp); @@ -1749,6 +1790,27 @@ generate_mergeappend_paths(PlannerInfo *root, RelOptInfo *rel, List *total_subpaths = NIL; bool startup_neq_total = false; ListCell *lcr; + bool match_partition_order; + bool match_partition_order_desc; + + /* + * Determine if this sort ordering matches any partition pathkeys we + * have, for both ascending and descending partition order. If the + * partition pathkeys happen to be contained in pathkeys then it still + * works, as described above, providing that the partition pathkeys + * are complete and not just a prefix of the partition keys. (In such + * cases we'll be relying on the child paths to have sorted the + * lower-order columns of the required pathkeys.) + */ + match_partition_order = + pathkeys_contained_in(pathkeys, partition_pathkeys) || + (!partition_pathkeys_partial && + pathkeys_contained_in(partition_pathkeys, pathkeys)); + + match_partition_order_desc = !match_partition_order && + (pathkeys_contained_in(pathkeys, partition_pathkeys_desc) || + (!partition_pathkeys_desc_partial && + pathkeys_contained_in(partition_pathkeys_desc, pathkeys))); /* Select the child paths for this ordering... */ foreach(lcr, live_childrels) @@ -1791,26 +1853,94 @@ generate_mergeappend_paths(PlannerInfo *root, RelOptInfo *rel, if (cheapest_startup != cheapest_total) startup_neq_total = true; - accumulate_append_subpath(cheapest_startup, - &startup_subpaths, NULL); - accumulate_append_subpath(cheapest_total, - &total_subpaths, NULL); + /* + * Collect the appropriate child paths. The required logic varies + * for the Append and MergeAppend cases. + */ + if (match_partition_order) + { + /* + * We're going to make a plain Append path. We don't need + * most of what accumulate_append_subpath would do, but we do + * want to cut out child Appends or MergeAppends if they have + * just a single subpath (and hence aren't doing anything + * useful). + */ + cheapest_startup = get_singleton_append_subpath(cheapest_startup); + cheapest_total = get_singleton_append_subpath(cheapest_total); + + startup_subpaths = lappend(startup_subpaths, cheapest_startup); + total_subpaths = lappend(total_subpaths, cheapest_total); + } + else if (match_partition_order_desc) + { + /* + * As above, but we need to reverse the order of the children, + * because nodeAppend.c doesn't know anything about reverse + * ordering and will scan the children in the order presented. + */ + cheapest_startup = get_singleton_append_subpath(cheapest_startup); + cheapest_total = get_singleton_append_subpath(cheapest_total); + + startup_subpaths = lcons(cheapest_startup, startup_subpaths); + total_subpaths = lcons(cheapest_total, total_subpaths); + } + else + { + /* + * Otherwise, rely on accumulate_append_subpath to collect the + * child paths for the MergeAppend. + */ + accumulate_append_subpath(cheapest_startup, + &startup_subpaths, NULL); + accumulate_append_subpath(cheapest_total, + &total_subpaths, NULL); + } } - /* ... and build the MergeAppend paths */ - add_path(rel, (Path *) create_merge_append_path(root, - rel, - startup_subpaths, - pathkeys, - NULL, - partitioned_rels)); - if (startup_neq_total) + /* ... and build the Append or MergeAppend paths */ + if (match_partition_order || match_partition_order_desc) + { + /* We only need Append */ + add_path(rel, (Path *) create_append_path(root, + rel, + startup_subpaths, + NIL, + pathkeys, + NULL, + 0, + false, + partitioned_rels, + -1)); + if (startup_neq_total) + add_path(rel, (Path *) create_append_path(root, + rel, + total_subpaths, + NIL, + pathkeys, + NULL, + 0, + false, + partitioned_rels, + -1)); + } + else + { + /* We need MergeAppend */ add_path(rel, (Path *) create_merge_append_path(root, rel, - total_subpaths, + startup_subpaths, pathkeys, NULL, partitioned_rels)); + if (startup_neq_total) + add_path(rel, (Path *) create_merge_append_path(root, + rel, + total_subpaths, + pathkeys, + NULL, + partitioned_rels)); + } } } @@ -1901,7 +2031,6 @@ get_cheapest_parameterized_child_path(PlannerInfo *root, RelOptInfo *rel, * omitting a sort step, which seems fine: if the parent is to be an Append, * its result would be unsorted anyway, while if the parent is to be a * MergeAppend, there's no point in a separate sort on a child. - * its result would be unsorted anyway. * * Normally, either path is a partial path and subpaths is a list of partial * paths, or else path is a non-partial plan and subpaths is a list of those. @@ -1951,6 +2080,36 @@ accumulate_append_subpath(Path *path, List **subpaths, List **special_subpaths) *subpaths = lappend(*subpaths, path); } +/* + * get_singleton_append_subpath + * Returns the single subpath of an Append/MergeAppend, or just + * return 'path' if it's not a single sub-path Append/MergeAppend. + * + * Note: 'path' must not be a parallel-aware path. + */ +static Path * +get_singleton_append_subpath(Path *path) +{ + Assert(!path->parallel_aware); + + if (IsA(path, AppendPath)) + { + AppendPath *apath = (AppendPath *) path; + + if (list_length(apath->subpaths) == 1) + return (Path *) linitial(apath->subpaths); + } + else if (IsA(path, MergeAppendPath)) + { + MergeAppendPath *mpath = (MergeAppendPath *) path; + + if (list_length(mpath->subpaths) == 1) + return (Path *) linitial(mpath->subpaths); + } + + return path; +} + /* * set_dummy_rel_pathlist * Build a dummy path for a relation that's been excluded by constraints @@ -1975,7 +2134,7 @@ set_dummy_rel_pathlist(RelOptInfo *rel) /* Set up the dummy path */ add_path(rel, (Path *) create_append_path(NULL, rel, NIL, NIL, - rel->lateral_relids, + NIL, rel->lateral_relids, 0, false, NIL, -1)); /* diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 4b9be13f08..afd32884a2 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -1878,27 +1878,83 @@ cost_append(AppendPath *apath) apath->path.startup_cost = 0; apath->path.total_cost = 0; + apath->path.rows = 0; if (apath->subpaths == NIL) return; if (!apath->path.parallel_aware) { - Path *subpath = (Path *) linitial(apath->subpaths); + List *pathkeys = apath->path.pathkeys; - /* - * Startup cost of non-parallel-aware Append is the startup cost of - * first subpath. - */ - apath->path.startup_cost = subpath->startup_cost; + if (pathkeys == NIL) + { + Path *subpath = (Path *) linitial(apath->subpaths); - /* Compute rows and costs as sums of subplan rows and costs. */ - foreach(l, apath->subpaths) + /* + * For an unordered, non-parallel-aware Append we take the startup + * cost as the startup cost of the first subpath. + */ + apath->path.startup_cost = subpath->startup_cost; + + /* Compute rows and costs as sums of subplan rows and costs. */ + foreach(l, apath->subpaths) + { + Path *subpath = (Path *) lfirst(l); + + apath->path.rows += subpath->rows; + apath->path.total_cost += subpath->total_cost; + } + } + else { - Path *subpath = (Path *) lfirst(l); + /* + * For an ordered, non-parallel-aware Append we take the startup + * cost as the sum of the subpath startup costs. This ensures + * that we don't underestimate the startup cost when a query's + * LIMIT is such that several of the children have to be run to + * satisfy it. This might be overkill --- another plausible hack + * would be to take the Append's startup cost as the maximum of + * the child startup costs. But we don't want to risk believing + * that an ORDER BY LIMIT query can be satisfied at small cost + * when the first child has small startup cost but later ones + * don't. (If we had the ability to deal with nonlinear cost + * interpolation for partial retrievals, we would not need to be + * so conservative about this.) + * + * This case is also different from the above in that we have to + * account for possibly injecting sorts into subpaths that aren't + * natively ordered. + */ + foreach(l, apath->subpaths) + { + Path *subpath = (Path *) lfirst(l); + Path sort_path; /* dummy for result of cost_sort */ - apath->path.rows += subpath->rows; - apath->path.total_cost += subpath->total_cost; + if (!pathkeys_contained_in(pathkeys, subpath->pathkeys)) + { + /* + * We'll need to insert a Sort node, so include costs for + * that. We can use the parent's LIMIT if any, since we + * certainly won't pull more than that many tuples from + * any child. + */ + cost_sort(&sort_path, + NULL, /* doesn't currently need root */ + pathkeys, + subpath->total_cost, + subpath->rows, + subpath->pathtarget->width, + 0.0, + work_mem, + apath->limit_tuples); + subpath = &sort_path; + } + + apath->path.rows += subpath->rows; + apath->path.startup_cost += subpath->startup_cost; + apath->path.total_cost += subpath->total_cost; + } } } else /* parallel-aware */ @@ -1906,6 +1962,9 @@ cost_append(AppendPath *apath) int i = 0; double parallel_divisor = get_parallel_divisor(&apath->path); + /* Parallel-aware Append never produces ordered output. */ + Assert(apath->path.pathkeys == NIL); + /* Calculate startup cost. */ foreach(l, apath->subpaths) { diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c index 34cc7dacdf..e66cf328be 100644 --- a/src/backend/optimizer/path/joinrels.c +++ b/src/backend/optimizer/path/joinrels.c @@ -1261,7 +1261,7 @@ mark_dummy_rel(RelOptInfo *rel) /* Set up the dummy path */ add_path(rel, (Path *) create_append_path(NULL, rel, NIL, NIL, - rel->lateral_relids, + NIL, rel->lateral_relids, 0, false, NIL, -1)); /* Set or update cheapest_total_path and related fields */ diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c index 56d839bb31..68b2204b3b 100644 --- a/src/backend/optimizer/path/pathkeys.c +++ b/src/backend/optimizer/path/pathkeys.c @@ -18,16 +18,21 @@ #include "postgres.h" #include "access/stratnum.h" +#include "catalog/pg_opfamily.h" #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" #include "nodes/plannodes.h" #include "optimizer/optimizer.h" #include "optimizer/pathnode.h" #include "optimizer/paths.h" +#include "partitioning/partbounds.h" #include "utils/lsyscache.h" static bool pathkey_is_redundant(PathKey *new_pathkey, List *pathkeys); +static bool matches_boolean_partition_clause(RestrictInfo *rinfo, + RelOptInfo *partrel, + int partkeycol); static bool right_merge_direction(PlannerInfo *root, PathKey *pathkey); @@ -546,6 +551,165 @@ build_index_pathkeys(PlannerInfo *root, return retval; } +/* + * partkey_is_bool_constant_for_query + * + * If a partition key column is constrained to have a constant value by the + * query's WHERE conditions, then it's irrelevant for sort-order + * considerations. Usually that means we have a restriction clause + * WHERE partkeycol = constant, which gets turned into an EquivalenceClass + * containing a constant, which is recognized as redundant by + * build_partition_pathkeys(). But if the partition key column is a + * boolean variable (or expression), then we are not going to see such a + * WHERE clause, because expression preprocessing will have simplified it + * to "WHERE partkeycol" or "WHERE NOT partkeycol". So we are not going + * to have a matching EquivalenceClass (unless the query also contains + * "ORDER BY partkeycol"). To allow such cases to work the same as they would + * for non-boolean values, this function is provided to detect whether the + * specified partition key column matches a boolean restriction clause. + */ +static bool +partkey_is_bool_constant_for_query(RelOptInfo *partrel, int partkeycol) +{ + PartitionScheme partscheme = partrel->part_scheme; + ListCell *lc; + + /* If the partkey isn't boolean, we can't possibly get a match */ + if (!IsBooleanOpfamily(partscheme->partopfamily[partkeycol])) + return false; + + /* Check each restriction clause for the partitioned rel */ + foreach(lc, partrel->baserestrictinfo) + { + RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc); + + /* Ignore pseudoconstant quals, they won't match */ + if (rinfo->pseudoconstant) + continue; + + /* See if we can match the clause's expression to the partkey column */ + if (matches_boolean_partition_clause(rinfo, partrel, partkeycol)) + return true; + } + + return false; +} + +/* + * matches_boolean_partition_clause + * Determine if the boolean clause described by rinfo matches + * partrel's partkeycol-th partition key column. + * + * "Matches" can be either an exact match (equivalent to partkey = true), + * or a NOT above an exact match (equivalent to partkey = false). + */ +static bool +matches_boolean_partition_clause(RestrictInfo *rinfo, + RelOptInfo *partrel, int partkeycol) +{ + Node *clause = (Node *) rinfo->clause; + Node *partexpr = (Node *) linitial(partrel->partexprs[partkeycol]); + + /* Direct match? */ + if (equal(partexpr, clause)) + return true; + /* NOT clause? */ + else if (is_notclause(clause)) + { + Node *arg = (Node *) get_notclausearg((Expr *) clause); + + if (equal(partexpr, arg)) + return true; + } + + return false; +} + +/* + * build_partition_pathkeys + * Build a pathkeys list that describes the ordering induced by the + * partitions of partrel, under either forward or backward scan + * as per scandir. + * + * Caller must have checked that the partitions are properly ordered, + * as detected by partitions_are_ordered(). + * + * Sets *partialkeys to true if pathkeys were only built for a prefix of the + * partition key, or false if the pathkeys include all columns of the + * partition key. + */ +List * +build_partition_pathkeys(PlannerInfo *root, RelOptInfo *partrel, + ScanDirection scandir, bool *partialkeys) +{ + List *retval = NIL; + PartitionScheme partscheme = partrel->part_scheme; + int i; + + Assert(partscheme != NULL); + Assert(partitions_are_ordered(partrel->boundinfo, partrel->nparts)); + /* For now, we can only cope with baserels */ + Assert(IS_SIMPLE_REL(partrel)); + + for (i = 0; i < partscheme->partnatts; i++) + { + PathKey *cpathkey; + Expr *keyCol = (Expr *) linitial(partrel->partexprs[i]); + + /* + * Try to make a canonical pathkey for this partkey. + * + * We're considering a baserel scan, so nullable_relids should be + * NULL. Also, we assume the PartitionDesc lists any NULL partition + * last, so we treat the scan like a NULLS LAST index: we have + * nulls_first for backwards scan only. + */ + cpathkey = make_pathkey_from_sortinfo(root, + keyCol, + NULL, + partscheme->partopfamily[i], + partscheme->partopcintype[i], + partscheme->partcollation[i], + ScanDirectionIsBackward(scandir), + ScanDirectionIsBackward(scandir), + 0, + partrel->relids, + false); + + + if (cpathkey) + { + /* + * We found the sort key in an EquivalenceClass, so it's relevant + * for this query. Add it to list, unless it's redundant. + */ + if (!pathkey_is_redundant(cpathkey, retval)) + retval = lappend(retval, cpathkey); + } + else + { + /* + * Boolean partition keys might be redundant even if they do not + * appear in an EquivalenceClass, because of our special treatment + * of boolean equality conditions --- see the comment for + * partkey_is_bool_constant_for_query(). If that applies, we can + * continue to examine lower-order partition keys. Otherwise, the + * sort key is not an interesting sort order for this query, so we + * should stop considering partition columns; any lower-order sort + * keys won't be useful either. + */ + if (!partkey_is_bool_constant_for_query(partrel, i)) + { + *partialkeys = true; + return retval; + } + } + } + + *partialkeys = false; + return retval; +} + /* * build_expression_pathkey * Build a pathkeys list that describes an ordering by a single expression diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index cc222cb06c..efe073a3ee 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -205,8 +205,6 @@ static NamedTuplestoreScan *make_namedtuplestorescan(List *qptlist, List *qpqual Index scanrelid, char *enrname); static WorkTableScan *make_worktablescan(List *qptlist, List *qpqual, Index scanrelid, int wtParam); -static Append *make_append(List *appendplans, int first_partial_plan, - List *tlist, PartitionPruneInfo *partpruneinfo); static RecursiveUnion *make_recursive_union(List *tlist, Plan *lefttree, Plan *righttree, @@ -1060,10 +1058,16 @@ create_append_plan(PlannerInfo *root, AppendPath *best_path) { Append *plan; List *tlist = build_path_tlist(root, &best_path->path); + List *pathkeys = best_path->path.pathkeys; List *subplans = NIL; ListCell *subpaths; RelOptInfo *rel = best_path->path.parent; PartitionPruneInfo *partpruneinfo = NULL; + int nodenumsortkeys = 0; + AttrNumber *nodeSortColIdx = NULL; + Oid *nodeSortOperators = NULL; + Oid *nodeCollations = NULL; + bool *nodeNullsFirst = NULL; /* * The subpaths list could be empty, if every child was proven empty by @@ -1089,6 +1093,37 @@ create_append_plan(PlannerInfo *root, AppendPath *best_path) return plan; } + /* + * Otherwise build an Append plan. Note that if there's just one child, + * the Append is pretty useless; but we wait till setrefs.c to get rid of + * it. Doing so here doesn't work because the varno of the child scan + * plan won't match the parent-rel Vars it'll be asked to emit. + * + * We don't have the actual creation of the Append node split out into a + * separate make_xxx function. This is because we want to run + * prepare_sort_from_pathkeys on it before we do so on the individual + * child plans, to make cross-checking the sort info easier. + */ + plan = makeNode(Append); + plan->plan.targetlist = tlist; + plan->plan.qual = NIL; + plan->plan.lefttree = NULL; + plan->plan.righttree = NULL; + + if (pathkeys != NIL) + { + /* Compute sort column info, and adjust the Append's tlist as needed */ + (void) prepare_sort_from_pathkeys((Plan *) plan, pathkeys, + best_path->path.parent->relids, + NULL, + true, + &nodenumsortkeys, + &nodeSortColIdx, + &nodeSortOperators, + &nodeCollations, + &nodeNullsFirst); + } + /* Build the plan for each child */ foreach(subpaths, best_path->subpaths) { @@ -1098,6 +1133,63 @@ create_append_plan(PlannerInfo *root, AppendPath *best_path) /* Must insist that all children return the same tlist */ subplan = create_plan_recurse(root, subpath, CP_EXACT_TLIST); + /* + * For ordered Appends, we must insert a Sort node if subplan isn't + * sufficiently ordered. + */ + if (pathkeys != NIL) + { + int numsortkeys; + AttrNumber *sortColIdx; + Oid *sortOperators; + Oid *collations; + bool *nullsFirst; + + /* + * Compute sort column info, and adjust subplan's tlist as needed. + * We must apply prepare_sort_from_pathkeys even to subplans that + * don't need an explicit sort, to make sure they are returning + * the same sort key columns the Append expects. + */ + subplan = prepare_sort_from_pathkeys(subplan, pathkeys, + subpath->parent->relids, + nodeSortColIdx, + false, + &numsortkeys, + &sortColIdx, + &sortOperators, + &collations, + &nullsFirst); + + /* + * Check that we got the same sort key information. We just + * Assert that the sortops match, since those depend only on the + * pathkeys; but it seems like a good idea to check the sort + * column numbers explicitly, to ensure the tlists match up. + */ + Assert(numsortkeys == nodenumsortkeys); + if (memcmp(sortColIdx, nodeSortColIdx, + numsortkeys * sizeof(AttrNumber)) != 0) + elog(ERROR, "Append child's targetlist doesn't match Append"); + Assert(memcmp(sortOperators, nodeSortOperators, + numsortkeys * sizeof(Oid)) == 0); + Assert(memcmp(collations, nodeCollations, + numsortkeys * sizeof(Oid)) == 0); + Assert(memcmp(nullsFirst, nodeNullsFirst, + numsortkeys * sizeof(bool)) == 0); + + /* Now, insert a Sort node if subplan isn't sufficiently ordered */ + if (!pathkeys_contained_in(pathkeys, subpath->pathkeys)) + { + Sort *sort = make_sort(subplan, numsortkeys, + sortColIdx, sortOperators, + collations, nullsFirst); + + label_sort_with_costsize(root, sort, best_path->limit_tuples); + subplan = (Plan *) sort; + } + } + subplans = lappend(subplans, subplan); } @@ -1133,15 +1225,9 @@ create_append_plan(PlannerInfo *root, AppendPath *best_path) prunequal); } - /* - * And build the Append plan. Note that if there's just one child, the - * Append is pretty useless; but we wait till setrefs.c to get rid of it. - * Doing so here doesn't work because the varno of the child scan plan - * won't match the parent-rel Vars it'll be asked to emit. - */ - - plan = make_append(subplans, best_path->first_partial_path, - tlist, partpruneinfo); + plan->appendplans = subplans; + plan->first_partial_plan = best_path->first_partial_path; + plan->part_prune_info = partpruneinfo; copy_generic_path_info(&plan->plan, (Path *) best_path); @@ -1266,7 +1352,6 @@ create_merge_append_plan(PlannerInfo *root, MergeAppendPath *best_path) if (best_path->path.param_info) { - List *prmquals = best_path->path.param_info->ppi_clauses; prmquals = extract_actual_clauses(prmquals, false); @@ -5300,23 +5385,6 @@ make_foreignscan(List *qptlist, return node; } -static Append * -make_append(List *appendplans, int first_partial_plan, - List *tlist, PartitionPruneInfo *partpruneinfo) -{ - Append *node = makeNode(Append); - Plan *plan = &node->plan; - - plan->targetlist = tlist; - plan->qual = NIL; - plan->lefttree = NULL; - plan->righttree = NULL; - node->appendplans = appendplans; - node->first_partial_plan = first_partial_plan; - node->part_prune_info = partpruneinfo; - return node; -} - static RecursiveUnion * make_recursive_union(List *tlist, Plan *lefttree, diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index e2cdc83613..cbd3fb8e0e 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -1721,7 +1721,8 @@ inheritance_planner(PlannerInfo *root) /* Make a dummy path, cf set_dummy_rel_pathlist() */ dummy_path = (Path *) create_append_path(NULL, final_rel, NIL, NIL, - NULL, 0, false, NIL, -1); + NIL, NULL, 0, false, + NIL, -1); /* These lists must be nonempty to make a valid ModifyTable node */ subpaths = list_make1(dummy_path); @@ -4003,6 +4004,7 @@ create_degenerate_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, grouped_rel, paths, NIL, + NIL, NULL, 0, false, diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index eb815c2f12..c1def3823f 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -617,7 +617,7 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, * Append the child results together. */ path = (Path *) create_append_path(root, result_rel, pathlist, NIL, - NULL, 0, false, NIL, -1); + NIL, NULL, 0, false, NIL, -1); /* * For UNION ALL, we just need the Append path. For UNION, need to add @@ -672,7 +672,8 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, ppath = (Path *) create_append_path(root, result_rel, NIL, partial_pathlist, - NULL, parallel_workers, enable_parallel_append, + NIL, NULL, + parallel_workers, enable_parallel_append, NIL, -1); ppath = (Path *) create_gather_path(root, result_rel, ppath, @@ -783,7 +784,7 @@ generate_nonunion_paths(SetOperationStmt *op, PlannerInfo *root, * Append the child results together. */ path = (Path *) create_append_path(root, result_rel, pathlist, NIL, - NULL, 0, false, NIL, -1); + NIL, NULL, 0, false, NIL, -1); /* Identify the grouping semantics */ groupList = generate_setop_grouplist(op, tlist); diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 1ea89ff54c..36aee35d46 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1203,12 +1203,13 @@ create_tidscan_path(PlannerInfo *root, RelOptInfo *rel, List *tidquals, * pathnode. * * Note that we must handle subpaths = NIL, representing a dummy access path. + * Also, there are callers that pass root = NULL. */ AppendPath * create_append_path(PlannerInfo *root, RelOptInfo *rel, List *subpaths, List *partial_subpaths, - Relids required_outer, + List *pathkeys, Relids required_outer, int parallel_workers, bool parallel_aware, List *partitioned_rels, double rows) { @@ -1242,6 +1243,7 @@ create_append_path(PlannerInfo *root, pathnode->path.parallel_aware = parallel_aware; pathnode->path.parallel_safe = rel->consider_parallel; pathnode->path.parallel_workers = parallel_workers; + pathnode->path.pathkeys = pathkeys; pathnode->partitioned_rels = list_copy(partitioned_rels); /* @@ -1255,6 +1257,13 @@ create_append_path(PlannerInfo *root, */ if (pathnode->path.parallel_aware) { + /* + * We mustn't fiddle with the order of subpaths when the Append has + * pathkeys. The order they're listed in is critical to keeping the + * pathkeys valid. + */ + Assert(pathkeys == NIL); + subpaths = list_qsort(subpaths, append_total_cost_compare); partial_subpaths = list_qsort(partial_subpaths, append_startup_cost_compare); @@ -1262,6 +1271,15 @@ create_append_path(PlannerInfo *root, pathnode->first_partial_path = list_length(subpaths); pathnode->subpaths = list_concat(subpaths, partial_subpaths); + /* + * Apply query-wide LIMIT if known and path is for sole base relation. + * (Handling this at this low level is a bit klugy.) + */ + if (root != NULL && bms_equal(rel->relids, root->all_baserels)) + pathnode->limit_tuples = root->limit_tuples; + else + pathnode->limit_tuples = -1.0; + foreach(l, pathnode->subpaths) { Path *subpath = (Path *) lfirst(l); @@ -1278,8 +1296,9 @@ create_append_path(PlannerInfo *root, /* * If there's exactly one child path, the Append is a no-op and will be * discarded later (in setrefs.c); therefore, we can inherit the child's - * size, cost, and pathkeys if any. Otherwise, it's unsorted, and we must - * do the normal costsize calculation. + * size and cost, as well as its pathkeys if any (overriding whatever the + * caller might've said). Otherwise, we must do the normal costsize + * calculation. */ if (list_length(pathnode->subpaths) == 1) { @@ -1291,10 +1310,7 @@ create_append_path(PlannerInfo *root, pathnode->path.pathkeys = child->pathkeys; } else - { - pathnode->path.pathkeys = NIL; /* unsorted if more than 1 subpath */ cost_append(pathnode); - } /* If the caller provided a row estimate, override the computed value. */ if (rows >= 0) @@ -3759,7 +3775,7 @@ reparameterize_path(PlannerInfo *root, Path *path, } return (Path *) create_append_path(root, rel, childpaths, partialpaths, - required_outer, + apath->path.pathkeys, required_outer, apath->path.parallel_workers, apath->path.parallel_aware, apath->partitioned_rels, diff --git a/src/backend/partitioning/partbounds.c b/src/backend/partitioning/partbounds.c index bdd0d23854..c8770ccfee 100644 --- a/src/backend/partitioning/partbounds.c +++ b/src/backend/partitioning/partbounds.c @@ -861,6 +861,70 @@ partition_bounds_copy(PartitionBoundInfo src, return dest; } +/* + * partitions_are_ordered + * Determine whether the partitions described by 'boundinfo' are ordered, + * that is partitions appearing earlier in the PartitionDesc sequence + * contain partition keys strictly less than those appearing later. + * Also, if NULL values are possible, they must come in the last + * partition defined in the PartitionDesc. + * + * If out of order, or there is insufficient info to know the order, + * then we return false. + */ +bool +partitions_are_ordered(PartitionBoundInfo boundinfo, int nparts) +{ + Assert(boundinfo != NULL); + + switch (boundinfo->strategy) + { + case PARTITION_STRATEGY_RANGE: + + /* + * RANGE-type partitioning guarantees that the partitions can be + * scanned in the order that they're defined in the PartitionDesc + * to provide sequential, non-overlapping ranges of tuples. + * However, if a DEFAULT partition exists then it doesn't work, as + * that could contain tuples from either below or above the + * defined range, or tuples belonging to gaps between partitions. + */ + if (!partition_bound_has_default(boundinfo)) + return true; + break; + + case PARTITION_STRATEGY_LIST: + + /* + * LIST partitioning can also guarantee ordering, but only if the + * partitions don't accept interleaved values. We could likely + * check for this by looping over the PartitionBound's indexes + * array to check that the indexes are in order. For now, let's + * just keep it simple and just accept LIST partitioning when + * there's no DEFAULT partition, exactly one value per partition, + * and optionally a NULL partition that does not accept any other + * values. Such a NULL partition will come last in the + * PartitionDesc, and the other partitions will be properly + * ordered. This is a cheap test to make as it does not require + * any per-partition processing. Maybe we'd like to handle more + * complex cases in the future. + */ + if (partition_bound_has_default(boundinfo)) + return false; + + if (boundinfo->ndatums + partition_bound_accepts_nulls(boundinfo) + == nparts) + return true; + break; + + default: + /* HASH, or some other strategy */ + break; + } + + return false; +} + /* * check_new_partition_bound * diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index 15314a8cfe..f8429df9e1 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -280,6 +280,11 @@ struct PlannerInfo List *join_info_list; /* list of SpecialJoinInfos */ + /* + * Note: for AppendRelInfos describing partitions of a partitioned table, + * we guarantee that partitions that come earlier in the partitioned + * table's PartitionDesc will appear earlier in append_rel_list. + */ List *append_rel_list; /* list of AppendRelInfos */ List *rowMarks; /* list of PlanRowMarks */ @@ -1363,9 +1368,9 @@ typedef struct AppendPath /* RT indexes of non-leaf tables in a partition tree */ List *partitioned_rels; List *subpaths; /* list of component Paths */ - - /* Index of first partial path in subpaths */ + /* Index of first partial path in subpaths; list_length(subpaths) if none */ int first_partial_path; + double limit_tuples; /* hard limit on output tuples, or -1 */ } AppendPath; #define IS_DUMMY_APPEND(p) \ diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 5b577c12ec..437250f557 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -65,7 +65,7 @@ extern TidPath *create_tidscan_path(PlannerInfo *root, RelOptInfo *rel, List *tidquals, Relids required_outer); extern AppendPath *create_append_path(PlannerInfo *root, RelOptInfo *rel, List *subpaths, List *partial_subpaths, - Relids required_outer, + List *pathkeys, Relids required_outer, int parallel_workers, bool parallel_aware, List *partitioned_rels, double rows); extern MergeAppendPath *create_merge_append_path(PlannerInfo *root, diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index 36d12bc376..0e858097c8 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -194,6 +194,8 @@ extern Path *get_cheapest_fractional_path_for_pathkeys(List *paths, extern Path *get_cheapest_parallel_safe_total_inner(List *paths); extern List *build_index_pathkeys(PlannerInfo *root, IndexOptInfo *index, ScanDirection scandir); +extern List *build_partition_pathkeys(PlannerInfo *root, RelOptInfo *partrel, + ScanDirection scandir, bool *partialkeys); extern List *build_expression_pathkey(PlannerInfo *root, Expr *expr, Relids nullable_relids, Oid opno, Relids rel, bool create_it); diff --git a/src/include/partitioning/partbounds.h b/src/include/partitioning/partbounds.h index 683e1574ea..c954965e4e 100644 --- a/src/include/partitioning/partbounds.h +++ b/src/include/partitioning/partbounds.h @@ -88,6 +88,7 @@ extern bool partition_bounds_equal(int partnatts, int16 *parttyplen, PartitionBoundInfo b2); extern PartitionBoundInfo partition_bounds_copy(PartitionBoundInfo src, PartitionKey key); +extern bool partitions_are_ordered(PartitionBoundInfo boundinfo, int nparts); extern void check_new_partition_bound(char *relname, Relation parent, PartitionBoundSpec *spec); extern void check_default_partition_contents(Relation parent, diff --git a/src/test/regress/expected/inherit.out b/src/test/regress/expected/inherit.out index 7518148df0..4725a15948 100644 --- a/src/test/regress/expected/inherit.out +++ b/src/test/regress/expected/inherit.out @@ -2037,7 +2037,6 @@ explain (costs off) select * from mcrparted where a = 20 and c > 20; -- scans mc Filter: ((c > 20) AND (a = 20)) (9 rows) -drop table mcrparted; -- check that partitioned table Appends cope with being referenced in -- subplans create table parted_minmax (a int, b varchar(16)) partition by range (a); @@ -2065,3 +2064,255 @@ select min(a), max(a) from parted_minmax where b = '12345'; (1 row) drop table parted_minmax; +-- Test code that uses Append nodes in place of MergeAppend when the +-- partition ordering matches the desired ordering. +create index mcrparted_a_abs_c_idx on mcrparted (a, abs(b), c); +-- MergeAppend must be used when a default partition exists +explain (costs off) select * from mcrparted order by a, abs(b), c; + QUERY PLAN +------------------------------------------------------------------- + Merge Append + Sort Key: mcrparted0.a, (abs(mcrparted0.b)), mcrparted0.c + -> Index Scan using mcrparted0_a_abs_c_idx on mcrparted0 + -> Index Scan using mcrparted1_a_abs_c_idx on mcrparted1 + -> Index Scan using mcrparted2_a_abs_c_idx on mcrparted2 + -> Index Scan using mcrparted3_a_abs_c_idx on mcrparted3 + -> Index Scan using mcrparted4_a_abs_c_idx on mcrparted4 + -> Index Scan using mcrparted5_a_abs_c_idx on mcrparted5 + -> Index Scan using mcrparted_def_a_abs_c_idx on mcrparted_def +(9 rows) + +drop table mcrparted_def; +-- Append is used for a RANGE partitioned table with no default +-- and no subpartitions +explain (costs off) select * from mcrparted order by a, abs(b), c; + QUERY PLAN +------------------------------------------------------------- + Append + -> Index Scan using mcrparted0_a_abs_c_idx on mcrparted0 + -> Index Scan using mcrparted1_a_abs_c_idx on mcrparted1 + -> Index Scan using mcrparted2_a_abs_c_idx on mcrparted2 + -> Index Scan using mcrparted3_a_abs_c_idx on mcrparted3 + -> Index Scan using mcrparted4_a_abs_c_idx on mcrparted4 + -> Index Scan using mcrparted5_a_abs_c_idx on mcrparted5 +(7 rows) + +-- Append is used with subpaths in reverse order with backwards index scans +explain (costs off) select * from mcrparted order by a desc, abs(b) desc, c desc; + QUERY PLAN +---------------------------------------------------------------------- + Append + -> Index Scan Backward using mcrparted5_a_abs_c_idx on mcrparted5 + -> Index Scan Backward using mcrparted4_a_abs_c_idx on mcrparted4 + -> Index Scan Backward using mcrparted3_a_abs_c_idx on mcrparted3 + -> Index Scan Backward using mcrparted2_a_abs_c_idx on mcrparted2 + -> Index Scan Backward using mcrparted1_a_abs_c_idx on mcrparted1 + -> Index Scan Backward using mcrparted0_a_abs_c_idx on mcrparted0 +(7 rows) + +-- check that Append plan is used containing a MergeAppend for sub-partitions +-- that are unordered. +drop table mcrparted5; +create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue) partition by list (a); +create table mcrparted5a partition of mcrparted5 for values in(20); +create table mcrparted5_def partition of mcrparted5 default; +explain (costs off) select * from mcrparted order by a, abs(b), c; + QUERY PLAN +--------------------------------------------------------------------------- + Append + -> Index Scan using mcrparted0_a_abs_c_idx on mcrparted0 + -> Index Scan using mcrparted1_a_abs_c_idx on mcrparted1 + -> Index Scan using mcrparted2_a_abs_c_idx on mcrparted2 + -> Index Scan using mcrparted3_a_abs_c_idx on mcrparted3 + -> Index Scan using mcrparted4_a_abs_c_idx on mcrparted4 + -> Merge Append + Sort Key: mcrparted5a.a, (abs(mcrparted5a.b)), mcrparted5a.c + -> Index Scan using mcrparted5a_a_abs_c_idx on mcrparted5a + -> Index Scan using mcrparted5_def_a_abs_c_idx on mcrparted5_def +(10 rows) + +drop table mcrparted5_def; +-- check that an Append plan is used and the sub-partitions are flattened +-- into the main Append when the sub-partition is unordered but contains +-- just a single sub-partition. +explain (costs off) select a, abs(b) from mcrparted order by a, abs(b), c; + QUERY PLAN +--------------------------------------------------------------- + Append + -> Index Scan using mcrparted0_a_abs_c_idx on mcrparted0 + -> Index Scan using mcrparted1_a_abs_c_idx on mcrparted1 + -> Index Scan using mcrparted2_a_abs_c_idx on mcrparted2 + -> Index Scan using mcrparted3_a_abs_c_idx on mcrparted3 + -> Index Scan using mcrparted4_a_abs_c_idx on mcrparted4 + -> Index Scan using mcrparted5a_a_abs_c_idx on mcrparted5a +(7 rows) + +-- check that Append is used when the sub-partitioned tables are pruned +-- during planning. +explain (costs off) select * from mcrparted where a < 20 order by a, abs(b), c; + QUERY PLAN +------------------------------------------------------------- + Append + -> Index Scan using mcrparted0_a_abs_c_idx on mcrparted0 + Index Cond: (a < 20) + -> Index Scan using mcrparted1_a_abs_c_idx on mcrparted1 + Index Cond: (a < 20) + -> Index Scan using mcrparted2_a_abs_c_idx on mcrparted2 + Index Cond: (a < 20) + -> Index Scan using mcrparted3_a_abs_c_idx on mcrparted3 + Index Cond: (a < 20) +(9 rows) + +create table mclparted (a int) partition by list(a); +create table mclparted1 partition of mclparted for values in(1); +create table mclparted2 partition of mclparted for values in(2); +create index on mclparted (a); +-- Ensure an Append is used for a list partition with an order by. +explain (costs off) select * from mclparted order by a; + QUERY PLAN +------------------------------------------------------------ + Append + -> Index Only Scan using mclparted1_a_idx on mclparted1 + -> Index Only Scan using mclparted2_a_idx on mclparted2 +(3 rows) + +-- Ensure a MergeAppend is used when a partition exists with interleaved +-- datums in the partition bound. +create table mclparted3_5 partition of mclparted for values in(3,5); +create table mclparted4 partition of mclparted for values in(4); +explain (costs off) select * from mclparted order by a; + QUERY PLAN +---------------------------------------------------------------- + Merge Append + Sort Key: mclparted1.a + -> Index Only Scan using mclparted1_a_idx on mclparted1 + -> Index Only Scan using mclparted2_a_idx on mclparted2 + -> Index Only Scan using mclparted3_5_a_idx on mclparted3_5 + -> Index Only Scan using mclparted4_a_idx on mclparted4 +(6 rows) + +drop table mclparted; +-- Ensure subplans which don't have a path with the correct pathkeys get +-- sorted correctly. +drop index mcrparted_a_abs_c_idx; +create index on mcrparted1 (a, abs(b), c); +create index on mcrparted2 (a, abs(b), c); +create index on mcrparted3 (a, abs(b), c); +create index on mcrparted4 (a, abs(b), c); +explain (costs off) select * from mcrparted where a < 20 order by a, abs(b), c limit 1; + QUERY PLAN +------------------------------------------------------------------------- + Limit + -> Append + -> Sort + Sort Key: mcrparted0.a, (abs(mcrparted0.b)), mcrparted0.c + -> Seq Scan on mcrparted0 + Filter: (a < 20) + -> Index Scan using mcrparted1_a_abs_c_idx on mcrparted1 + Index Cond: (a < 20) + -> Index Scan using mcrparted2_a_abs_c_idx on mcrparted2 + Index Cond: (a < 20) + -> Index Scan using mcrparted3_a_abs_c_idx on mcrparted3 + Index Cond: (a < 20) +(12 rows) + +set enable_bitmapscan = 0; +-- Ensure Append node can be used when the partition is ordered by some +-- pathkeys which were deemed redundant. +explain (costs off) select * from mcrparted where a = 10 order by a, abs(b), c; + QUERY PLAN +------------------------------------------------------------- + Append + -> Index Scan using mcrparted1_a_abs_c_idx on mcrparted1 + Index Cond: (a = 10) + -> Index Scan using mcrparted2_a_abs_c_idx on mcrparted2 + Index Cond: (a = 10) +(5 rows) + +reset enable_bitmapscan; +drop table mcrparted; +-- Ensure LIST partitions allow an Append to be used instead of a MergeAppend +create table bool_lp (b bool) partition by list(b); +create table bool_lp_true partition of bool_lp for values in(true); +create table bool_lp_false partition of bool_lp for values in(false); +create index on bool_lp (b); +explain (costs off) select * from bool_lp order by b; + QUERY PLAN +------------------------------------------------------------------ + Append + -> Index Only Scan using bool_lp_false_b_idx on bool_lp_false + -> Index Only Scan using bool_lp_true_b_idx on bool_lp_true +(3 rows) + +drop table bool_lp; +-- Ensure const bool quals can be properly detected as redundant +create table bool_rp (b bool, a int) partition by range(b,a); +create table bool_rp_false_1k partition of bool_rp for values from (false,0) to (false,1000); +create table bool_rp_true_1k partition of bool_rp for values from (true,0) to (true,1000); +create table bool_rp_false_2k partition of bool_rp for values from (false,1000) to (false,2000); +create table bool_rp_true_2k partition of bool_rp for values from (true,1000) to (true,2000); +create index on bool_rp (b,a); +explain (costs off) select * from bool_rp where b = true order by b,a; + QUERY PLAN +------------------------------------------------------------------------ + Append + -> Index Only Scan using bool_rp_true_1k_b_a_idx on bool_rp_true_1k + Index Cond: (b = true) + -> Index Only Scan using bool_rp_true_2k_b_a_idx on bool_rp_true_2k + Index Cond: (b = true) +(5 rows) + +explain (costs off) select * from bool_rp where b = false order by b,a; + QUERY PLAN +-------------------------------------------------------------------------- + Append + -> Index Only Scan using bool_rp_false_1k_b_a_idx on bool_rp_false_1k + Index Cond: (b = false) + -> Index Only Scan using bool_rp_false_2k_b_a_idx on bool_rp_false_2k + Index Cond: (b = false) +(5 rows) + +explain (costs off) select * from bool_rp where b = true order by a; + QUERY PLAN +------------------------------------------------------------------------ + Append + -> Index Only Scan using bool_rp_true_1k_b_a_idx on bool_rp_true_1k + Index Cond: (b = true) + -> Index Only Scan using bool_rp_true_2k_b_a_idx on bool_rp_true_2k + Index Cond: (b = true) +(5 rows) + +explain (costs off) select * from bool_rp where b = false order by a; + QUERY PLAN +-------------------------------------------------------------------------- + Append + -> Index Only Scan using bool_rp_false_1k_b_a_idx on bool_rp_false_1k + Index Cond: (b = false) + -> Index Only Scan using bool_rp_false_2k_b_a_idx on bool_rp_false_2k + Index Cond: (b = false) +(5 rows) + +drop table bool_rp; +-- Ensure an Append scan is chosen when the partition order is a subset of +-- the required order. +create table range_parted (a int, b int, c int) partition by range(a, b); +create table range_parted1 partition of range_parted for values from (0,0) to (10,10); +create table range_parted2 partition of range_parted for values from (10,10) to (20,20); +create index on range_parted (a,b,c); +explain (costs off) select * from range_parted order by a,b,c; + QUERY PLAN +---------------------------------------------------------------------- + Append + -> Index Only Scan using range_parted1_a_b_c_idx on range_parted1 + -> Index Only Scan using range_parted2_a_b_c_idx on range_parted2 +(3 rows) + +explain (costs off) select * from range_parted order by a desc,b desc,c desc; + QUERY PLAN +------------------------------------------------------------------------------- + Append + -> Index Only Scan Backward using range_parted2_a_b_c_idx on range_parted2 + -> Index Only Scan Backward using range_parted1_a_b_c_idx on range_parted1 +(3 rows) + +drop table range_parted; diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out index 7806ba1d47..0789b316eb 100644 --- a/src/test/regress/expected/partition_prune.out +++ b/src/test/regress/expected/partition_prune.out @@ -3078,14 +3078,14 @@ drop table boolp; -- set enable_seqscan = off; set enable_sort = off; -create table ma_test (a int) partition by range (a); +create table ma_test (a int, b int) partition by range (a); create table ma_test_p1 partition of ma_test for values from (0) to (10); create table ma_test_p2 partition of ma_test for values from (10) to (20); create table ma_test_p3 partition of ma_test for values from (20) to (30); -insert into ma_test select x from generate_series(0,29) t(x); -create index on ma_test (a); +insert into ma_test select x,x from generate_series(0,29) t(x); +create index on ma_test (b); analyze ma_test; -prepare mt_q1 (int) as select * from ma_test where a >= $1 and a % 10 = 5 order by a; +prepare mt_q1 (int) as select a from ma_test where a >= $1 and a % 10 = 5 order by b; -- Execute query 5 times to allow choose_custom_plan -- to start considering a generic plan. execute mt_q1(0); @@ -3132,17 +3132,15 @@ explain (analyze, costs off, summary off, timing off) execute mt_q1(15); QUERY PLAN ------------------------------------------------------------------------------- Merge Append (actual rows=2 loops=1) - Sort Key: ma_test_p2.a + Sort Key: ma_test_p2.b Subplans Removed: 1 - -> Index Scan using ma_test_p2_a_idx on ma_test_p2 (actual rows=1 loops=1) - Index Cond: (a >= $1) - Filter: ((a % 10) = 5) - Rows Removed by Filter: 4 - -> Index Scan using ma_test_p3_a_idx on ma_test_p3 (actual rows=1 loops=1) - Index Cond: (a >= $1) - Filter: ((a % 10) = 5) + -> Index Scan using ma_test_p2_b_idx on ma_test_p2 (actual rows=1 loops=1) + Filter: ((a >= $1) AND ((a % 10) = 5)) Rows Removed by Filter: 9 -(11 rows) + -> Index Scan using ma_test_p3_b_idx on ma_test_p3 (actual rows=1 loops=1) + Filter: ((a >= $1) AND ((a % 10) = 5)) + Rows Removed by Filter: 9 +(9 rows) execute mt_q1(15); a @@ -3155,13 +3153,12 @@ explain (analyze, costs off, summary off, timing off) execute mt_q1(25); QUERY PLAN ------------------------------------------------------------------------------- Merge Append (actual rows=1 loops=1) - Sort Key: ma_test_p3.a + Sort Key: ma_test_p3.b Subplans Removed: 2 - -> Index Scan using ma_test_p3_a_idx on ma_test_p3 (actual rows=1 loops=1) - Index Cond: (a >= $1) - Filter: ((a % 10) = 5) - Rows Removed by Filter: 4 -(7 rows) + -> Index Scan using ma_test_p3_b_idx on ma_test_p3 (actual rows=1 loops=1) + Filter: ((a >= $1) AND ((a % 10) = 5)) + Rows Removed by Filter: 9 +(6 rows) execute mt_q1(25); a @@ -3174,12 +3171,11 @@ explain (analyze, costs off, summary off, timing off) execute mt_q1(35); QUERY PLAN ------------------------------------------------------------------------ Merge Append (actual rows=0 loops=1) - Sort Key: ma_test_p1.a + Sort Key: ma_test_p1.b Subplans Removed: 2 - -> Index Scan using ma_test_p1_a_idx on ma_test_p1 (never executed) - Index Cond: (a >= $1) - Filter: ((a % 10) = 5) -(6 rows) + -> Index Scan using ma_test_p1_b_idx on ma_test_p1 (never executed) + Filter: ((a >= $1) AND ((a % 10) = 5)) +(5 rows) execute mt_q1(35); a @@ -3188,23 +3184,23 @@ execute mt_q1(35); deallocate mt_q1; -- ensure initplan params properly prune partitions -explain (analyze, costs off, summary off, timing off) select * from ma_test where a >= (select min(a) from ma_test_p2) order by a; +explain (analyze, costs off, summary off, timing off) select * from ma_test where a >= (select min(b) from ma_test_p2) order by b; QUERY PLAN ------------------------------------------------------------------------------------------------------------ Merge Append (actual rows=20 loops=1) - Sort Key: ma_test_p1.a + Sort Key: ma_test_p1.b InitPlan 2 (returns $1) -> Result (actual rows=1 loops=1) InitPlan 1 (returns $0) -> Limit (actual rows=1 loops=1) - -> Index Scan using ma_test_p2_a_idx on ma_test_p2 ma_test_p2_1 (actual rows=1 loops=1) - Index Cond: (a IS NOT NULL) - -> Index Scan using ma_test_p1_a_idx on ma_test_p1 (never executed) - Index Cond: (a >= $1) - -> Index Scan using ma_test_p2_a_idx on ma_test_p2 (actual rows=10 loops=1) - Index Cond: (a >= $1) - -> Index Scan using ma_test_p3_a_idx on ma_test_p3 (actual rows=10 loops=1) - Index Cond: (a >= $1) + -> Index Scan using ma_test_p2_b_idx on ma_test_p2 ma_test_p2_1 (actual rows=1 loops=1) + Index Cond: (b IS NOT NULL) + -> Index Scan using ma_test_p1_b_idx on ma_test_p1 (never executed) + Filter: (a >= $1) + -> Index Scan using ma_test_p2_b_idx on ma_test_p2 (actual rows=10 loops=1) + Filter: (a >= $1) + -> Index Scan using ma_test_p3_b_idx on ma_test_p3 (actual rows=10 loops=1) + Filter: (a >= $1) (14 rows) reset enable_seqscan; diff --git a/src/test/regress/sql/inherit.sql b/src/test/regress/sql/inherit.sql index 5480fe7db4..e91f61bc37 100644 --- a/src/test/regress/sql/inherit.sql +++ b/src/test/regress/sql/inherit.sql @@ -715,7 +715,6 @@ explain (costs off) select * from mcrparted where abs(b) = 5; -- scans all parti explain (costs off) select * from mcrparted where a > -1; -- scans all partitions explain (costs off) select * from mcrparted where a = 20 and abs(b) = 10 and c > 10; -- scans mcrparted4 explain (costs off) select * from mcrparted where a = 20 and c > 20; -- scans mcrparted3, mcrparte4, mcrparte5, mcrparted_def -drop table mcrparted; -- check that partitioned table Appends cope with being referenced in -- subplans @@ -726,3 +725,111 @@ insert into parted_minmax values (1,'12345'); explain (costs off) select min(a), max(a) from parted_minmax where b = '12345'; select min(a), max(a) from parted_minmax where b = '12345'; drop table parted_minmax; + +-- Test code that uses Append nodes in place of MergeAppend when the +-- partition ordering matches the desired ordering. + +create index mcrparted_a_abs_c_idx on mcrparted (a, abs(b), c); + +-- MergeAppend must be used when a default partition exists +explain (costs off) select * from mcrparted order by a, abs(b), c; + +drop table mcrparted_def; + +-- Append is used for a RANGE partitioned table with no default +-- and no subpartitions +explain (costs off) select * from mcrparted order by a, abs(b), c; + +-- Append is used with subpaths in reverse order with backwards index scans +explain (costs off) select * from mcrparted order by a desc, abs(b) desc, c desc; + +-- check that Append plan is used containing a MergeAppend for sub-partitions +-- that are unordered. +drop table mcrparted5; +create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue) partition by list (a); +create table mcrparted5a partition of mcrparted5 for values in(20); +create table mcrparted5_def partition of mcrparted5 default; + +explain (costs off) select * from mcrparted order by a, abs(b), c; + +drop table mcrparted5_def; + +-- check that an Append plan is used and the sub-partitions are flattened +-- into the main Append when the sub-partition is unordered but contains +-- just a single sub-partition. +explain (costs off) select a, abs(b) from mcrparted order by a, abs(b), c; + +-- check that Append is used when the sub-partitioned tables are pruned +-- during planning. +explain (costs off) select * from mcrparted where a < 20 order by a, abs(b), c; + +create table mclparted (a int) partition by list(a); +create table mclparted1 partition of mclparted for values in(1); +create table mclparted2 partition of mclparted for values in(2); +create index on mclparted (a); + +-- Ensure an Append is used for a list partition with an order by. +explain (costs off) select * from mclparted order by a; + +-- Ensure a MergeAppend is used when a partition exists with interleaved +-- datums in the partition bound. +create table mclparted3_5 partition of mclparted for values in(3,5); +create table mclparted4 partition of mclparted for values in(4); + +explain (costs off) select * from mclparted order by a; + +drop table mclparted; + +-- Ensure subplans which don't have a path with the correct pathkeys get +-- sorted correctly. +drop index mcrparted_a_abs_c_idx; +create index on mcrparted1 (a, abs(b), c); +create index on mcrparted2 (a, abs(b), c); +create index on mcrparted3 (a, abs(b), c); +create index on mcrparted4 (a, abs(b), c); + +explain (costs off) select * from mcrparted where a < 20 order by a, abs(b), c limit 1; + +set enable_bitmapscan = 0; +-- Ensure Append node can be used when the partition is ordered by some +-- pathkeys which were deemed redundant. +explain (costs off) select * from mcrparted where a = 10 order by a, abs(b), c; +reset enable_bitmapscan; + +drop table mcrparted; + +-- Ensure LIST partitions allow an Append to be used instead of a MergeAppend +create table bool_lp (b bool) partition by list(b); +create table bool_lp_true partition of bool_lp for values in(true); +create table bool_lp_false partition of bool_lp for values in(false); +create index on bool_lp (b); + +explain (costs off) select * from bool_lp order by b; + +drop table bool_lp; + +-- Ensure const bool quals can be properly detected as redundant +create table bool_rp (b bool, a int) partition by range(b,a); +create table bool_rp_false_1k partition of bool_rp for values from (false,0) to (false,1000); +create table bool_rp_true_1k partition of bool_rp for values from (true,0) to (true,1000); +create table bool_rp_false_2k partition of bool_rp for values from (false,1000) to (false,2000); +create table bool_rp_true_2k partition of bool_rp for values from (true,1000) to (true,2000); +create index on bool_rp (b,a); +explain (costs off) select * from bool_rp where b = true order by b,a; +explain (costs off) select * from bool_rp where b = false order by b,a; +explain (costs off) select * from bool_rp where b = true order by a; +explain (costs off) select * from bool_rp where b = false order by a; + +drop table bool_rp; + +-- Ensure an Append scan is chosen when the partition order is a subset of +-- the required order. +create table range_parted (a int, b int, c int) partition by range(a, b); +create table range_parted1 partition of range_parted for values from (0,0) to (10,10); +create table range_parted2 partition of range_parted for values from (10,10) to (20,20); +create index on range_parted (a,b,c); + +explain (costs off) select * from range_parted order by a,b,c; +explain (costs off) select * from range_parted order by a desc,b desc,c desc; + +drop table range_parted; diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql index 2e4d2b483d..c30e58eef7 100644 --- a/src/test/regress/sql/partition_prune.sql +++ b/src/test/regress/sql/partition_prune.sql @@ -775,15 +775,15 @@ drop table boolp; -- set enable_seqscan = off; set enable_sort = off; -create table ma_test (a int) partition by range (a); +create table ma_test (a int, b int) partition by range (a); create table ma_test_p1 partition of ma_test for values from (0) to (10); create table ma_test_p2 partition of ma_test for values from (10) to (20); create table ma_test_p3 partition of ma_test for values from (20) to (30); -insert into ma_test select x from generate_series(0,29) t(x); -create index on ma_test (a); +insert into ma_test select x,x from generate_series(0,29) t(x); +create index on ma_test (b); analyze ma_test; -prepare mt_q1 (int) as select * from ma_test where a >= $1 and a % 10 = 5 order by a; +prepare mt_q1 (int) as select a from ma_test where a >= $1 and a % 10 = 5 order by b; -- Execute query 5 times to allow choose_custom_plan -- to start considering a generic plan. @@ -804,7 +804,7 @@ execute mt_q1(35); deallocate mt_q1; -- ensure initplan params properly prune partitions -explain (analyze, costs off, summary off, timing off) select * from ma_test where a >= (select min(a) from ma_test_p2) order by a; +explain (analyze, costs off, summary off, timing off) select * from ma_test where a >= (select min(b) from ma_test_p2) order by b; reset enable_seqscan; reset enable_sort; -- 2.40.0