From 4e23236403336052a000c8bf85720e9d65ba8036 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 11 Jun 2018 17:14:46 -0400 Subject: [PATCH] Improve commentary about run-time partition pruning data structures. No code changes except for a couple of new Asserts. David Rowley and Tom Lane Discussion: https://postgr.es/m/CAKJS1f-6GODRNgEtdPxCnAPme2h2hTztB6LmtfdmcYAAOE0kQg@mail.gmail.com --- src/backend/optimizer/path/allpaths.c | 3 +- src/backend/partitioning/partprune.c | 39 ++++++++++++++---------- src/include/executor/execPartition.h | 43 +++++++++++++-------------- src/include/nodes/plannodes.h | 27 +++++++++++++---- src/include/nodes/relation.h | 4 +-- 5 files changed, 71 insertions(+), 45 deletions(-) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 477b9f7fb8..146298828d 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -1451,7 +1451,8 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, /* * If we need to build partitioned_rels, accumulate the partitioned - * rels for this child. + * rels for this child. We must ensure that parents are always listed + * before their child partitioned tables. */ if (build_partitioned_rels) { diff --git a/src/backend/partitioning/partprune.c b/src/backend/partitioning/partprune.c index aaad35badb..856bdd3a14 100644 --- a/src/backend/partitioning/partprune.c +++ b/src/backend/partitioning/partprune.c @@ -202,12 +202,17 @@ make_partition_pruneinfo(PlannerInfo *root, List *partition_rels, int i; /* - * Allocate two arrays to store the 1-based indexes of the 'subpaths' and - * 'partitioned_rels' by relid. + * Construct two temporary arrays to map from planner relids to subplan + * and sub-partition indexes. For convenience, we use 1-based indexes + * here, so that zero can represent an un-filled array entry. */ relid_subplan_map = palloc0(sizeof(int) * root->simple_rel_array_size); relid_subpart_map = palloc0(sizeof(int) * root->simple_rel_array_size); + /* + * relid_subplan_map maps relid of a leaf partition to the index in + * 'subpaths' of the scan plan for that partition. + */ i = 1; foreach(lc, subpaths) { @@ -216,17 +221,27 @@ make_partition_pruneinfo(PlannerInfo *root, List *partition_rels, Assert(IS_SIMPLE_REL(pathrel)); Assert(pathrel->relid < root->simple_rel_array_size); + /* No duplicates please */ + Assert(relid_subplan_map[pathrel->relid] == 0); relid_subplan_map[pathrel->relid] = i++; } - /* Likewise for the partition_rels */ + /* + * relid_subpart_map maps relid of a non-leaf partition to the index in + * 'partition_rels' of that rel (which will also be the index in the + * returned PartitionPruneInfo list of the info for that partition). + */ i = 1; foreach(lc, partition_rels) { Index rti = lfirst_int(lc); Assert(rti < root->simple_rel_array_size); + /* No duplicates please */ + Assert(relid_subpart_map[rti] == 0); + /* Same rel cannot be both leaf and non-leaf */ + Assert(relid_subplan_map[rti] == 0); relid_subpart_map[rti] = i++; } @@ -287,16 +302,16 @@ make_partition_pruneinfo(PlannerInfo *root, List *partition_rels, return NIL; } + /* + * Construct the subplan and subpart maps for this partitioning level. + * Here we convert to zero-based indexes, with -1 for empty entries. + * Also construct a Bitmapset of all partitions that are present (that + * is, not pruned already). + */ subplan_map = (int *) palloc(nparts * sizeof(int)); subpart_map = (int *) palloc(nparts * sizeof(int)); present_parts = NULL; - /* - * Loop over each partition of the partitioned rel and record the - * subpath index for each. Any partitions which are not present in - * the subpaths List will be set to -1, and any sub-partitioned table - * which is not present will also be set to -1. - */ for (i = 0; i < nparts; i++) { RelOptInfo *partrel = subpart->part_rels[i]; @@ -305,12 +320,6 @@ make_partition_pruneinfo(PlannerInfo *root, List *partition_rels, subplan_map[i] = subplanidx; subpart_map[i] = subpartidx; - - /* - * Record the indexes of all the partition indexes that we have - * subplans or subparts for. This allows an optimization to skip - * attempting any run-time pruning when it's irrelevant. - */ if (subplanidx >= 0 || subpartidx >= 0) present_parts = bms_add_member(present_parts, i); } diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h index 9f0b817c54..71d639fe65 100644 --- a/src/include/executor/execPartition.h +++ b/src/include/executor/execPartition.h @@ -113,30 +113,27 @@ typedef struct PartitionTupleRouting } PartitionTupleRouting; /*----------------------- - * PartitionPruningData - Encapsulates all information required to support - * elimination of partitions in plan types which support arbitrary Lists of - * subplans. Information stored here allows the partition pruning functions - * to be called and the return value of partition indexes translated into the - * subpath indexes of plan types such as Append, thus allowing us to bypass a - * subplan when we can prove that no tuple matching the 'pruning_steps' will - * be found within. + * PartitionPruningData - Per-partitioned-table data for run-time pruning + * of partitions. For a multilevel partitioned table, we have one of these + * for the topmost partition plus one for each non-leaf child partition, + * ordered such that parents appear before their children. * - * subplan_map An array containing the subplan index which - * matches this partition index, or -1 if the - * subplan has been pruned already. - * subpart_map An array containing the index into the - * partprunedata array in PartitionPruneState, or - * -1 if there is no such element in that array. + * subplan_map[] and subpart_map[] have the same definitions as in + * PartitionPruneInfo (see plannodes.h); though note that here, + * subpart_map contains indexes into PartitionPruneState.partprunedata[]. + * + * subplan_map Subplan index by partition index, or -1. + * subpart_map Subpart index by partition index, or -1. * present_parts A Bitmapset of the partition indexes that we - * have subplans mapped for. + * have subplans or subparts for. * context Contains the context details required to call * the partition pruning code. * pruning_steps List of PartitionPruneSteps used to * perform the actual pruning. * do_initial_prune true if pruning should be performed during - * executor startup. + * executor startup (for this partitioning level). * do_exec_prune true if pruning should be performed during - * executor run. + * executor run (for this partitioning level). *----------------------- */ typedef struct PartitionPruningData @@ -152,16 +149,18 @@ typedef struct PartitionPruningData /*----------------------- * PartitionPruneState - State object required for plan nodes to perform - * partition pruning elimination of their subplans. This encapsulates a - * flattened hierarchy of PartitionPruningData structs. + * run-time partition pruning. + * * This struct can be attached to plan types which support arbitrary Lists of * subplans containing partitions to allow subplans to be eliminated due to * the clauses being unable to match to any tuple that the subplan could - * possibly produce. + * possibly produce. Note that we currently support only one partitioned + * table per parent plan node, hence partprunedata[] need describe only one + * partitioning hierarchy. * - * partprunedata Array of PartitionPruningData for the plan's target - * partitioned relation. First element contains the - * details for the target partitioned table. + * partprunedata Array of PartitionPruningData for the plan's + * partitioned relation, ordered such that parent tables + * appear before children (hence, topmost table is first). * num_partprunedata Number of items in 'partprunedata' array. * do_initial_prune true if pruning should be performed during executor * startup (at any hierarchy level). diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index dacc50edc2..5201c6d4bc 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -1059,18 +1059,35 @@ typedef struct PlanRowMark * plan types which support arbitrary numbers of subplans, such as Append. * We also store various details to tell the executor when it should be * performing partition pruning. + * + * Each PartitionPruneInfo describes the partitioning rules for a single + * partitioned table (a/k/a level of partitioning). For a multilevel + * partitioned table, we have a List of PartitionPruneInfos, where the + * first entry represents the topmost partitioned table and additional + * entries represent non-leaf child partitions, ordered such that parents + * appear before their children. + * + * subplan_map[] and subpart_map[] are indexed by partition index (where + * zero is the topmost partition, and non-leaf partitions must come before + * their children). For a leaf partition p, subplan_map[p] contains the + * zero-based index of the partition's subplan in the parent plan's subplan + * list; it is -1 if the partition is non-leaf or has been pruned. For a + * non-leaf partition p, subpart_map[p] contains the zero-based index of + * that sub-partition's PartitionPruneInfo in the plan's PartitionPruneInfo + * list; it is -1 if the partition is a leaf or has been pruned. All these + * indexes are global across the whole partitioned table and Append plan node. */ typedef struct PartitionPruneInfo { NodeTag type; - Oid reloid; /* Oid of partition rel */ + Oid reloid; /* OID of partition rel for this level */ List *pruning_steps; /* List of PartitionPruneStep, see below */ - Bitmapset *present_parts; /* Indexes of all partitions which subplans - * are present for. */ + Bitmapset *present_parts; /* Indexes of all partitions which subplans or + * subparts are present for. */ int nparts; /* Length of subplan_map[] and subpart_map[] */ int nexprs; /* Length of hasexecparam[] */ - int *subplan_map; /* subplan index by partition id, or -1 */ - int *subpart_map; /* subpart index by partition id, or -1 */ + int *subplan_map; /* subplan index by partition index, or -1 */ + int *subpart_map; /* subpart index by partition index, or -1 */ bool *hasexecparam; /* true if corresponding pruning_step contains * any PARAM_EXEC Params. */ bool do_initial_prune; /* true if pruning should be performed diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 3b28d1994f..5af484024a 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -552,8 +552,8 @@ typedef struct PartitionSchemeData *PartitionScheme; * part_rels - RelOptInfos for each partition * partexprs, nullable_partexprs - Partition key expressions * partitioned_child_rels - RT indexes of unpruned partitions of - * relation that are partitioned tables - * themselves + * this relation that are partitioned tables + * themselves, in hierarchical order * * Note: A base relation always has only one set of partition keys, but a join * relation may have as many sets of partition keys as the number of relations -- 2.40.0