Redesign initialization of partition routing structures
author Alvaro Herrera <alvherre@alvh.no-ip.org>
Fri, 16 Nov 2018 17:54:15 +0000 (14:54 -0300)
committer Alvaro Herrera <alvherre@alvh.no-ip.org>
Fri, 16 Nov 2018 18:01:05 +0000 (15:01 -0300)
This speeds up write operations (INSERT, UPDATE, DELETE, COPY, as well
as the future MERGE) on partitioned tables.

This changes the setup for tuple routing so that it does far less work
during the initial setup and pushes more work out to when partitions
receive tuples.  PartitionDispatchData structs for sub-partitioned
tables are only created when a tuple gets routed through them.  The
possibly large arrays in the PartitionTupleRouting struct have largely
been removed.  The partitions[] array remains but now never contains any
NULL gaps.  Previously the NULLs had to be skipped during
ExecCleanupTupleRouting(), which could add a large overhead to the
cleanup when the number of partitions was large.  The partitions[] array
is allocated small to start with and only enlarged when we route tuples
to enough partitions that it runs out of space. This allows us to keep
simple single-row partition INSERTs running quickly.
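
As a rough sketch, the on-demand growth of partitions[] works like this
(simplified; the real code is in ExecInitRoutingInfo() in the
execPartition.c hunk further down):

    /* Enlarge the partitions[] array only when it fills up. */
    if (proute->num_partitions >= proute->max_partitions)
    {
        if (proute->max_partitions == 0)
        {
            /* start small, to keep single-row INSERTs cheap */
            proute->max_partitions = 8;
            proute->partitions = (ResultRelInfo **)
                palloc(sizeof(ResultRelInfo *) * proute->max_partitions);
        }
        else
        {
            /* double the array when we run out of space */
            proute->max_partitions *= 2;
            proute->partitions = (ResultRelInfo **)
                repalloc(proute->partitions,
                         sizeof(ResultRelInfo *) * proute->max_partitions);
        }
    }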

The arrays in PartitionTupleRouting that stored the tuple translation maps
have been removed.  The maps are now stored in a PartitionRoutingInfo
struct, which is an additional field in ResultRelInfo.
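
For reference, the new struct has roughly this shape (the field names are
the ones referenced throughout the hunks below; the full definition, with
its comments, is added to src/include/executor/execPartition.h and is not
shown in this excerpt):

    typedef struct PartitionRoutingInfo
    {
        /* map to convert a root-table tuple into this partition's rowtype */
        TupleConversionMap *pi_RootToPartitionMap;
        /* map to convert a partition tuple back into the root's rowtype */
        TupleConversionMap *pi_PartitionToRootMap;
        /* slot to hold a tuple converted into the partition's rowtype */
        TupleTableSlot *pi_PartitionTupleSlot;
    } PartitionRoutingInfo;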

The find_all_inheritors() call remains by far the slowest part of
ExecSetupPartitionTupleRouting(); this commit just removes the other slow
parts.

In passing, also rename the tuple translation maps from ParentToChild and
ChildToParent to RootToPartition and PartitionToRoot.  The old names could
mislead one into thinking that a partition of a sub-partitioned table would
translate to the rowtype of the sub-partitioned table rather than that of
the root partitioned table.
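
For example, in a (hypothetical) hierarchy root -> sub -> leaf, the maps
kept for leaf always convert against root's rowtype, never sub's:

    pi_RootToPartitionMap   /* root's rowtype -> leaf's rowtype */
    pi_PartitionToRootMap   /* leaf's rowtype -> root's rowtype */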

Authors: David Rowley and Amit Langote, heavily revised by Álvaro Herrera
Testing help from Jesper Pedersen and Kato Sho.
Discussion: https://postgr.es/m/CAKJS1f_1RJyFquuCKRFHTdcXqoPX-PYqAd7nz=GVBwvGh4a6xA@mail.gmail.com

src/backend/commands/copy.c
src/backend/executor/execMain.c
src/backend/executor/execPartition.c
src/backend/executor/nodeModifyTable.c
src/backend/optimizer/prep/prepunion.c
src/backend/utils/cache/partcache.c
src/include/catalog/partition.h
src/include/executor/execPartition.h
src/include/nodes/execnodes.h

diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index e62e3d8fba2029467681fb8fcb2232e79ae0de7f..6588ebd6dc001ed8d63affb1eb6fff71b4b8ea98 100644
@@ -2316,6 +2316,7 @@ CopyFrom(CopyState cstate)
        bool       *nulls;
        ResultRelInfo *resultRelInfo;
        ResultRelInfo *target_resultRelInfo;
+       ResultRelInfo *prevResultRelInfo = NULL;
        EState     *estate = CreateExecutorState(); /* for ExecConstraints() */
        ModifyTableState *mtstate;
        ExprContext *econtext;
@@ -2331,7 +2332,6 @@ CopyFrom(CopyState cstate)
        CopyInsertMethod insertMethod;
        uint64          processed = 0;
        int                     nBufferedTuples = 0;
-       int                     prev_leaf_part_index = -1;
        bool            has_before_insert_row_trig;
        bool            has_instead_insert_row_trig;
        bool            leafpart_use_multi_insert = false;
@@ -2515,8 +2515,12 @@ CopyFrom(CopyState cstate)
        /*
         * If there are any triggers with transition tables on the named relation,
         * we need to be prepared to capture transition tuples.
+        *
+        * Because partition tuple routing needs to know whether
+        * transition capture is active, we also set it in mtstate, which is
+        * passed to ExecFindPartition() below.
         */
-       cstate->transition_capture =
+       cstate->transition_capture = mtstate->mt_transition_capture =
                MakeTransitionCaptureState(cstate->rel->trigdesc,
                                                                   RelationGetRelid(cstate->rel),
                                                                   CMD_INSERT);
@@ -2526,19 +2530,8 @@ CopyFrom(CopyState cstate)
         * CopyFrom tuple routing.
         */
        if (cstate->rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
-       {
                proute = ExecSetupPartitionTupleRouting(NULL, cstate->rel);
 
-               /*
-                * If we are capturing transition tuples, they may need to be
-                * converted from partition format back to partitioned table format
-                * (this is only ever necessary if a BEFORE trigger modifies the
-                * tuple).
-                */
-               if (cstate->transition_capture != NULL)
-                       ExecSetupChildParentMapForLeaf(proute);
-       }
-
        /*
         * It's more efficient to prepare a bunch of tuples for insertion, and
         * insert them in one heap_multi_insert() call, than call heap_insert()
@@ -2694,25 +2687,17 @@ CopyFrom(CopyState cstate)
                /* Determine the partition to heap_insert the tuple into */
                if (proute)
                {
-                       int                     leaf_part_index;
                        TupleConversionMap *map;
 
                        /*
-                        * Away we go ... If we end up not finding a partition after all,
-                        * ExecFindPartition() does not return and errors out instead.
-                        * Otherwise, the returned value is to be used as an index into
-                        * arrays mt_partitions[] and mt_partition_tupconv_maps[] that
-                        * will get us the ResultRelInfo and TupleConversionMap for the
-                        * partition, respectively.
+                        * Attempt to find a partition suitable for this tuple.
+                        * ExecFindPartition() will raise an error if none can be found or
+                        * if the found partition is not suitable for INSERTs.
                         */
-                       leaf_part_index = ExecFindPartition(target_resultRelInfo,
-                                                                                               proute->partition_dispatch_info,
-                                                                                               slot,
-                                                                                               estate);
-                       Assert(leaf_part_index >= 0 &&
-                                  leaf_part_index < proute->num_partitions);
-
-                       if (prev_leaf_part_index != leaf_part_index)
+                       resultRelInfo = ExecFindPartition(mtstate, target_resultRelInfo,
+                                                                                         proute, slot, estate);
+
+                       if (prevResultRelInfo != resultRelInfo)
                        {
                                /* Check if we can multi-insert into this partition */
                                if (insertMethod == CIM_MULTI_CONDITIONAL)
@@ -2725,12 +2710,9 @@ CopyFrom(CopyState cstate)
                                        if (nBufferedTuples > 0)
                                        {
                                                ExprContext *swapcontext;
-                                               ResultRelInfo *presultRelInfo;
-
-                                               presultRelInfo = proute->partitions[prev_leaf_part_index];
 
                                                CopyFromInsertBatch(cstate, estate, mycid, hi_options,
-                                                                                       presultRelInfo, myslot, bistate,
+                                                                                       prevResultRelInfo, myslot, bistate,
                                                                                        nBufferedTuples, bufferedTuples,
                                                                                        firstBufferedLineNo);
                                                nBufferedTuples = 0;
@@ -2787,21 +2769,6 @@ CopyFrom(CopyState cstate)
                                        }
                                }
 
-                               /*
-                                * Overwrite resultRelInfo with the corresponding partition's
-                                * one.
-                                */
-                               resultRelInfo = proute->partitions[leaf_part_index];
-                               if (unlikely(resultRelInfo == NULL))
-                               {
-                                       resultRelInfo = ExecInitPartitionInfo(mtstate,
-                                                                                                                 target_resultRelInfo,
-                                                                                                                 proute, estate,
-                                                                                                                 leaf_part_index);
-                                       proute->partitions[leaf_part_index] = resultRelInfo;
-                                       Assert(resultRelInfo != NULL);
-                               }
-
                                /* Determine which triggers exist on this partition */
                                has_before_insert_row_trig = (resultRelInfo->ri_TrigDesc &&
                                                                                          resultRelInfo->ri_TrigDesc->trig_insert_before_row);
@@ -2827,7 +2794,7 @@ CopyFrom(CopyState cstate)
                                 * buffer when the partition being inserted into changes.
                                 */
                                ReleaseBulkInsertStatePin(bistate);
-                               prev_leaf_part_index = leaf_part_index;
+                               prevResultRelInfo = resultRelInfo;
                        }
 
                        /*
@@ -2837,7 +2804,7 @@ CopyFrom(CopyState cstate)
 
                        /*
                         * If we're capturing transition tuples, we might need to convert
-                        * from the partition rowtype to parent rowtype.
+                        * from the partition rowtype to root rowtype.
                         */
                        if (cstate->transition_capture != NULL)
                        {
@@ -2850,8 +2817,7 @@ CopyFrom(CopyState cstate)
                                         */
                                        cstate->transition_capture->tcs_original_insert_tuple = NULL;
                                        cstate->transition_capture->tcs_map =
-                                               TupConvMapForLeaf(proute, target_resultRelInfo,
-                                                                                 leaf_part_index);
+                                               resultRelInfo->ri_PartitionInfo->pi_PartitionToRootMap;
                                }
                                else
                                {
@@ -2865,18 +2831,18 @@ CopyFrom(CopyState cstate)
                        }
 
                        /*
-                        * We might need to convert from the parent rowtype to the
-                        * partition rowtype.
+                        * We might need to convert from the root rowtype to the partition
+                        * rowtype.
                         */
-                       map = proute->parent_child_tupconv_maps[leaf_part_index];
+                       map = resultRelInfo->ri_PartitionInfo->pi_RootToPartitionMap;
                        if (map != NULL)
                        {
                                TupleTableSlot *new_slot;
                                MemoryContext oldcontext;
 
-                               Assert(proute->partition_tuple_slots != NULL &&
-                                          proute->partition_tuple_slots[leaf_part_index] != NULL);
-                               new_slot = proute->partition_tuple_slots[leaf_part_index];
+                               new_slot = resultRelInfo->ri_PartitionInfo->pi_PartitionTupleSlot;
+                               Assert(new_slot != NULL);
+
                                slot = execute_attr_map_slot(map->attrMap, slot, new_slot);
 
                                /*
@@ -3021,12 +2987,8 @@ CopyFrom(CopyState cstate)
        {
                if (insertMethod == CIM_MULTI_CONDITIONAL)
                {
-                       ResultRelInfo *presultRelInfo;
-
-                       presultRelInfo = proute->partitions[prev_leaf_part_index];
-
                        CopyFromInsertBatch(cstate, estate, mycid, hi_options,
-                                                               presultRelInfo, myslot, bistate,
+                                                               prevResultRelInfo, myslot, bistate,
                                                                nBufferedTuples, bufferedTuples,
                                                                firstBufferedLineNo);
                }
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index 74398eb46433c8f744e22c20c84f915e6460c4d2..757df0705d80545c3f2110de732c506ad2301140 100644
@@ -1345,7 +1345,7 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo,
 
        resultRelInfo->ri_PartitionCheck = partition_check;
        resultRelInfo->ri_PartitionRoot = partition_root;
-       resultRelInfo->ri_PartitionReadyForRouting = false;
+       resultRelInfo->ri_PartitionInfo = NULL; /* may be set later */
 }
 
 /*
diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c
index e11fe68712175ca9073ac71f4fb876c7dbf9b2b5..e3cb4fb1bee24192ee88a9d2c9cbf8b54f5c4953 100644
 #include "utils/ruleutils.h"
 
 
+/*-----------------------
+ * PartitionTupleRouting - Encapsulates all information required to
+ * route a tuple inserted into a partitioned table to one of its leaf
+ * partitions.
+ *
+ * partition_root
+ *             The partitioned table that's the target of the command.
+ *
+ * partition_dispatch_info
+ *             Array of 'max_dispatch' elements containing a pointer to a
+ *             PartitionDispatch object for every partitioned table touched by tuple
+ *             routing.  The entry for the target partitioned table is *always*
+ *             present in the 0th element of this array.  See comment for
+ *             PartitionDispatchData->indexes for details on how this array is
+ *             indexed.
+ *
+ * num_dispatch
+ *             The current number of items stored in the 'partition_dispatch_info'
+ *             array.  Also serves as the index of the next free array element for
+ *             new PartitionDispatch objects that need to be stored.
+ *
+ * max_dispatch
+ *             The current allocated size of the 'partition_dispatch_info' array.
+ *
+ * partitions
+ *             Array of 'max_partitions' elements containing a pointer to a
+ *             ResultRelInfo for every leaf partition touched by tuple routing.
+ *             Some of these are pointers to ResultRelInfos which are borrowed out of
+ *             'subplan_resultrel_htab'.  The remainder have been built especially
+ *             for tuple routing.  See comment for PartitionDispatchData->indexes for
+ *             details on how this array is indexed.
+ *
+ * num_partitions
+ *             The current number of items stored in the 'partitions' array.  Also
+ *             serves as the index of the next free array element for new
+ *             ResultRelInfo objects that need to be stored.
+ *
+ * max_partitions
+ *             The current allocated size of the 'partitions' array.
+ *
+ * subplan_resultrel_htab
+ *             Hash table to store subplan ResultRelInfos by Oid.  This is used to
+ *             cache ResultRelInfos from subplans of an UPDATE ModifyTable node;
+ *             NULL in other cases.  Some of these may be useful for tuple routing
+ *             to save having to build duplicates.
+ *
+ * memcxt
+ *             Memory context used to allocate subsidiary structs.
+ *-----------------------
+ */
+typedef struct PartitionTupleRouting
+{
+       Relation        partition_root;
+       PartitionDispatch *partition_dispatch_info;
+       int                     num_dispatch;
+       int                     max_dispatch;
+       ResultRelInfo **partitions;
+       int                     num_partitions;
+       int                     max_partitions;
+       HTAB       *subplan_resultrel_htab;
+       MemoryContext memcxt;
+} PartitionTupleRouting;
+
 /*-----------------------
  * PartitionDispatch - information about one partitioned table in a partition
- * hierarchy required to route a tuple to one of its partitions
+ * hierarchy required to route a tuple to any of its partitions.  A
+ * PartitionDispatch is always encapsulated inside a PartitionTupleRouting
+ * struct and stored inside its 'partition_dispatch_info' array.
  *
- *     reldesc         Relation descriptor of the table
- *     key                     Partition key information of the table
- *     keystate        Execution state required for expressions in the partition key
- *     partdesc        Partition descriptor of the table
- *     tupslot         A standalone TupleTableSlot initialized with this table's tuple
- *                             descriptor
- *     tupmap          TupleConversionMap to convert from the parent's rowtype to
- *                             this table's rowtype (when extracting the partition key of a
- *                             tuple just before routing it through this table)
- *     indexes         Array with partdesc->nparts members (for details on what
- *                             individual members represent, see how they are set in
- *                             get_partition_dispatch_recurse())
+ * reldesc
+ *             Relation descriptor of the table
+ * key
+ *             Partition key information of the table
+ * keystate
+ *             Execution state required for expressions in the partition key
+ * partdesc
+ *             Partition descriptor of the table
+ * tupslot
+ *             A standalone TupleTableSlot initialized with this table's tuple
+ *             descriptor, or NULL if no tuple conversion between the parent is
+ *             descriptor, or NULL if no tuple conversion from the parent is
+ * tupmap
+ *             TupleConversionMap to convert from the parent's rowtype to this table's
+ *             rowtype  (when extracting the partition key of a tuple just before
+ *             routing it through this table). A NULL value is stored if no tuple
+ *             conversion is required.
+ * indexes
+ *             Array of partdesc->nparts elements.  For leaf partitions the index
+ *             corresponds to the partition's ResultRelInfo in the encapsulating
+ *             PartitionTupleRouting's partitions array.  For partitioned partitions,
+ *             the index corresponds to the PartitionDispatch for it in its
+ *             partition_dispatch_info array.  -1 indicates we've not yet allocated
+ *             anything in PartitionTupleRouting for the partition.
  *-----------------------
  */
 typedef struct PartitionDispatchData
@@ -58,14 +135,32 @@ typedef struct PartitionDispatchData
        PartitionDesc partdesc;
        TupleTableSlot *tupslot;
        AttrNumber *tupmap;
-       int                *indexes;
+       int                     indexes[FLEXIBLE_ARRAY_MEMBER];
 } PartitionDispatchData;
 
-
-static PartitionDispatch *RelationGetPartitionDispatchInfo(Relation rel,
-                                                                int *num_parted, List **leaf_part_oids);
-static void get_partition_dispatch_recurse(Relation rel, Relation parent,
-                                                          List **pds, List **leaf_part_oids);
+/* struct to hold result relations coming from UPDATE subplans */
+typedef struct SubplanResultRelHashElem
+{
+       Oid             relid;          /* hash key -- must be first */
+       ResultRelInfo *rri;
+} SubplanResultRelHashElem;
+
+
+static void ExecHashSubPlanResultRelsByOid(ModifyTableState *mtstate,
+                                                          PartitionTupleRouting *proute);
+static ResultRelInfo *ExecInitPartitionInfo(ModifyTableState *mtstate,
+                                         EState *estate, PartitionTupleRouting *proute,
+                                         PartitionDispatch dispatch,
+                                         ResultRelInfo *rootResultRelInfo,
+                                         int partidx);
+static void ExecInitRoutingInfo(ModifyTableState *mtstate,
+                                       EState *estate,
+                                       PartitionTupleRouting *proute,
+                                       PartitionDispatch dispatch,
+                                       ResultRelInfo *partRelInfo,
+                                       int partidx);
+static PartitionDispatch ExecInitPartitionDispatchInfo(PartitionTupleRouting *proute,
+                                                         Oid partoid, PartitionDispatch parent_pd, int partidx);
 static void FormPartitionKeyDatum(PartitionDispatch pd,
                                          TupleTableSlot *slot,
                                          EState *estate,
@@ -92,131 +187,87 @@ static void find_matching_subplans_recurse(PartitionPruningData *prunedata,
  * Note that all the relations in the partition tree are locked using the
  * RowExclusiveLock mode upon return from this function.
  *
- * While we allocate the arrays of pointers of ResultRelInfo and
- * TupleConversionMap for all partitions here, actual objects themselves are
- * lazily allocated for a given partition if a tuple is actually routed to it;
- * see ExecInitPartitionInfo.  However, if the function is invoked for update
- * tuple routing, caller would already have initialized ResultRelInfo's for
- * some of the partitions, which are reused and assigned to their respective
- * slot in the aforementioned array.  For such partitions, we delay setting
- * up objects such as TupleConversionMap until those are actually chosen as
- * the partitions to route tuples to.  See ExecPrepareTupleRouting.
+ * Callers must use the returned PartitionTupleRouting during calls to
+ * ExecFindPartition().  The actual ResultRelInfo for a partition is only
+ * allocated when the partition is found for the first time.
+ *
+ * The current memory context is used to allocate this struct and all
+ * subsidiary structs that will be allocated from it later on.  Typically
+ * it should be estate->es_query_cxt.
  */
 PartitionTupleRouting *
 ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, Relation rel)
 {
-       List       *leaf_parts;
-       ListCell   *cell;
-       int                     i;
-       ResultRelInfo *update_rri = NULL;
-       int                     num_update_rri = 0,
-                               update_rri_index = 0;
        PartitionTupleRouting *proute;
-       int                     nparts;
        ModifyTable *node = mtstate ? (ModifyTable *) mtstate->ps.plan : NULL;
 
+       /* Lock all the partitions. */
+       (void) find_all_inheritors(RelationGetRelid(rel), RowExclusiveLock, NULL);
+
        /*
-        * Get the information about the partition tree after locking all the
-        * partitions.
+        * Here we attempt to expend as little effort as possible in setting up
+        * the PartitionTupleRouting.  Each partition's ResultRelInfo is built on
+        * demand, only when we actually need to route a tuple to that partition.
+        * The reason for this is that a common case is for INSERT to insert a
+        * single tuple into a partitioned table and this must be fast.
         */
-       (void) find_all_inheritors(RelationGetRelid(rel), RowExclusiveLock, NULL);
        proute = (PartitionTupleRouting *) palloc0(sizeof(PartitionTupleRouting));
-       proute->partition_dispatch_info =
-               RelationGetPartitionDispatchInfo(rel, &proute->num_dispatch,
-                                                                                &leaf_parts);
-       proute->num_partitions = nparts = list_length(leaf_parts);
-       proute->partitions =
-               (ResultRelInfo **) palloc(nparts * sizeof(ResultRelInfo *));
-       proute->parent_child_tupconv_maps =
-               (TupleConversionMap **) palloc0(nparts * sizeof(TupleConversionMap *));
-       proute->partition_oids = (Oid *) palloc(nparts * sizeof(Oid));
-
-       /* Set up details specific to the type of tuple routing we are doing. */
-       if (node && node->operation == CMD_UPDATE)
-       {
-               update_rri = mtstate->resultRelInfo;
-               num_update_rri = list_length(node->plans);
-               proute->subplan_partition_offsets =
-                       palloc(num_update_rri * sizeof(int));
-               proute->num_subplan_partition_offsets = num_update_rri;
-
-               /*
-                * We need an additional tuple slot for storing transient tuples that
-                * are converted to the root table descriptor.
-                */
-               proute->root_tuple_slot = MakeTupleTableSlot(RelationGetDescr(rel),
-                                                                                                        &TTSOpsHeapTuple);
-       }
-
-       i = 0;
-       foreach(cell, leaf_parts)
-       {
-               ResultRelInfo *leaf_part_rri = NULL;
-               Oid                     leaf_oid = lfirst_oid(cell);
-
-               proute->partition_oids[i] = leaf_oid;
-
-               /*
-                * If the leaf partition is already present in the per-subplan result
-                * rels, we re-use that rather than initialize a new result rel. The
-                * per-subplan resultrels and the resultrels of the leaf partitions
-                * are both in the same canonical order. So while going through the
-                * leaf partition oids, we need to keep track of the next per-subplan
-                * result rel to be looked for in the leaf partition resultrels.
-                */
-               if (update_rri_index < num_update_rri &&
-                       RelationGetRelid(update_rri[update_rri_index].ri_RelationDesc) == leaf_oid)
-               {
-                       leaf_part_rri = &update_rri[update_rri_index];
-
-                       /*
-                        * This is required in order to convert the partition's tuple to
-                        * be compatible with the root partitioned table's tuple
-                        * descriptor.  When generating the per-subplan result rels, this
-                        * was not set.
-                        */
-                       leaf_part_rri->ri_PartitionRoot = rel;
-
-                       /* Remember the subplan offset for this ResultRelInfo */
-                       proute->subplan_partition_offsets[update_rri_index] = i;
+       proute->partition_root = rel;
+       proute->memcxt = CurrentMemoryContext;
+       /* Rest of members initialized by zeroing */
 
-                       update_rri_index++;
-               }
-
-               proute->partitions[i] = leaf_part_rri;
-               i++;
-       }
+       /*
+        * Initialize this table's PartitionDispatch object.  Here we pass in the
+        * parent as NULL as we don't need to care about any parent of the target
+        * partitioned table.
+        */
+       ExecInitPartitionDispatchInfo(proute, RelationGetRelid(rel), NULL, 0);
 
        /*
-        * For UPDATE, we should have found all the per-subplan resultrels in the
-        * leaf partitions.  (If this is an INSERT, both values will be zero.)
+        * If performing an UPDATE with tuple routing, we can reuse partition
+        * sub-plan result rels.  We build a hash table to map the OIDs of
+        * partitions present in mtstate->resultRelInfo to their ResultRelInfos.
+        * Every time a tuple is routed to a partition that we've yet to set the
+        * ResultRelInfo for, before we go to the trouble of making one, we check
+        * for a pre-made one in the hash table.
         */
-       Assert(update_rri_index == num_update_rri);
+       if (node && node->operation == CMD_UPDATE)
+               ExecHashSubPlanResultRelsByOid(mtstate, proute);
 
        return proute;
 }
 
 /*
- * ExecFindPartition -- Find a leaf partition in the partition tree rooted
- * at parent, for the heap tuple contained in *slot
+ * ExecFindPartition -- Return the ResultRelInfo for the leaf partition that
+ * the tuple contained in *slot should belong to.
+ *
+ * If the partition's ResultRelInfo does not yet exist in 'proute' then we set
+ * one up or reuse one from mtstate's resultRelInfo array.  When reusing a
+ * ResultRelInfo from the mtstate we verify that the relation is a valid
+ * target for INSERTs and then set up a PartitionRoutingInfo for it.
+ *
+ * rootResultRelInfo is the relation named in the query.
  *
  * estate must be non-NULL; we'll need it to compute any expressions in the
- * partition key(s)
+ * partition keys.  Also, its per-tuple contexts are used as evaluation
+ * scratch space.
  *
  * If no leaf partition is found, this routine errors out with the appropriate
- * error message, else it returns the leaf partition sequence number
- * as an index into the array of (ResultRelInfos of) all leaf partitions in
- * the partition tree.
+ * error message.  An error may also be raised if the found target partition is
+ * not a valid target for an INSERT.
  */
-int
-ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd,
+ResultRelInfo *
+ExecFindPartition(ModifyTableState *mtstate,
+                                 ResultRelInfo *rootResultRelInfo,
+                                 PartitionTupleRouting *proute,
                                  TupleTableSlot *slot, EState *estate)
 {
-       int                     result;
+       PartitionDispatch *pd = proute->partition_dispatch_info;
        Datum           values[PARTITION_MAX_KEYS];
        bool            isnull[PARTITION_MAX_KEYS];
        Relation        rel;
        PartitionDispatch dispatch;
+       PartitionDesc partdesc;
        ExprContext *ecxt = GetPerTupleExprContext(estate);
        TupleTableSlot *ecxt_scantuple_old = ecxt->ecxt_scantuple;
        TupleTableSlot *myslot = NULL;
@@ -229,25 +280,31 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd,
         * First check the root table's partition constraint, if any.  No point in
         * routing the tuple if it doesn't belong in the root table itself.
         */
-       if (resultRelInfo->ri_PartitionCheck)
-               ExecPartitionCheck(resultRelInfo, slot, estate, true);
+       if (rootResultRelInfo->ri_PartitionCheck)
+               ExecPartitionCheck(rootResultRelInfo, slot, estate, true);
 
        /* start with the root partitioned table */
        dispatch = pd[0];
        while (true)
        {
                AttrNumber *map = dispatch->tupmap;
-               int                     cur_index = -1;
+               int                     partidx = -1;
+
+               CHECK_FOR_INTERRUPTS();
 
                rel = dispatch->reldesc;
+               partdesc = dispatch->partdesc;
 
                /*
                 * Convert the tuple to this parent's layout, if different from the
                 * current relation.
                 */
                myslot = dispatch->tupslot;
-               if (myslot != NULL && map != NULL)
+               if (myslot != NULL)
+               {
+                       Assert(map != NULL);
                        slot = execute_attr_map_slot(map, slot, myslot);
+               }
 
                /*
                 * Extract partition key from tuple. Expression evaluation machinery
@@ -261,97 +318,196 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd,
                FormPartitionKeyDatum(dispatch, slot, estate, values, isnull);
 
                /*
-                * Nothing for get_partition_for_tuple() to do if there are no
-                * partitions to begin with.
+                * If this partitioned table has no partitions or no partition for
+                * these values, error out.
                 */
-               if (dispatch->partdesc->nparts == 0)
+               if (partdesc->nparts == 0 ||
+                       (partidx = get_partition_for_tuple(dispatch, values, isnull)) < 0)
                {
-                       result = -1;
-                       break;
+                       char       *val_desc;
+
+                       val_desc = ExecBuildSlotPartitionKeyDescription(rel,
+                                                                                                                       values, isnull, 64);
+                       Assert(OidIsValid(RelationGetRelid(rel)));
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_CHECK_VIOLATION),
+                                        errmsg("no partition of relation \"%s\" found for row",
+                                                       RelationGetRelationName(rel)),
+                                        val_desc ?
+                                        errdetail("Partition key of the failing row contains %s.",
+                                                          val_desc) : 0));
                }
 
-               cur_index = get_partition_for_tuple(dispatch, values, isnull);
-
-               /*
-                * cur_index < 0 means we failed to find a partition of this parent.
-                * cur_index >= 0 means we either found the leaf partition, or the
-                * next parent to find a partition of.
-                */
-               if (cur_index < 0)
+               if (partdesc->is_leaf[partidx])
                {
-                       result = -1;
-                       break;
-               }
-               else if (dispatch->indexes[cur_index] >= 0)
-               {
-                       result = dispatch->indexes[cur_index];
-                       /* success! */
-                       break;
+                       ResultRelInfo *rri;
+
+                       /*
+                        * Look to see if we've already got a ResultRelInfo for this
+                        * partition.
+                        */
+                       if (likely(dispatch->indexes[partidx] >= 0))
+                       {
+                               /* ResultRelInfo already built */
+                               Assert(dispatch->indexes[partidx] < proute->num_partitions);
+                               rri = proute->partitions[dispatch->indexes[partidx]];
+                       }
+                       else
+                       {
+                               bool            found = false;
+
+                               /*
+                                * We have not yet set up a ResultRelInfo for this partition,
+                                * but if we have a subplan hash table, we might have one
+                                * there.  If not, we'll have to create one.
+                                */
+                               if (proute->subplan_resultrel_htab)
+                               {
+                                       Oid                     partoid = partdesc->oids[partidx];
+                                       SubplanResultRelHashElem   *elem;
+
+                                       elem = hash_search(proute->subplan_resultrel_htab,
+                                                                          &partoid, HASH_FIND, NULL);
+                                       if (elem)
+                                       {
+                                               found = true;
+                                               rri = elem->rri;
+
+                                               /* Verify this ResultRelInfo allows INSERTs */
+                                               CheckValidResultRel(rri, CMD_INSERT);
+
+                                               /* Set up the PartitionRoutingInfo for it */
+                                               ExecInitRoutingInfo(mtstate, estate, proute, dispatch,
+                                                                                       rri, partidx);
+                                       }
+                               }
+
+                               /* We need to create a new one. */
+                               if (!found)
+                                       rri = ExecInitPartitionInfo(mtstate, estate, proute,
+                                                                                               dispatch,
+                                                                                               rootResultRelInfo, partidx);
+                       }
+
+                       /* Release the tuple in the lowest parent's dedicated slot. */
+                       if (slot == myslot)
+                               ExecClearTuple(myslot);
+
+                       MemoryContextSwitchTo(oldcxt);
+                       ecxt->ecxt_scantuple = ecxt_scantuple_old;
+                       return rri;
                }
                else
                {
-                       /* move down one level */
-                       dispatch = pd[-dispatch->indexes[cur_index]];
+                       /*
+                        * Partition is a sub-partitioned table; get the PartitionDispatch
+                        */
+                       if (likely(dispatch->indexes[partidx] >= 0))
+                       {
+                               /* Already built. */
+                               Assert(dispatch->indexes[partidx] < proute->num_dispatch);
+
+                               /*
+                                * Move down to the next partition level and search again
+                                * until we find a leaf partition that matches this tuple
+                                */
+                               dispatch = pd[dispatch->indexes[partidx]];
+                       }
+                       else
+                       {
+                               /* Not yet built. Do that now. */
+                               PartitionDispatch subdispatch;
+
+                               /*
+                                * Create the new PartitionDispatch.  We pass the current one
+                                * in as the parent PartitionDispatch
+                                */
+                               subdispatch = ExecInitPartitionDispatchInfo(proute,
+                                                                                                                       partdesc->oids[partidx],
+                                                                                                                       dispatch, partidx);
+                               Assert(dispatch->indexes[partidx] >= 0 &&
+                                          dispatch->indexes[partidx] < proute->num_dispatch);
+                               dispatch = subdispatch;
+                       }
                }
        }
+}
+
+/*
+ * ExecHashSubPlanResultRelsByOid
+ *             Build a hash table to allow fast lookups of subplan ResultRelInfos by
+ *             partition Oid.  We also populate the subplan ResultRelInfo with an
+ *             ri_PartitionRoot.
+ */
+static void
+ExecHashSubPlanResultRelsByOid(ModifyTableState *mtstate,
+                                                          PartitionTupleRouting *proute)
+{
+       HASHCTL         ctl;
+       HTAB       *htab;
+       int                     i;
+
+       memset(&ctl, 0, sizeof(ctl));
+       ctl.keysize = sizeof(Oid);
+       ctl.entrysize = sizeof(SubplanResultRelHashElem);
+       ctl.hcxt = CurrentMemoryContext;
 
-       /* Release the tuple in the lowest parent's dedicated slot. */
-       if (slot == myslot)
-               ExecClearTuple(myslot);
+       htab = hash_create("PartitionTupleRouting table", mtstate->mt_nplans,
+                                          &ctl, HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+       proute->subplan_resultrel_htab = htab;
 
-       /* A partition was not found. */
-       if (result < 0)
+       /* Hash all subplans by their Oid */
+       for (i = 0; i < mtstate->mt_nplans; i++)
        {
-               char       *val_desc;
-
-               val_desc = ExecBuildSlotPartitionKeyDescription(rel,
-                                                                                                               values, isnull, 64);
-               Assert(OidIsValid(RelationGetRelid(rel)));
-               ereport(ERROR,
-                               (errcode(ERRCODE_CHECK_VIOLATION),
-                                errmsg("no partition of relation \"%s\" found for row",
-                                               RelationGetRelationName(rel)),
-                                val_desc ? errdetail("Partition key of the failing row contains %s.", val_desc) : 0));
-       }
+               ResultRelInfo *rri = &mtstate->resultRelInfo[i];
+               bool            found;
+               Oid                     partoid = RelationGetRelid(rri->ri_RelationDesc);
+               SubplanResultRelHashElem   *elem;
 
-       MemoryContextSwitchTo(oldcxt);
-       ecxt->ecxt_scantuple = ecxt_scantuple_old;
+               elem = (SubplanResultRelHashElem *)
+                       hash_search(htab, &partoid, HASH_ENTER, &found);
+               Assert(!found);
+               elem->rri = rri;
 
-       return result;
+               /*
+                * This is required in order to convert the partition's tuple to be
+                * compatible with the root partitioned table's tuple descriptor. When
+                * generating the per-subplan result rels, this was not set.
+                */
+               rri->ri_PartitionRoot = proute->partition_root;
+       }
 }
 
 /*
  * ExecInitPartitionInfo
  *             Initialize ResultRelInfo and other information for a partition
+ *             and store it in the next empty slot in the proute->partitions array.
  *
  * Returns the ResultRelInfo
  */
-ResultRelInfo *
-ExecInitPartitionInfo(ModifyTableState *mtstate,
-                                         ResultRelInfo *resultRelInfo,
+static ResultRelInfo *
+ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate,
                                          PartitionTupleRouting *proute,
-                                         EState *estate, int partidx)
+                                         PartitionDispatch dispatch,
+                                         ResultRelInfo *rootResultRelInfo,
+                                         int partidx)
 {
        ModifyTable *node = (ModifyTable *) mtstate->ps.plan;
-       Relation        rootrel = resultRelInfo->ri_RelationDesc,
+       Relation        rootrel = rootResultRelInfo->ri_RelationDesc,
                                partrel;
        Relation        firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc;
        ResultRelInfo *leaf_part_rri;
-       MemoryContext oldContext;
+       MemoryContext oldcxt;
        AttrNumber *part_attnos = NULL;
        bool            found_whole_row;
 
+       oldcxt = MemoryContextSwitchTo(proute->memcxt);
+
        /*
         * We locked all the partitions in ExecSetupPartitionTupleRouting
         * including the leaf partitions.
         */
-       partrel = heap_open(proute->partition_oids[partidx], NoLock);
-
-       /*
-        * Keep ResultRelInfo and other information for this partition in the
-        * per-query memory context so they'll survive throughout the query.
-        */
-       oldContext = MemoryContextSwitchTo(estate->es_query_cxt);
+       partrel = heap_open(dispatch->partdesc->oids[partidx], NoLock);
 
        leaf_part_rri = makeNode(ResultRelInfo);
        InitResultRelInfo(leaf_part_rri,
@@ -367,18 +523,6 @@ ExecInitPartitionInfo(ModifyTableState *mtstate,
         */
        CheckValidResultRel(leaf_part_rri, CMD_INSERT);
 
-       /*
-        * Since we've just initialized this ResultRelInfo, it's not in any list
-        * attached to the estate as yet.  Add it, so that it can be found later.
-        *
-        * Note that the entries in this list appear in no predetermined order,
-        * because partition result rels are initialized as and when they're
-        * needed.
-        */
-       estate->es_tuple_routing_result_relations =
-               lappend(estate->es_tuple_routing_result_relations,
-                               leaf_part_rri);
-
        /*
         * Open partition indices.  The user may have asked to check for conflicts
         * within this leaf partition and do "nothing" instead of throwing an
@@ -522,14 +666,14 @@ ExecInitPartitionInfo(ModifyTableState *mtstate,
        }
 
        /* Set up information needed for routing tuples to the partition. */
-       ExecInitRoutingInfo(mtstate, estate, proute, leaf_part_rri, partidx);
+       ExecInitRoutingInfo(mtstate, estate, proute, dispatch,
+                                               leaf_part_rri, partidx);
 
        /*
         * If there is an ON CONFLICT clause, initialize state for it.
         */
        if (node && node->onConflictAction != ONCONFLICT_NONE)
        {
-               TupleConversionMap *map = proute->parent_child_tupconv_maps[partidx];
                int                     firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex;
                TupleDesc       partrelDesc = RelationGetDescr(partrel);
                ExprContext *econtext = mtstate->ps.ps_ExprContext;
@@ -542,7 +686,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate,
                 * list and searching for ancestry relationships to each index in the
                 * ancestor table.
                 */
-               if (list_length(resultRelInfo->ri_onConflictArbiterIndexes) > 0)
+               if (list_length(rootResultRelInfo->ri_onConflictArbiterIndexes) > 0)
                {
                        List       *childIdxs;
 
@@ -555,7 +699,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate,
                                ListCell   *lc2;
 
                                ancestors = get_partition_ancestors(childIdx);
-                               foreach(lc2, resultRelInfo->ri_onConflictArbiterIndexes)
+                               foreach(lc2, rootResultRelInfo->ri_onConflictArbiterIndexes)
                                {
                                        if (list_member_oid(ancestors, lfirst_oid(lc2)))
                                                arbiterIndexes = lappend_oid(arbiterIndexes, childIdx);
@@ -569,7 +713,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate,
                 * (This shouldn't happen, since arbiter index selection should not
                 * pick up an invalid index.)
                 */
-               if (list_length(resultRelInfo->ri_onConflictArbiterIndexes) !=
+               if (list_length(rootResultRelInfo->ri_onConflictArbiterIndexes) !=
                        list_length(arbiterIndexes))
                        elog(ERROR, "invalid arbiter index list");
                leaf_part_rri->ri_onConflictArbiterIndexes = arbiterIndexes;
@@ -579,8 +723,12 @@ ExecInitPartitionInfo(ModifyTableState *mtstate,
                 */
                if (node->onConflictAction == ONCONFLICT_UPDATE)
                {
+                       TupleConversionMap *map;
+
+                       map = leaf_part_rri->ri_PartitionInfo->pi_RootToPartitionMap;
+
                        Assert(node->onConflictSet != NIL);
-                       Assert(resultRelInfo->ri_onConflict != NULL);
+                       Assert(rootResultRelInfo->ri_onConflict != NULL);
 
                        /*
                         * If the partition's tuple descriptor matches exactly the root
@@ -589,7 +737,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate,
                         * need to create state specific to this partition.
                         */
                        if (map == NULL)
-                               leaf_part_rri->ri_onConflict = resultRelInfo->ri_onConflict;
+                               leaf_part_rri->ri_onConflict = rootResultRelInfo->ri_onConflict;
                        else
                        {
                                List       *onconflset;
@@ -680,37 +828,51 @@ ExecInitPartitionInfo(ModifyTableState *mtstate,
                }
        }
 
-       Assert(proute->partitions[partidx] == NULL);
-       proute->partitions[partidx] = leaf_part_rri;
+       /*
+        * Since we've just initialized this ResultRelInfo, it's not in any list
+        * attached to the estate as yet.  Add it, so that it can be found later.
+        *
+        * Note that the entries in this list appear in no predetermined order,
+        * because partition result rels are initialized as and when they're
+        * needed.
+        */
+       MemoryContextSwitchTo(estate->es_query_cxt);
+       estate->es_tuple_routing_result_relations =
+               lappend(estate->es_tuple_routing_result_relations,
+                               leaf_part_rri);
 
-       MemoryContextSwitchTo(oldContext);
+       MemoryContextSwitchTo(oldcxt);
 
        return leaf_part_rri;
 }
 
 /*
  * ExecInitRoutingInfo
- *             Set up information needed for routing tuples to a leaf partition
+ *             Set up information needed for translating tuples between root
+ *             partitioned table format and partition format, and keep track of it
+ *             in PartitionTupleRouting.
  */
-void
+static void
 ExecInitRoutingInfo(ModifyTableState *mtstate,
                                        EState *estate,
                                        PartitionTupleRouting *proute,
+                                       PartitionDispatch dispatch,
                                        ResultRelInfo *partRelInfo,
                                        int partidx)
 {
-       MemoryContext oldContext;
+       MemoryContext oldcxt;
+       PartitionRoutingInfo *partrouteinfo;
+       int             rri_index;
 
-       /*
-        * Switch into per-query memory context.
-        */
-       oldContext = MemoryContextSwitchTo(estate->es_query_cxt);
+       oldcxt = MemoryContextSwitchTo(proute->memcxt);
+
+       partrouteinfo = palloc(sizeof(PartitionRoutingInfo));
 
        /*
         * Set up a tuple conversion map to convert a tuple routed to the
         * partition from the parent's type to the partition's.
         */
-       proute->parent_child_tupconv_maps[partidx] =
+       partrouteinfo->pi_RootToPartitionMap =
                convert_tuples_by_name(RelationGetDescr(partRelInfo->ri_PartitionRoot),
                                                           RelationGetDescr(partRelInfo->ri_RelationDesc),
                                                           gettext_noop("could not convert row type"));
@@ -721,29 +883,36 @@ ExecInitRoutingInfo(ModifyTableState *mtstate,
         * for various operations that are applied to tuples after routing, such
         * as checking constraints.
         */
-       if (proute->parent_child_tupconv_maps[partidx] != NULL)
+       if (partrouteinfo->pi_RootToPartitionMap != NULL)
        {
                Relation        partrel = partRelInfo->ri_RelationDesc;
 
-               /*
-                * Initialize the array in proute where these slots are stored, if not
-                * already done.
-                */
-               if (proute->partition_tuple_slots == NULL)
-                       proute->partition_tuple_slots = (TupleTableSlot **)
-                               palloc0(proute->num_partitions *
-                                               sizeof(TupleTableSlot *));
-
                /*
                 * Initialize the slot itself setting its descriptor to this
                 * partition's TupleDesc; TupleDesc reference will be released at the
                 * end of the command.
                 */
-               proute->partition_tuple_slots[partidx] =
-                       ExecInitExtraTupleSlot(estate,
-                                                                  RelationGetDescr(partrel),
+               partrouteinfo->pi_PartitionTupleSlot =
+                       ExecInitExtraTupleSlot(estate, RelationGetDescr(partrel),
                                                                   &TTSOpsHeapTuple);
        }
+       else
+               partrouteinfo->pi_PartitionTupleSlot = NULL;
+
+       /*
+        * Also, if transition capture is required, store a map to convert tuples
+        * from partition's rowtype to the root partition table's.
+        */
+       if (mtstate &&
+               (mtstate->mt_transition_capture || mtstate->mt_oc_transition_capture))
+       {
+               partrouteinfo->pi_PartitionToRootMap =
+                       convert_tuples_by_name(RelationGetDescr(partRelInfo->ri_RelationDesc),
+                                                                  RelationGetDescr(partRelInfo->ri_PartitionRoot),
+                                                                  gettext_noop("could not convert row type"));
+       }
+       else
+               partrouteinfo->pi_PartitionToRootMap = NULL;
 
        /*
         * If the partition is a foreign table, let the FDW init itself for
@@ -753,73 +922,138 @@ ExecInitRoutingInfo(ModifyTableState *mtstate,
                partRelInfo->ri_FdwRoutine->BeginForeignInsert != NULL)
                partRelInfo->ri_FdwRoutine->BeginForeignInsert(mtstate, partRelInfo);
 
-       MemoryContextSwitchTo(oldContext);
-
-       partRelInfo->ri_PartitionReadyForRouting = true;
-}
-
-/*
- * ExecSetupChildParentMapForLeaf -- Initialize the per-leaf-partition
- * child-to-root tuple conversion map array.
- *
- * This map is required for capturing transition tuples when the target table
- * is a partitioned table. For a tuple that is routed by an INSERT or UPDATE,
- * we need to convert it from the leaf partition to the target table
- * descriptor.
- */
-void
-ExecSetupChildParentMapForLeaf(PartitionTupleRouting *proute)
-{
-       Assert(proute != NULL);
+       partRelInfo->ri_PartitionInfo = partrouteinfo;
 
        /*
-        * These array elements get filled up with maps on an on-demand basis.
-        * Initially just set all of them to NULL.
+        * Keep track of it in the PartitionTupleRouting->partitions array.
         */
-       proute->child_parent_tupconv_maps =
-               (TupleConversionMap **) palloc0(sizeof(TupleConversionMap *) *
-                                                                               proute->num_partitions);
+       Assert(dispatch->indexes[partidx] == -1);
+
+       rri_index = proute->num_partitions++;
+
+       /* Allocate or enlarge the array, as needed */
+       if (proute->num_partitions >= proute->max_partitions)
+       {
+               if (proute->max_partitions == 0)
+               {
+                       proute->max_partitions = 8;
+                       proute->partitions = (ResultRelInfo **)
+                               palloc(sizeof(ResultRelInfo *) * proute->max_partitions);
+               }
+               else
+               {
+                       proute->max_partitions *= 2;
+                       proute->partitions = (ResultRelInfo **)
+                               repalloc(proute->partitions, sizeof(ResultRelInfo *) *
+                                                proute->max_partitions);
+               }
+       }
 
-       /* Same is the case for this array. All the values are set to false */
-       proute->child_parent_map_not_required =
-               (bool *) palloc0(sizeof(bool) * proute->num_partitions);
+       proute->partitions[rri_index] = partRelInfo;
+       dispatch->indexes[partidx] = rri_index;
+
+       MemoryContextSwitchTo(oldcxt);
 }
 
 /*
- * TupConvMapForLeaf -- Get the tuple conversion map for a given leaf partition
- * index.
+ * ExecInitPartitionDispatchInfo
+ *             Initialize PartitionDispatch for a partitioned table and store it in
+ *             the next available slot in the proute->partition_dispatch_info array.
+ *             Also, record that index in element partidx of the
+ *             parent_pd->indexes[] array so that we can properly retrieve the
+ *             newly created PartitionDispatch later.
  */
-TupleConversionMap *
-TupConvMapForLeaf(PartitionTupleRouting *proute,
-                                 ResultRelInfo *rootRelInfo, int leaf_index)
+static PartitionDispatch
+ExecInitPartitionDispatchInfo(PartitionTupleRouting *proute, Oid partoid,
+                                                         PartitionDispatch parent_pd, int partidx)
 {
-       ResultRelInfo **resultRelInfos = proute->partitions;
-       TupleConversionMap **map;
-       TupleDesc       tupdesc;
+       Relation        rel;
+       PartitionDesc partdesc;
+       PartitionDispatch pd;
+       int                     dispatchidx;
+       MemoryContext oldcxt;
 
-       /* Don't call this if we're not supposed to be using this type of map. */
-       Assert(proute->child_parent_tupconv_maps != NULL);
+       oldcxt = MemoryContextSwitchTo(proute->memcxt);
 
-       /* If it's already known that we don't need a map, return NULL. */
-       if (proute->child_parent_map_not_required[leaf_index])
-               return NULL;
+       if (partoid != RelationGetRelid(proute->partition_root))
+               rel = heap_open(partoid, NoLock);
+       else
+               rel = proute->partition_root;
+       partdesc = RelationGetPartitionDesc(rel);
 
-       /* If we've already got a map, return it. */
-       map = &proute->child_parent_tupconv_maps[leaf_index];
-       if (*map != NULL)
-               return *map;
+       pd = (PartitionDispatch) palloc(offsetof(PartitionDispatchData, indexes) +
+                                                                       partdesc->nparts * sizeof(int));
+       pd->reldesc = rel;
+       pd->key = RelationGetPartitionKey(rel);
+       pd->keystate = NIL;
+       pd->partdesc = partdesc;
+       if (parent_pd != NULL)
+       {
+               TupleDesc       tupdesc = RelationGetDescr(rel);
 
-       /* No map yet; try to create one. */
-       tupdesc = RelationGetDescr(resultRelInfos[leaf_index]->ri_RelationDesc);
-       *map =
-               convert_tuples_by_name(tupdesc,
-                                                          RelationGetDescr(rootRelInfo->ri_RelationDesc),
-                                                          gettext_noop("could not convert row type"));
+               /*
+                * For a sub-partitioned table whose column order differs from that
+                * of its direct parent partitioned table, we must store a tuple table slot
+                * initialized with its tuple descriptor and a tuple conversion map to
+                * convert a tuple from its parent's rowtype to its own.  This is to
+                * make sure that we are looking at the correct row using the correct
+                * tuple descriptor when computing its partition key for tuple
+                * routing.
+                */
+               pd->tupmap = convert_tuples_by_name_map_if_req(RelationGetDescr(parent_pd->reldesc),
+                                                                                                          tupdesc,
+                                                                                                          gettext_noop("could not convert row type"));
+               pd->tupslot = pd->tupmap ?
+                       MakeSingleTupleTableSlot(tupdesc, &TTSOpsHeapTuple) : NULL;
+       }
+       else
+       {
+               /* Not required for the root partitioned table */
+               pd->tupmap = NULL;
+               pd->tupslot = NULL;
+       }
 
-       /* If it turns out no map is needed, remember for next time. */
-       proute->child_parent_map_not_required[leaf_index] = (*map == NULL);
+       /*
+        * Initialize with -1 to signify that the corresponding partition's
+        * ResultRelInfo or PartitionDispatch has not been created yet.
+        */
+       memset(pd->indexes, -1, sizeof(int) * partdesc->nparts);
+
+       /* Track in PartitionTupleRouting for later use */
+       dispatchidx = proute->num_dispatch++;
+
+       /* Allocate or enlarge the array, as needed */
+       if (proute->num_dispatch >= proute->max_dispatch)
+       {
+               if (proute->max_dispatch == 0)
+               {
+                       proute->max_dispatch = 4;
+                       proute->partition_dispatch_info = (PartitionDispatch *)
+                               palloc(sizeof(PartitionDispatch) * proute->max_dispatch);
+               }
+               else
+               {
+                       proute->max_dispatch *= 2;
+                       proute->partition_dispatch_info = (PartitionDispatch *)
+                               repalloc(proute->partition_dispatch_info,
+                                                sizeof(PartitionDispatch) * proute->max_dispatch);
+               }
+       }
+       proute->partition_dispatch_info[dispatchidx] = pd;
 
-       return *map;
+       /*
+        * Finally, if setting up a PartitionDispatch for a sub-partitioned table,
+        * install a downlink in the parent to allow quick descent.
+        */
+       if (parent_pd)
+       {
+               Assert(parent_pd->indexes[partidx] == -1);
+               parent_pd->indexes[partidx] = dispatchidx;
+       }
+
+       MemoryContextSwitchTo(oldcxt);
+
+       return pd;
 }
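
Note how both per-partition arrays are now populated strictly on demand: every pd->indexes[] slot starts out as -1 and is only replaced by a real array offset, plus a downlink in the parent dispatch, once the first tuple is actually routed that way. A toy model of this sentinel convention, with hypothetical names and no backend dependencies, purely to illustrate the on-demand build:

#include <stdio.h>
#include <string.h>

#define NPARTS 4

typedef struct
{
    int         indexes[NPARTS];    /* -1 means "state not built yet" */
    int         nbuilt;             /* how many partitions have state */
    const char *state[NPARTS];      /* lazily built per-partition "state" */
} ToyDispatch;

static void
toy_init(ToyDispatch *d)
{
    /* same trick as the patch: all-ones bytes read back as -1 ints */
    memset(d->indexes, -1, sizeof(d->indexes));
    d->nbuilt = 0;
}

/* Return the state for partition 'partidx', building it on first use. */
static const char *
toy_route(ToyDispatch *d, int partidx, const char *name)
{
    if (d->indexes[partidx] == -1)
    {
        /* first tuple routed here: build state and install the downlink */
        d->indexes[partidx] = d->nbuilt;
        d->state[d->nbuilt++] = name;
    }
    return d->state[d->indexes[partidx]];
}

int
main(void)
{
    ToyDispatch d;

    toy_init(&d);
    printf("%s\n", toy_route(&d, 2, "part_c"));     /* builds slot 0 */
    printf("%s\n", toy_route(&d, 2, "part_c"));     /* reuses slot 0 */
    printf("%s\n", toy_route(&d, 0, "part_a"));     /* builds slot 1 */
    printf("built %d of %d partitions\n", d.nbuilt, NPARTS);    /* built 2 of 4 */
    return 0;
}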
 
 /*
@@ -832,8 +1066,8 @@ void
 ExecCleanupTupleRouting(ModifyTableState *mtstate,
                                                PartitionTupleRouting *proute)
 {
+       HTAB       *htab = proute->subplan_resultrel_htab;
        int                     i;
-       int                     subplan_index = 0;
 
        /*
         * Remember, proute->partition_dispatch_info[0] corresponds to the root
@@ -847,187 +1081,40 @@ ExecCleanupTupleRouting(ModifyTableState *mtstate,
                PartitionDispatch pd = proute->partition_dispatch_info[i];
 
                heap_close(pd->reldesc, NoLock);
-               ExecDropSingleTupleTableSlot(pd->tupslot);
+
+               if (pd->tupslot)
+                       ExecDropSingleTupleTableSlot(pd->tupslot);
        }
 
        for (i = 0; i < proute->num_partitions; i++)
        {
                ResultRelInfo *resultRelInfo = proute->partitions[i];
 
-               /* skip further processing for uninitialized partitions */
-               if (resultRelInfo == NULL)
-                       continue;
+               /*
+                * Check if this result rel is one belonging to the node's subplans;
+                * if so, let ExecEndPlan() clean it up.
+                */
+               if (htab)
+               {
+                       Oid                     partoid;
+                       bool            found;
+
+                       partoid = RelationGetRelid(resultRelInfo->ri_RelationDesc);
+
+                       (void) hash_search(htab, &partoid, HASH_FIND, &found);
+                       if (found)
+                               continue;
+               }
 
                /* Allow any FDWs to shut down if they've been exercised */
-               if (resultRelInfo->ri_PartitionReadyForRouting &&
-                       resultRelInfo->ri_FdwRoutine != NULL &&
+               if (resultRelInfo->ri_FdwRoutine != NULL &&
                        resultRelInfo->ri_FdwRoutine->EndForeignInsert != NULL)
                        resultRelInfo->ri_FdwRoutine->EndForeignInsert(mtstate->ps.state,
                                                                                                                   resultRelInfo);
 
-               /*
-                * If this result rel is one of the UPDATE subplan result rels, let
-                * ExecEndPlan() close it. For INSERT or COPY,
-                * proute->subplan_partition_offsets will always be NULL. Note that
-                * the subplan_partition_offsets array and the partitions array have
-                * the partitions in the same order. So, while we iterate over
-                * partitions array, we also iterate over the
-                * subplan_partition_offsets array in order to figure out which of the
-                * result rels are present in the UPDATE subplans.
-                */
-               if (proute->subplan_partition_offsets &&
-                       subplan_index < proute->num_subplan_partition_offsets &&
-                       proute->subplan_partition_offsets[subplan_index] == i)
-               {
-                       subplan_index++;
-                       continue;
-               }
-
                ExecCloseIndices(resultRelInfo);
                heap_close(resultRelInfo->ri_RelationDesc, NoLock);
        }
-
-       /* Release the standalone partition tuple descriptors, if any */
-       if (proute->root_tuple_slot)
-               ExecDropSingleTupleTableSlot(proute->root_tuple_slot);
-}
-
-/*
- * RelationGetPartitionDispatchInfo
- *             Returns information necessary to route tuples down a partition tree
- *
- * The number of elements in the returned array (that is, the number of
- * PartitionDispatch objects for the partitioned tables in the partition tree)
- * is returned in *num_parted and a list of the OIDs of all the leaf
- * partitions of rel is returned in *leaf_part_oids.
- *
- * All the relations in the partition tree (including 'rel') must have been
- * locked (using at least the AccessShareLock) by the caller.
- */
-static PartitionDispatch *
-RelationGetPartitionDispatchInfo(Relation rel,
-                                                                int *num_parted, List **leaf_part_oids)
-{
-       List       *pdlist = NIL;
-       PartitionDispatchData **pd;
-       ListCell   *lc;
-       int                     i;
-
-       Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
-
-       *num_parted = 0;
-       *leaf_part_oids = NIL;
-
-       get_partition_dispatch_recurse(rel, NULL, &pdlist, leaf_part_oids);
-       *num_parted = list_length(pdlist);
-       pd = (PartitionDispatchData **) palloc(*num_parted *
-                                                                                  sizeof(PartitionDispatchData *));
-       i = 0;
-       foreach(lc, pdlist)
-       {
-               pd[i++] = lfirst(lc);
-       }
-
-       return pd;
-}
-
-/*
- * get_partition_dispatch_recurse
- *             Recursively expand partition tree rooted at rel
- *
- * As the partition tree is expanded in a depth-first manner, we maintain two
- * global lists: of PartitionDispatch objects corresponding to partitioned
- * tables in *pds and of the leaf partition OIDs in *leaf_part_oids.
- *
- * Note that the order of OIDs of leaf partitions in leaf_part_oids matches
- * the order in which the planner's expand_partitioned_rtentry() processes
- * them.  It's not necessarily the case that the offsets match up exactly,
- * because constraint exclusion might prune away some partitions on the
- * planner side, whereas we'll always have the complete list; but unpruned
- * partitions will appear in the same order in the plan as they are returned
- * here.
- */
-static void
-get_partition_dispatch_recurse(Relation rel, Relation parent,
-                                                          List **pds, List **leaf_part_oids)
-{
-       TupleDesc       tupdesc = RelationGetDescr(rel);
-       PartitionDesc partdesc = RelationGetPartitionDesc(rel);
-       PartitionKey partkey = RelationGetPartitionKey(rel);
-       PartitionDispatch pd;
-       int                     i;
-
-       check_stack_depth();
-
-       /* Build a PartitionDispatch for this table and add it to *pds. */
-       pd = (PartitionDispatch) palloc(sizeof(PartitionDispatchData));
-       *pds = lappend(*pds, pd);
-       pd->reldesc = rel;
-       pd->key = partkey;
-       pd->keystate = NIL;
-       pd->partdesc = partdesc;
-       if (parent != NULL)
-       {
-               /*
-                * For every partitioned table other than the root, we must store a
-                * tuple table slot initialized with its tuple descriptor and a tuple
-                * conversion map to convert a tuple from its parent's rowtype to its
-                * own. That is to make sure that we are looking at the correct row
-                * using the correct tuple descriptor when computing its partition key
-                * for tuple routing.
-                */
-               pd->tupslot = MakeSingleTupleTableSlot(tupdesc, &TTSOpsHeapTuple);
-               pd->tupmap = convert_tuples_by_name_map_if_req(RelationGetDescr(parent),
-                                                                                                          tupdesc,
-                                                                                                          gettext_noop("could not convert row type"));
-       }
-       else
-       {
-               /* Not required for the root partitioned table */
-               pd->tupslot = NULL;
-               pd->tupmap = NULL;
-       }
-
-       /*
-        * Go look at each partition of this table.  If it's a leaf partition,
-        * simply add its OID to *leaf_part_oids.  If it's a partitioned table,
-        * recursively call get_partition_dispatch_recurse(), so that its
-        * partitions are processed as well and a corresponding PartitionDispatch
-        * object gets added to *pds.
-        *
-        * The 'indexes' array is used when searching for a partition matching a
-        * given tuple.  The actual value we store here depends on whether the
-        * array element belongs to a leaf partition or a subpartitioned table.
-        * For leaf partitions we store the index into *leaf_part_oids, and for
-        * sub-partitioned tables we store a negative version of the index into
-        * the *pds list.  Both indexes are 0-based, but the first element of the
-        * *pds list is the root partition, so 0 always means the first leaf. When
-        * searching, if we see a negative value, the search must continue in the
-        * corresponding sub-partition; otherwise, we've identified the correct
-        * partition.
-        */
-       pd->indexes = (int *) palloc(partdesc->nparts * sizeof(int));
-       for (i = 0; i < partdesc->nparts; i++)
-       {
-               Oid                     partrelid = partdesc->oids[i];
-
-               if (get_rel_relkind(partrelid) != RELKIND_PARTITIONED_TABLE)
-               {
-                       *leaf_part_oids = lappend_oid(*leaf_part_oids, partrelid);
-                       pd->indexes[i] = list_length(*leaf_part_oids) - 1;
-               }
-               else
-               {
-                       /*
-                        * We assume all tables in the partition tree were already locked
-                        * by the caller.
-                        */
-                       Relation        partrel = heap_open(partrelid, NoLock);
-
-                       pd->indexes[i] = -list_length(*pds);
-                       get_partition_dispatch_recurse(partrel, rel, pds, leaf_part_oids);
-               }
-       }
 }
 
 /* ----------------
index bb344a7070abcfb42d929da811e001828203953f..65d46c8ea8b5004026a987c5a5ce891b9cb4f39d 100644 (file)
@@ -68,7 +68,6 @@ static TupleTableSlot *ExecPrepareTupleRouting(ModifyTableState *mtstate,
                                                ResultRelInfo *targetRelInfo,
                                                TupleTableSlot *slot);
 static ResultRelInfo *getTargetResultRelInfo(ModifyTableState *node);
-static void ExecSetupChildParentMapForTcs(ModifyTableState *mtstate);
 static void ExecSetupChildParentMapForSubplan(ModifyTableState *mtstate);
 static TupleConversionMap *tupconv_map_for_subplan(ModifyTableState *node,
                                                int whichplan);
@@ -1157,7 +1156,8 @@ lreplace:;
                        tupconv_map = tupconv_map_for_subplan(mtstate, map_index);
                        if (tupconv_map != NULL)
                                slot = execute_attr_map_slot(tupconv_map->attrMap,
-                                                                                        slot, proute->root_tuple_slot);
+                                                                                        slot,
+                                                                                        mtstate->mt_root_tuple_slot);
 
                        /*
                         * Prepare for tuple routing, making it look like we're inserting
@@ -1653,7 +1653,7 @@ ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate)
        if (mtstate->mt_transition_capture != NULL ||
                mtstate->mt_oc_transition_capture != NULL)
        {
-               ExecSetupChildParentMapForTcs(mtstate);
+               ExecSetupChildParentMapForSubplan(mtstate);
 
                /*
                 * Install the conversion map for the first plan for UPDATE and DELETE
@@ -1686,52 +1686,21 @@ ExecPrepareTupleRouting(ModifyTableState *mtstate,
                                                TupleTableSlot *slot)
 {
        ModifyTable *node;
-       int                     partidx;
        ResultRelInfo *partrel;
+       PartitionRoutingInfo *partrouteinfo;
        HeapTuple       tuple;
        TupleConversionMap *map;
 
        /*
-        * Determine the target partition.  If ExecFindPartition does not find a
-        * partition after all, it doesn't return here; otherwise, the returned
-        * value is to be used as an index into the arrays for the ResultRelInfo
-        * and TupleConversionMap for the partition.
-        */
-       partidx = ExecFindPartition(targetRelInfo,
-                                                               proute->partition_dispatch_info,
-                                                               slot,
-                                                               estate);
-       Assert(partidx >= 0 && partidx < proute->num_partitions);
-
-       /*
-        * Get the ResultRelInfo corresponding to the selected partition; if not
-        * yet there, initialize it.
+        * Look up the target partition's ResultRelInfo.  If ExecFindPartition does
+        * not find a valid partition for the tuple in 'slot' then an error is
+        * raised.  An error may also be raised if the found partition is not a
+        * valid target for INSERTs.  This is required since an UPDATE that moves
+        * a tuple to another partition becomes a DELETE+INSERT.
         */
-       partrel = proute->partitions[partidx];
-       if (partrel == NULL)
-               partrel = ExecInitPartitionInfo(mtstate, targetRelInfo,
-                                                                               proute, estate,
-                                                                               partidx);
-
-       /*
-        * Check whether the partition is routable if we didn't yet
-        *
-        * Note: an UPDATE of a partition key invokes an INSERT that moves the
-        * tuple to a new partition.  This check would be applied to a subplan
-        * partition of such an UPDATE that is chosen as the partition to route
-        * the tuple to.  The reason we do this check here rather than in
-        * ExecSetupPartitionTupleRouting is to avoid aborting such an UPDATE
-        * unnecessarily due to non-routable subplan partitions that may not be
-        * chosen for update tuple movement after all.
-        */
-       if (!partrel->ri_PartitionReadyForRouting)
-       {
-               /* Verify the partition is a valid target for INSERT. */
-               CheckValidResultRel(partrel, CMD_INSERT);
-
-               /* Set up information needed for routing tuples to the partition. */
-               ExecInitRoutingInfo(mtstate, estate, proute, partrel, partidx);
-       }
+       partrel = ExecFindPartition(mtstate, targetRelInfo, proute, slot, estate);
+       partrouteinfo = partrel->ri_PartitionInfo;
+       Assert(partrouteinfo != NULL);
 
        /*
         * Make it look like we are inserting into the partition.
@@ -1743,7 +1712,7 @@ ExecPrepareTupleRouting(ModifyTableState *mtstate,
 
        /*
         * If we're capturing transition tuples, we might need to convert from the
-        * partition rowtype to parent rowtype.
+        * partition rowtype to the root partitioned table's rowtype.
         */
        if (mtstate->mt_transition_capture != NULL)
        {
@@ -1756,7 +1725,7 @@ ExecPrepareTupleRouting(ModifyTableState *mtstate,
                         */
                        mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL;
                        mtstate->mt_transition_capture->tcs_map =
-                               TupConvMapForLeaf(proute, targetRelInfo, partidx);
+                               partrouteinfo->pi_PartitionToRootMap;
                }
                else
                {
@@ -1771,20 +1740,17 @@ ExecPrepareTupleRouting(ModifyTableState *mtstate,
        if (mtstate->mt_oc_transition_capture != NULL)
        {
                mtstate->mt_oc_transition_capture->tcs_map =
-                       TupConvMapForLeaf(proute, targetRelInfo, partidx);
+                       partrouteinfo->pi_PartitionToRootMap;
        }
 
        /*
         * Convert the tuple, if necessary.
         */
-       map = proute->parent_child_tupconv_maps[partidx];
+       map = partrouteinfo->pi_RootToPartitionMap;
        if (map != NULL)
        {
-               TupleTableSlot *new_slot;
+               TupleTableSlot *new_slot = partrouteinfo->pi_PartitionTupleSlot;
 
-               Assert(proute->partition_tuple_slots != NULL &&
-                          proute->partition_tuple_slots[partidx] != NULL);
-               new_slot = proute->partition_tuple_slots[partidx];
                slot = execute_attr_map_slot(map->attrMap, slot, new_slot);
        }
 
@@ -1822,17 +1788,6 @@ ExecSetupChildParentMapForSubplan(ModifyTableState *mtstate)
        int                     numResultRelInfos = mtstate->mt_nplans;
        int                     i;
 
-       /*
-        * First check if there is already a per-subplan array allocated. Even if
-        * there is already a per-leaf map array, we won't require a per-subplan
-        * one, since we will use the subplan offset array to convert the subplan
-        * index to per-leaf index.
-        */
-       if (mtstate->mt_per_subplan_tupconv_maps ||
-               (mtstate->mt_partition_tuple_routing &&
-                mtstate->mt_partition_tuple_routing->child_parent_tupconv_maps))
-               return;
-
        /*
         * Build array of conversion maps from each child's TupleDesc to the one
         * used in the target relation.  The map pointers may be NULL when no
@@ -1854,79 +1809,18 @@ ExecSetupChildParentMapForSubplan(ModifyTableState *mtstate)
        }
 }
 
-/*
- * Initialize the child-to-root tuple conversion map array required for
- * capturing transition tuples.
- *
- * The map array can be indexed either by subplan index or by leaf-partition
- * index.  For transition tables, we need a subplan-indexed access to the map,
- * and where tuple-routing is present, we also require a leaf-indexed access.
- */
-static void
-ExecSetupChildParentMapForTcs(ModifyTableState *mtstate)
-{
-       PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
-
-       /*
-        * If partition tuple routing is set up, we will require partition-indexed
-        * access. In that case, create the map array indexed by partition; we
-        * will still be able to access the maps using a subplan index by
-        * converting the subplan index to a partition index using
-        * subplan_partition_offsets. If tuple routing is not set up, it means we
-        * don't require partition-indexed access. In that case, create just a
-        * subplan-indexed map.
-        */
-       if (proute)
-       {
-               /*
-                * If a partition-indexed map array is to be created, the subplan map
-                * array has to be NULL.  If the subplan map array is already created,
-                * we won't be able to access the map using a partition index.
-                */
-               Assert(mtstate->mt_per_subplan_tupconv_maps == NULL);
-
-               ExecSetupChildParentMapForLeaf(proute);
-       }
-       else
-               ExecSetupChildParentMapForSubplan(mtstate);
-}
-
 /*
  * For a given subplan index, get the tuple conversion map.
  */
 static TupleConversionMap *
 tupconv_map_for_subplan(ModifyTableState *mtstate, int whichplan)
 {
-       /*
-        * If a partition-index tuple conversion map array is allocated, we need
-        * to first get the index into the partition array. Exactly *one* of the
-        * two arrays is allocated. This is because if there is a partition array
-        * required, we don't require subplan-indexed array since we can translate
-        * subplan index into partition index. And, we create a subplan-indexed
-        * array *only* if partition-indexed array is not required.
-        */
+       /* If nobody else set the per-subplan array of maps, do so ourselves. */
        if (mtstate->mt_per_subplan_tupconv_maps == NULL)
-       {
-               int                     leaf_index;
-               PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
-
-               /*
-                * If subplan-indexed array is NULL, things should have been arranged
-                * to convert the subplan index to partition index.
-                */
-               Assert(proute && proute->subplan_partition_offsets != NULL &&
-                          whichplan < proute->num_subplan_partition_offsets);
-
-               leaf_index = proute->subplan_partition_offsets[whichplan];
+               ExecSetupChildParentMapForSubplan(mtstate);
 
-               return TupConvMapForLeaf(proute, getTargetResultRelInfo(mtstate),
-                                                                leaf_index);
-       }
-       else
-       {
-               Assert(whichplan >= 0 && whichplan < mtstate->mt_nplans);
-               return mtstate->mt_per_subplan_tupconv_maps[whichplan];
-       }
+       Assert(whichplan >= 0 && whichplan < mtstate->mt_nplans);
+       return mtstate->mt_per_subplan_tupconv_maps[whichplan];
 }
 
 /* ----------------------------------------------------------------
@@ -2370,10 +2264,15 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
         * descriptor of a source partition does not match the root partitioned
         * table descriptor.  In such a case we need to convert tuples to the root
         * tuple descriptor, because the search for destination partition starts
-        * from the root.  Skip this setup if it's not a partition key update.
+        * from the root.  We'll also need a slot to store these converted tuples.
+        * We can skip this setup if it's not a partition key update.
         */
        if (update_tuple_routing_needed)
+       {
                ExecSetupChildParentMapForSubplan(mtstate);
+               mtstate->mt_root_tuple_slot = MakeTupleTableSlot(RelationGetDescr(rel),
+                                                                                                                &TTSOpsHeapTuple);
+       }
 
        /*
         * Initialize any WITH CHECK OPTION constraints if needed.
@@ -2716,10 +2615,18 @@ ExecEndModifyTable(ModifyTableState *node)
                                                                                                                   resultRelInfo);
        }
 
-       /* Close all the partitioned tables, leaf partitions, and their indices */
+       /*
+        * Close all the partitioned tables, leaf partitions, and their indices
+        * and release the slot used for tuple routing, if set.
+        */
        if (node->mt_partition_tuple_routing)
+       {
                ExecCleanupTupleRouting(node, node->mt_partition_tuple_routing);
 
+               if (node->mt_root_tuple_slot)
+                       ExecDropSingleTupleTableSlot(node->mt_root_tuple_slot);
+       }
+
        /*
         * Free the exprcontext
         */
index d5720518a8186038bc6a9d23c0043c8ef70999d5..2a1c1cb2e1af81681b8e1b82218423ddba712fd8 100644 (file)
@@ -1657,9 +1657,6 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti)
 /*
  * expand_partitioned_rtentry
  *             Recursively expand an RTE for a partitioned table.
- *
- * Note that RelationGetPartitionDispatchInfo will expand partitions in the
- * same order as this code.
  */
 static void
 expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte,
index 07653f312bbcd0ec03d39e1f53567c3042e41ca9..7856b47cdd96718faa678ab6a961bfe231403717 100644 (file)
@@ -340,15 +340,23 @@ RelationBuildPartitionDesc(Relation rel)
        oldcxt = MemoryContextSwitchTo(rel->rd_pdcxt);
        partdesc->boundinfo = partition_bounds_copy(boundinfo, key);
        partdesc->oids = (Oid *) palloc(partdesc->nparts * sizeof(Oid));
+       partdesc->is_leaf = (bool *) palloc(partdesc->nparts * sizeof(bool));
 
        /*
         * Now assign OIDs from the original array into mapped indexes of the
-        * result array.  Order of OIDs in the former is defined by the catalog
-        * scan that retrieved them, whereas that in the latter is defined by
-        * canonicalized representation of the partition bounds.
+        * result array.  The order of OIDs in the former is defined by the
+        * catalog scan that retrieved them, whereas that in the latter is defined
+        * by the canonicalized representation of the partition bounds.
         */
        for (i = 0; i < partdesc->nparts; i++)
-               partdesc->oids[mapping[i]] = oids_orig[i];
+       {
+               int                     index = mapping[i];
+
+               partdesc->oids[index] = oids_orig[i];
+               /* Record if the partition is a leaf partition */
+               partdesc->is_leaf[index] =
+                               (get_rel_relkind(oids_orig[i]) != RELKIND_PARTITIONED_TABLE);
+       }
        MemoryContextSwitchTo(oldcxt);
 
        rel->rd_partdesc = partdesc;
index a53de2372e4d0913894cf2c72d888b109381a4b4..59c7a6ab6986d4f955b229b1299c46066ee3a89a 100644 (file)
 typedef struct PartitionDescData
 {
        int                     nparts;                 /* Number of partitions */
-       Oid                *oids;                       /* OIDs of partitions */
+       Oid                *oids;                       /* Array of 'nparts' elements containing
+                                                                * partition OIDs in order of their bounds */
+       bool       *is_leaf;            /* Array of 'nparts' elements storing whether
+                                                                * the corresponding 'oids' element belongs to
+                                                                * a leaf partition or not */
        PartitionBoundInfo boundinfo;   /* collection of partition bounds */
 } PartitionDescData;
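
Since is_leaf[] is computed once when the PartitionDesc is built, tuple routing can tell whether a matched child is a leaf or another partitioned table without repeating the get_rel_relkind() syscache lookup for every routed tuple. A hedged, stand-alone sketch of how a descent loop can consume such a flag (hypothetical types and a toy digit-based key; the authoritative logic is in ExecFindPartition() in execPartition.c):

#include <stdbool.h>
#include <stdio.h>

typedef struct Node Node;
struct Node
{
    const char *name;
    int         nparts;         /* number of children; 0 for a leaf */
    bool        is_leaf[2];     /* whether child i is a leaf partition */
    Node       *children[2];
};

/*
 * Descend from the root, consuming one "digit" of the toy key per level.
 * The parent's is_leaf[] flag decides whether to stop, without having to
 * look up the child's relkind.
 */
static const char *
route(Node *node, int key)
{
    for (;;)
    {
        int         i = key % node->nparts;

        key /= node->nparts;
        if (node->is_leaf[i])
            return node->children[i]->name;
        node = node->children[i];
    }
}

int
main(void)
{
    Node        h1 = {"m_y2018_h1", 0, {false, false}, {NULL, NULL}};
    Node        h2 = {"m_y2018_h2", 0, {false, false}, {NULL, NULL}};
    Node        y2017 = {"m_y2017", 0, {false, false}, {NULL, NULL}};
    Node        y2018 = {"m_y2018", 2, {true, true}, {&h1, &h2}};
    Node        root = {"measurement", 2, {true, false}, {&y2017, &y2018}};

    printf("%s\n", route(&root, 0));    /* m_y2017 */
    printf("%s\n", route(&root, 3));    /* m_y2018_h2 */
    return 0;
}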
 
index 3e08104ea4297e0b7ece1d9822d4e7fbe0f9ed74..d3cfb55f9f130aa44bb473e09117bbc0f6c43931 100644 (file)
 #include "nodes/plannodes.h"
 #include "partitioning/partprune.h"
 
-/* See execPartition.c for the definition. */
+/* See execPartition.c for the definitions. */
 typedef struct PartitionDispatchData *PartitionDispatch;
+typedef struct PartitionTupleRouting PartitionTupleRouting;
 
-/*-----------------------
- * PartitionTupleRouting - Encapsulates all information required to execute
- * tuple-routing between partitions.
+/*
+ * PartitionRoutingInfo
  *
- * partition_dispatch_info             Array of PartitionDispatch objects with one
- *                                                             entry for every partitioned table in the
- *                                                             partition tree.
- * num_dispatch                                        number of partitioned tables in the partition
- *                                                             tree (= length of partition_dispatch_info[])
- * partition_oids                              Array of leaf partitions OIDs with one entry
- *                                                             for every leaf partition in the partition tree,
- *                                                             initialized in full by
- *                                                             ExecSetupPartitionTupleRouting.
- * partitions                                  Array of ResultRelInfo* objects with one entry
- *                                                             for every leaf partition in the partition tree,
- *                                                             initialized lazily by ExecInitPartitionInfo.
- * num_partitions                              Number of leaf partitions in the partition tree
- *                                                             (= 'partitions_oid'/'partitions' array length)
- * parent_child_tupconv_maps   Array of TupleConversionMap objects with one
- *                                                             entry for every leaf partition (required to
- *                                                             convert tuple from the root table's rowtype to
- *                                                             a leaf partition's rowtype after tuple routing
- *                                                             is done)
- * child_parent_tupconv_maps   Array of TupleConversionMap objects with one
- *                                                             entry for every leaf partition (required to
- *                                                             convert an updated tuple from the leaf
- *                                                             partition's rowtype to the root table's rowtype
- *                                                             so that tuple routing can be done)
- * child_parent_map_not_required  Array of bool. True value means that a map is
- *                                                             determined to be not required for the given
- *                                                             partition. False means either we haven't yet
- *                                                             checked if a map is required, or it was
- *                                                             determined to be required.
- * subplan_partition_offsets   Integer array ordered by UPDATE subplans. Each
- *                                                             element of this array has the index into the
- *                                                             corresponding partition in partitions array.
- * num_subplan_partition_offsets  Length of 'subplan_partition_offsets' array
- * partition_tuple_slots               Array of TupleTableSlot objects; if non-NULL,
- *                                                             contains one entry for every leaf partition,
- *                                                             of which only those of the leaf partitions
- *                                                             whose attribute numbers differ from the root
- *                                                             parent have a non-NULL value.  NULL if all of
- *                                                             the partitions encountered by a given command
- *                                                             happen to have same rowtype as the root parent
- * root_tuple_slot                             TupleTableSlot to be used to transiently hold
- *                                                             copy of a tuple that's being moved across
- *                                                             partitions in the root partitioned table's
- *                                                             rowtype
- *-----------------------
+ * Additional result relation information specific to routing tuples to a
+ * table partition.
  */
-typedef struct PartitionTupleRouting
+typedef struct PartitionRoutingInfo
 {
-       PartitionDispatch *partition_dispatch_info;
-       int                     num_dispatch;
-       Oid                *partition_oids;
-       ResultRelInfo **partitions;
-       int                     num_partitions;
-       TupleConversionMap **parent_child_tupconv_maps;
-       TupleConversionMap **child_parent_tupconv_maps;
-       bool       *child_parent_map_not_required;
-       int                *subplan_partition_offsets;
-       int                     num_subplan_partition_offsets;
-       TupleTableSlot **partition_tuple_slots;
-       TupleTableSlot *root_tuple_slot;
-} PartitionTupleRouting;
+       /*
+        * Map for converting tuples in root partitioned table format into
+        * partition format, or NULL if no conversion is required.
+        */
+       TupleConversionMap *pi_RootToPartitionMap;
+
+       /*
+        * Map for converting tuples in partition format into the root partitioned
+        * table format, or NULL if no conversion is required.
+        */
+       TupleConversionMap *pi_PartitionToRootMap;
+
+       /*
+        * Slot to store tuples in partition format, or NULL when no translation
+        * is required between root and partition.
+        */
+       TupleTableSlot *pi_PartitionTupleSlot;
+} PartitionRoutingInfo;
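
Conceptually, the two maps are inverse column permutations (plus NULL padding for attributes missing on one side) between the root rowtype and the partition rowtype: pi_RootToPartitionMap is applied to a routed tuple before it is stored into pi_PartitionTupleSlot, and pi_PartitionToRootMap is applied when transition tuples must be reported in the root's rowtype. A toy illustration under the assumption of a pure column reorder (hypothetical helper; the real work is done by TupleConversionMap and execute_attr_map_slot()):

#include <stdio.h>

/*
 * Apply a 1-based attribute map: output column i comes from input column
 * map[i], or is left 0 ("NULL") when map[i] is 0.
 */
static void
apply_map(const int *map, int nout, const int *in, int *out)
{
    for (int i = 0; i < nout; i++)
        out[i] = (map[i] != 0) ? in[map[i] - 1] : 0;
}

int
main(void)
{
    /* root rowtype is (a, b, c); the partition was created as (b, c, a) */
    int     root_row[3] = {10, 20, 30};     /* a=10, b=20, c=30 */
    int     root_to_part[3] = {2, 3, 1};    /* partition col 1 <- root col 2, ... */
    int     part_to_root[3] = {3, 1, 2};    /* the inverse direction */
    int     part_row[3];
    int     back[3];

    apply_map(root_to_part, 3, root_row, part_row);     /* {20, 30, 10} */
    apply_map(part_to_root, 3, part_row, back);         /* {10, 20, 30} */

    printf("partition row: %d %d %d\n", part_row[0], part_row[1], part_row[2]);
    printf("root row back: %d %d %d\n", back[0], back[1], back[2]);
    return 0;
}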
 
 /*
  * PartitionedRelPruningData - Per-partitioned-table data for run-time pruning
@@ -175,22 +137,11 @@ typedef struct PartitionPruneState
 
 extern PartitionTupleRouting *ExecSetupPartitionTupleRouting(ModifyTableState *mtstate,
                                                           Relation rel);
-extern int ExecFindPartition(ResultRelInfo *resultRelInfo,
-                                 PartitionDispatch *pd,
+extern ResultRelInfo *ExecFindPartition(ModifyTableState *mtstate,
+                                 ResultRelInfo *rootResultRelInfo,
+                                 PartitionTupleRouting *proute,
                                  TupleTableSlot *slot,
                                  EState *estate);
-extern ResultRelInfo *ExecInitPartitionInfo(ModifyTableState *mtstate,
-                                         ResultRelInfo *resultRelInfo,
-                                         PartitionTupleRouting *proute,
-                                         EState *estate, int partidx);
-extern void ExecInitRoutingInfo(ModifyTableState *mtstate,
-                                       EState *estate,
-                                       PartitionTupleRouting *proute,
-                                       ResultRelInfo *partRelInfo,
-                                       int partidx);
-extern void ExecSetupChildParentMapForLeaf(PartitionTupleRouting *proute);
-extern TupleConversionMap *TupConvMapForLeaf(PartitionTupleRouting *proute,
-                                 ResultRelInfo *rootRelInfo, int leaf_index);
 extern void ExecCleanupTupleRouting(ModifyTableState *mtstate,
                                                PartitionTupleRouting *proute);
 extern PartitionPruneState *ExecCreatePartitionPruneState(PlanState *planstate,
index 63c871e6d03f7d95b8f593daf7166b0142fe2cf7..569cc7c476197ae80895f12186c7af7a0c91b54b 100644 (file)
@@ -33,6 +33,7 @@
 
 
 struct PlanState;                              /* forward references in this file */
+struct PartitionRoutingInfo;
 struct ParallelHashJoinState;
 struct ExecRowMark;
 struct ExprState;
@@ -469,8 +470,8 @@ typedef struct ResultRelInfo
        /* relation descriptor for root partitioned table */
        Relation        ri_PartitionRoot;
 
-       /* true if ready for tuple routing */
-       bool            ri_PartitionReadyForRouting;
+       /* Additional information specific to partition tuple routing */
+       struct PartitionRoutingInfo *ri_PartitionInfo;
 } ResultRelInfo;
 
 /* ----------------
@@ -1112,6 +1113,12 @@ typedef struct ModifyTableState
        List       *mt_excludedtlist;   /* the excluded pseudo relation's tlist  */
        TupleTableSlot *mt_conflproj;   /* CONFLICT ... SET ... projection target */
 
+       /*
+        * Slot for storing tuples in the root partitioned table's rowtype during
+        * an UPDATE of a partitioned table.
+        */
+       TupleTableSlot *mt_root_tuple_slot;
+
        /* Tuple-routing support info */
        struct PartitionTupleRouting *mt_partition_tuple_routing;