]> granicus.if.org Git - postgresql/commitdiff
Fix tuple routing in cases where tuple descriptors don't match.
authorRobert Haas <rhaas@postgresql.org>
Thu, 22 Dec 2016 22:31:52 +0000 (17:31 -0500)
committerRobert Haas <rhaas@postgresql.org>
Thu, 22 Dec 2016 22:36:37 +0000 (17:36 -0500)
The previous coding failed to work correctly when we have a
multi-level partitioned hierarchy where tables at successive levels
have different attribute numbers for the partition key attributes.  To
fix, have each PartitionDispatch object store a standalone
TupleTableSlot initialized with the TupleDesc of the corresponding
partitioned table, along with a TupleConversionMap to map tuples from
the its parent's rowtype to own rowtype.  After tuple routing chooses
a leaf partition, we must use the leaf partition's tuple descriptor,
not the root table's.  To that end, a dedicated TupleTableSlot for
tuple routing is now allocated in EState.

Amit Langote

src/backend/catalog/partition.c
src/backend/commands/copy.c
src/backend/executor/nodeModifyTable.c
src/include/catalog/partition.h
src/include/nodes/execnodes.h
src/test/regress/expected/insert.out
src/test/regress/sql/insert.sql

index 9980582b777af65838dcca0a7675b9c4ae70be14..fca874752f5d62a57fbd68b65a778bb96ad73e15 100644 (file)
@@ -923,13 +923,19 @@ RelationGetPartitionQual(Relation rel, bool recurse)
        return generate_partition_qual(rel, recurse);
 }
 
-/* Turn an array of OIDs with N elements into a list */
-#define OID_ARRAY_TO_LIST(arr, N, list) \
+/*
+ * Append OIDs of rel's partitions to the list 'partoids' and for each OID,
+ * append pointer rel to the list 'parents'.
+ */
+#define APPEND_REL_PARTITION_OIDS(rel, partoids, parents) \
        do\
        {\
                int             i;\
-               for (i = 0; i < (N); i++)\
-                       (list) = lappend_oid((list), (arr)[i]);\
+               for (i = 0; i < (rel)->rd_partdesc->nparts; i++)\
+               {\
+                       (partoids) = lappend_oid((partoids), (rel)->rd_partdesc->oids[i]);\
+                       (parents) = lappend((parents), (rel));\
+               }\
        } while(0)
 
 /*
@@ -944,11 +950,13 @@ PartitionDispatch *
 RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
                                                                 int *num_parted, List **leaf_part_oids)
 {
-       PartitionDesc rootpartdesc = RelationGetPartitionDesc(rel);
        PartitionDispatchData **pd;
        List       *all_parts = NIL,
-                          *parted_rels;
-       ListCell   *lc;
+                          *all_parents = NIL,
+                          *parted_rels,
+                          *parted_rel_parents;
+       ListCell   *lc1,
+                          *lc2;
        int                     i,
                                k,
                                offset;
@@ -965,10 +973,13 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
         */
        *num_parted = 1;
        parted_rels = list_make1(rel);
-       OID_ARRAY_TO_LIST(rootpartdesc->oids, rootpartdesc->nparts, all_parts);
-       foreach(lc, all_parts)
+       /* Root partitioned table has no parent, so NULL for parent */
+       parted_rel_parents = list_make1(NULL);
+       APPEND_REL_PARTITION_OIDS(rel, all_parts, all_parents);
+       forboth(lc1, all_parts, lc2, all_parents)
        {
-               Relation        partrel = heap_open(lfirst_oid(lc), lockmode);
+               Relation        partrel = heap_open(lfirst_oid(lc1), lockmode);
+               Relation        parent = lfirst(lc2);
                PartitionDesc partdesc = RelationGetPartitionDesc(partrel);
 
                /*
@@ -979,7 +990,8 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
                {
                        (*num_parted)++;
                        parted_rels = lappend(parted_rels, partrel);
-                       OID_ARRAY_TO_LIST(partdesc->oids, partdesc->nparts, all_parts);
+                       parted_rel_parents = lappend(parted_rel_parents, parent);
+                       APPEND_REL_PARTITION_OIDS(partrel, all_parts, all_parents);
                }
                else
                        heap_close(partrel, NoLock);
@@ -1004,10 +1016,12 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
                                                                                   sizeof(PartitionDispatchData *));
        *leaf_part_oids = NIL;
        i = k = offset = 0;
-       foreach(lc, parted_rels)
+       forboth(lc1, parted_rels, lc2, parted_rel_parents)
        {
-               Relation        partrel = lfirst(lc);
+               Relation        partrel = lfirst(lc1);
+               Relation        parent = lfirst(lc2);
                PartitionKey partkey = RelationGetPartitionKey(partrel);
+               TupleDesc        tupdesc = RelationGetDescr(partrel);
                PartitionDesc partdesc = RelationGetPartitionDesc(partrel);
                int                     j,
                                        m;
@@ -1017,6 +1031,27 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
                pd[i]->key = partkey;
                pd[i]->keystate = NIL;
                pd[i]->partdesc = partdesc;
+               if (parent != NULL)
+               {
+                       /*
+                        * For every partitioned table other than root, we must store
+                        * a tuple table slot initialized with its tuple descriptor and
+                        * a tuple conversion map to convert a tuple from its parent's
+                        * rowtype to its own. That is to make sure that we are looking
+                        * at the correct row using the correct tuple descriptor when
+                        * computing its partition key for tuple routing.
+                        */
+                       pd[i]->tupslot = MakeSingleTupleTableSlot(tupdesc);
+                       pd[i]->tupmap = convert_tuples_by_name(RelationGetDescr(parent),
+                                                                                                  tupdesc,
+                                                               gettext_noop("could not convert row type"));
+               }
+               else
+               {
+                       /* Not required for the root partitioned table */
+                       pd[i]->tupslot = NULL;
+                       pd[i]->tupmap = NULL;
+               }
                pd[i]->indexes = (int *) palloc(partdesc->nparts * sizeof(int));
 
                /*
@@ -1610,6 +1645,8 @@ get_partition_for_tuple(PartitionDispatch *pd,
        {
                PartitionKey key = parent->key;
                PartitionDesc partdesc = parent->partdesc;
+               TupleTableSlot *myslot = parent->tupslot;
+               TupleConversionMap *map = parent->tupmap;
 
                /* Quick exit */
                if (partdesc->nparts == 0)
@@ -1618,6 +1655,17 @@ get_partition_for_tuple(PartitionDispatch *pd,
                        return -1;
                }
 
+               if (myslot != NULL)
+               {
+                       HeapTuple       tuple = ExecFetchSlotTuple(slot);
+
+                       ExecClearTuple(myslot);
+                       Assert(map != NULL);
+                       tuple = do_convert_tuple(tuple, map);
+                       ExecStoreTuple(tuple, myslot, InvalidBuffer, true);
+                       slot = myslot;
+               }
+
                /* Extract partition key from tuple */
                FormPartitionKeyDatum(parent, slot, estate, values, isnull);
 
index d5901651db18c259de9235f895b72ea5dee83ad0..aa25a23336da0c3316f3bb207fd813728a9d9044 100644 (file)
@@ -2435,6 +2435,15 @@ CopyFrom(CopyState cstate)
        /* Triggers might need a slot as well */
        estate->es_trig_tuple_slot = ExecInitExtraTupleSlot(estate);
 
+       /*
+        * Initialize a dedicated slot to manipulate tuples of any given
+        * partition's rowtype.
+        */
+       if (cstate->partition_dispatch_info)
+               estate->es_partition_tuple_slot = ExecInitExtraTupleSlot(estate);
+       else
+               estate->es_partition_tuple_slot = NULL;
+
        /*
         * It's more efficient to prepare a bunch of tuples for insertion, and
         * insert them in one heap_multi_insert() call, than call heap_insert()
@@ -2484,7 +2493,8 @@ CopyFrom(CopyState cstate)
 
        for (;;)
        {
-               TupleTableSlot *slot;
+               TupleTableSlot *slot,
+                                          *oldslot = NULL;
                bool            skip_tuple;
                Oid                     loaded_oid = InvalidOid;
 
@@ -2571,7 +2581,19 @@ CopyFrom(CopyState cstate)
                        map = cstate->partition_tupconv_maps[leaf_part_index];
                        if (map)
                        {
+                               Relation        partrel = resultRelInfo->ri_RelationDesc;
+
                                tuple = do_convert_tuple(tuple, map);
+
+                               /*
+                                * We must use the partition's tuple descriptor from this
+                                * point on.  Use a dedicated slot from this point on until
+                                * we're finished dealing with the partition.
+                                */
+                               oldslot = slot;
+                               slot = estate->es_partition_tuple_slot;
+                               Assert(slot != NULL);
+                               ExecSetSlotDescriptor(slot, RelationGetDescr(partrel));
                                ExecStoreTuple(tuple, slot, InvalidBuffer, true);
                        }
 
@@ -2667,6 +2689,10 @@ CopyFrom(CopyState cstate)
                        {
                                resultRelInfo = saved_resultRelInfo;
                                estate->es_result_relation_info = resultRelInfo;
+
+                               /* Switch back to the slot corresponding to the root table */
+                               Assert(oldslot != NULL);
+                               slot = oldslot;
                        }
                }
        }
@@ -2714,13 +2740,14 @@ CopyFrom(CopyState cstate)
                 * Remember cstate->partition_dispatch_info[0] corresponds to the root
                 * partitioned table, which we must not try to close, because it is
                 * the main target table of COPY that will be closed eventually by
-                * DoCopy().
+                * DoCopy().  Also, tupslot is NULL for the root partitioned table.
                 */
                for (i = 1; i < cstate->num_dispatch; i++)
                {
                        PartitionDispatch pd = cstate->partition_dispatch_info[i];
 
                        heap_close(pd->reldesc, NoLock);
+                       ExecDropSingleTupleTableSlot(pd->tupslot);
                }
                for (i = 0; i < cstate->num_partitions; i++)
                {
index a9546106ceca8e438ea5597f527230c5bffe8d61..0d85b151c205746ea8fd1bd9c38a42a4749a7c78 100644 (file)
@@ -262,6 +262,7 @@ ExecInsert(ModifyTableState *mtstate,
        Relation        resultRelationDesc;
        Oid                     newId;
        List       *recheckIndexes = NIL;
+       TupleTableSlot *oldslot = NULL;
 
        /*
         * get the heap tuple out of the tuple table slot, making sure we have a
@@ -318,7 +319,19 @@ ExecInsert(ModifyTableState *mtstate,
                map = mtstate->mt_partition_tupconv_maps[leaf_part_index];
                if (map)
                {
+                       Relation partrel = resultRelInfo->ri_RelationDesc;
+
                        tuple = do_convert_tuple(tuple, map);
+
+                       /*
+                        * We must use the partition's tuple descriptor from this
+                        * point on, until we're finished dealing with the partition.
+                        * Use the dedicated slot for that.
+                        */
+                       oldslot = slot;
+                       slot = estate->es_partition_tuple_slot;
+                       Assert(slot != NULL);
+                       ExecSetSlotDescriptor(slot, RelationGetDescr(partrel));
                        ExecStoreTuple(tuple, slot, InvalidBuffer, true);
                }
        }
@@ -566,6 +579,10 @@ ExecInsert(ModifyTableState *mtstate,
        {
                resultRelInfo = saved_resultRelInfo;
                estate->es_result_relation_info = resultRelInfo;
+
+               /* Switch back to the slot corresponding to the root table */
+               Assert(oldslot != NULL);
+               slot = oldslot;
        }
 
        /*
@@ -1734,7 +1751,15 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
                mtstate->mt_partitions = partitions;
                mtstate->mt_num_partitions = num_partitions;
                mtstate->mt_partition_tupconv_maps = partition_tupconv_maps;
+
+               /*
+                * Initialize a dedicated slot to manipulate tuples of any given
+                * partition's rowtype.
+                */
+               estate->es_partition_tuple_slot = ExecInitExtraTupleSlot(estate);
        }
+       else
+               estate->es_partition_tuple_slot = NULL;
 
        /*
         * Initialize any WITH CHECK OPTION constraints if needed.
@@ -2058,12 +2083,14 @@ ExecEndModifyTable(ModifyTableState *node)
         * Remember node->mt_partition_dispatch_info[0] corresponds to the root
         * partitioned table, which we must not try to close, because it is the
         * main target table of the query that will be closed by ExecEndPlan().
+        * Also, tupslot is NULL for the root partitioned table.
         */
        for (i = 1; i < node->mt_num_dispatch; i++)
        {
                PartitionDispatch pd = node->mt_partition_dispatch_info[i];
 
                heap_close(pd->reldesc, NoLock);
+               ExecDropSingleTupleTableSlot(pd->tupslot);
        }
        for (i = 0; i < node->mt_num_partitions; i++)
        {
index 21effbf87b47900b0edf3b50c129b6c39545d778..bf38df5d29e52e186d647df530f797fb81797570 100644 (file)
@@ -47,6 +47,11 @@ typedef struct PartitionDescData *PartitionDesc;
  *     key                     Partition key information of the table
  *     keystate        Execution state required for expressions in the partition key
  *     partdesc        Partition descriptor of the table
+ *     tupslot         A standalone TupleTableSlot initialized with this table's tuple
+ *                             descriptor
+ *     tupmap          TupleConversionMap to convert from the parent's rowtype to
+ *                             this table's rowtype (when extracting the partition key of a
+ *                             tuple just before routing it through this table)
  *     indexes         Array with partdesc->nparts members (for details on what
  *                             individual members represent, see how they are set in
  *                             RelationGetPartitionDispatchInfo())
@@ -58,6 +63,8 @@ typedef struct PartitionDispatchData
        PartitionKey                    key;
        List                               *keystate;   /* list of ExprState */
        PartitionDesc                   partdesc;
+       TupleTableSlot             *tupslot;
+       TupleConversionMap         *tupmap;
        int                                        *indexes;
 } PartitionDispatchData;
 
index cc0821bcac987bb3d473084fa790c8cf346ee8eb..d43ec56a2b3b40e603038c63a36050445bcaac87 100644 (file)
@@ -384,6 +384,9 @@ typedef struct EState
        TupleTableSlot *es_trig_oldtup_slot;            /* for TriggerEnabled */
        TupleTableSlot *es_trig_newtup_slot;            /* for TriggerEnabled */
 
+       /* Slot used to manipulate a tuple after it is routed to a partition */
+       TupleTableSlot *es_partition_tuple_slot;
+
        /* Parameter info: */
        ParamListInfo es_param_list_info;       /* values of external params */
        ParamExecData *es_param_exec_vals;      /* values of internal params */
index 561cefa3c4d6efe62977b3711a52069782649556..49f667b1194daf52f273f7f34ea6f179b4dce5a1 100644 (file)
@@ -300,3 +300,40 @@ drop cascades to table part_null
 drop cascades to table part_ee_ff
 drop cascades to table part_ee_ff1
 drop cascades to table part_ee_ff2
+-- more tests for certain multi-level partitioning scenarios
+create table p (a int, b int) partition by range (a, b);
+create table p1 (b int, a int not null) partition by range (b);
+create table p11 (like p1);
+alter table p11 drop a;
+alter table p11 add a int;
+alter table p11 drop a;
+alter table p11 add a int not null;
+-- attnum for key attribute 'a' is different in p, p1, and p11
+select attrelid::regclass, attname, attnum
+from pg_attribute
+where attname = 'a'
+ and (attrelid = 'p'::regclass
+   or attrelid = 'p1'::regclass
+   or attrelid = 'p11'::regclass);
+ attrelid | attname | attnum 
+----------+---------+--------
+ p        | a       |      1
+ p1       | a       |      2
+ p11      | a       |      4
+(3 rows)
+
+alter table p1 attach partition p11 for values from (2) to (5);
+alter table p attach partition p1 for values from (1, 2) to (1, 10);
+-- check that "(1, 2)" is correctly routed to p11.
+insert into p values (1, 2);
+select tableoid::regclass, * from p;
+ tableoid | a | b 
+----------+---+---
+ p11      | 1 | 2
+(1 row)
+
+-- cleanup
+drop table p cascade;
+NOTICE:  drop cascades to 2 other objects
+DETAIL:  drop cascades to table p1
+drop cascades to table p11
index 846bb5897a301611d99d8c7259c04dffb0bfa2a1..08dc068de8069c28e569263744ceede7d7da0bd8 100644 (file)
@@ -170,3 +170,29 @@ select tableoid::regclass, * from list_parted;
 -- cleanup
 drop table range_parted cascade;
 drop table list_parted cascade;
+
+-- more tests for certain multi-level partitioning scenarios
+create table p (a int, b int) partition by range (a, b);
+create table p1 (b int, a int not null) partition by range (b);
+create table p11 (like p1);
+alter table p11 drop a;
+alter table p11 add a int;
+alter table p11 drop a;
+alter table p11 add a int not null;
+-- attnum for key attribute 'a' is different in p, p1, and p11
+select attrelid::regclass, attname, attnum
+from pg_attribute
+where attname = 'a'
+ and (attrelid = 'p'::regclass
+   or attrelid = 'p1'::regclass
+   or attrelid = 'p11'::regclass);
+
+alter table p1 attach partition p11 for values from (2) to (5);
+alter table p attach partition p1 for values from (1, 2) to (1, 10);
+
+-- check that "(1, 2)" is correctly routed to p11.
+insert into p values (1, 2);
+select tableoid::regclass, * from p;
+
+-- cleanup
+drop table p cascade;