Fix tuple routing in cases where tuple descriptors don't match.

author Robert Haas <rhaas@postgresql.org>

Thu, 22 Dec 2016 22:31:52 +0000 (17:31 -0500)

committer Robert Haas <rhaas@postgresql.org>

Thu, 22 Dec 2016 22:36:37 +0000 (17:36 -0500)
author Robert Haas <rhaas@postgresql.org>
Thu, 22 Dec 2016 22:31:52 +0000 (17:31 -0500)
committer Robert Haas <rhaas@postgresql.org>
Thu, 22 Dec 2016 22:36:37 +0000 (17:36 -0500)
diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c

index 9980582b777af65838dcca0a7675b9c4ae70be14..fca874752f5d62a57fbd68b65a778bb96ad73e15 100644 (file)
--- a/src/backend/catalog/partition.c
+++ b/src/backend/catalog/partition.c
@@ -923,13 +923,19 @@ RelationGetPartitionQual(Relation rel, bool recurse)
         return generate_partition_qual(rel, recurse);
  }
  
-/* Turn an array of OIDs with N elements into a list */
-#define OID_ARRAY_TO_LIST(arr, N, list) \
+/*
+ * Append OIDs of rel's partitions to the list 'partoids' and for each OID,
+ * append pointer rel to the list 'parents'.
+ */
+#define APPEND_REL_PARTITION_OIDS(rel, partoids, parents) \
         do\
         {\
                 int             i;\
-               for (i = 0; i < (N); i++)\
-                       (list) = lappend_oid((list), (arr)[i]);\
+               for (i = 0; i < (rel)->rd_partdesc->nparts; i++)\
+               {\
+                       (partoids) = lappend_oid((partoids), (rel)->rd_partdesc->oids[i]);\
+                       (parents) = lappend((parents), (rel));\
+               }\
         } while(0)
  
  /*
@@ -944,11 +950,13 @@ PartitionDispatch *
  RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
                                                                  int *num_parted, List **leaf_part_oids)
  {
-       PartitionDesc rootpartdesc = RelationGetPartitionDesc(rel);
         PartitionDispatchData **pd;
         List       *all_parts = NIL,
-                          *parted_rels;
-       ListCell   *lc;
+                          *all_parents = NIL,
+                          *parted_rels,
+                          *parted_rel_parents;
+       ListCell   *lc1,
+                          *lc2;
         int                     i,
                                 k,
                                 offset;
@@ -965,10 +973,13 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
          */
         *num_parted = 1;
         parted_rels = list_make1(rel);
-       OID_ARRAY_TO_LIST(rootpartdesc->oids, rootpartdesc->nparts, all_parts);
-       foreach(lc, all_parts)
+       /* Root partitioned table has no parent, so NULL for parent */
+       parted_rel_parents = list_make1(NULL);
+       APPEND_REL_PARTITION_OIDS(rel, all_parts, all_parents);
+       forboth(lc1, all_parts, lc2, all_parents)
         {
-               Relation        partrel = heap_open(lfirst_oid(lc), lockmode);
+               Relation        partrel = heap_open(lfirst_oid(lc1), lockmode);
+               Relation        parent = lfirst(lc2);
                 PartitionDesc partdesc = RelationGetPartitionDesc(partrel);
  
                 /*
@@ -979,7 +990,8 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
                 {
                         (*num_parted)++;
                         parted_rels = lappend(parted_rels, partrel);
-                       OID_ARRAY_TO_LIST(partdesc->oids, partdesc->nparts, all_parts);
+                       parted_rel_parents = lappend(parted_rel_parents, parent);
+                       APPEND_REL_PARTITION_OIDS(partrel, all_parts, all_parents);
                 }
                 else
                         heap_close(partrel, NoLock);
@@ -1004,10 +1016,12 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
                                                                                    sizeof(PartitionDispatchData *));
         *leaf_part_oids = NIL;
         i = k = offset = 0;
-       foreach(lc, parted_rels)
+       forboth(lc1, parted_rels, lc2, parted_rel_parents)
         {
-               Relation        partrel = lfirst(lc);
+               Relation        partrel = lfirst(lc1);
+               Relation        parent = lfirst(lc2);
                 PartitionKey partkey = RelationGetPartitionKey(partrel);
+               TupleDesc        tupdesc = RelationGetDescr(partrel);
                 PartitionDesc partdesc = RelationGetPartitionDesc(partrel);
                 int                     j,
                                         m;
@@ -1017,6 +1031,27 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
                 pd[i]->key = partkey;
                 pd[i]->keystate = NIL;
                 pd[i]->partdesc = partdesc;
+               if (parent != NULL)
+               {
+                       /*
+                        * For every partitioned table other than root, we must store
+                        * a tuple table slot initialized with its tuple descriptor and
+                        * a tuple conversion map to convert a tuple from its parent's
+                        * rowtype to its own. That is to make sure that we are looking
+                        * at the correct row using the correct tuple descriptor when
+                        * computing its partition key for tuple routing.
+                        */
+                       pd[i]->tupslot = MakeSingleTupleTableSlot(tupdesc);
+                       pd[i]->tupmap = convert_tuples_by_name(RelationGetDescr(parent),
+                                                                                                  tupdesc,
+                                                               gettext_noop("could not convert row type"));
+               }
+               else
+               {
+                       /* Not required for the root partitioned table */
+                       pd[i]->tupslot = NULL;
+                       pd[i]->tupmap = NULL;
+               }
                 pd[i]->indexes = (int *) palloc(partdesc->nparts * sizeof(int));
  
                 /*
@@ -1610,6 +1645,8 @@ get_partition_for_tuple(PartitionDispatch *pd,
         {
                 PartitionKey key = parent->key;
                 PartitionDesc partdesc = parent->partdesc;
+               TupleTableSlot *myslot = parent->tupslot;
+               TupleConversionMap *map = parent->tupmap;
  
                 /* Quick exit */
                 if (partdesc->nparts == 0)
@@ -1618,6 +1655,17 @@ get_partition_for_tuple(PartitionDispatch *pd,
                         return -1;
                 }
  
+               if (myslot != NULL)
+               {
+                       HeapTuple       tuple = ExecFetchSlotTuple(slot);
+
+                       ExecClearTuple(myslot);
+                       Assert(map != NULL);
+                       tuple = do_convert_tuple(tuple, map);
+                       ExecStoreTuple(tuple, myslot, InvalidBuffer, true);
+                       slot = myslot;
+               }
+
                 /* Extract partition key from tuple */
                 FormPartitionKeyDatum(parent, slot, estate, values, isnull);
  
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c

index d5901651db18c259de9235f895b72ea5dee83ad0..aa25a23336da0c3316f3bb207fd813728a9d9044 100644 (file)
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -2435,6 +2435,15 @@ CopyFrom(CopyState cstate)
         /* Triggers might need a slot as well */
         estate->es_trig_tuple_slot = ExecInitExtraTupleSlot(estate);
  
+       /*
+        * Initialize a dedicated slot to manipulate tuples of any given
+        * partition's rowtype.
+        */
+       if (cstate->partition_dispatch_info)
+               estate->es_partition_tuple_slot = ExecInitExtraTupleSlot(estate);
+       else
+               estate->es_partition_tuple_slot = NULL;
+
         /*
          * It's more efficient to prepare a bunch of tuples for insertion, and
          * insert them in one heap_multi_insert() call, than call heap_insert()
@@ -2484,7 +2493,8 @@ CopyFrom(CopyState cstate)
  
         for (;;)
         {
-               TupleTableSlot *slot;
+               TupleTableSlot *slot,
+                                          *oldslot = NULL;
                 bool            skip_tuple;
                 Oid                     loaded_oid = InvalidOid;
  
@@ -2571,7 +2581,19 @@ CopyFrom(CopyState cstate)
                         map = cstate->partition_tupconv_maps[leaf_part_index];
                         if (map)
                         {
+                               Relation        partrel = resultRelInfo->ri_RelationDesc;
+
                                 tuple = do_convert_tuple(tuple, map);
+
+                               /*
+                                * We must use the partition's tuple descriptor from this
+                                * point on.  Use a dedicated slot from this point on until
+                                * we're finished dealing with the partition.
+                                */
+                               oldslot = slot;
+                               slot = estate->es_partition_tuple_slot;
+                               Assert(slot != NULL);
+                               ExecSetSlotDescriptor(slot, RelationGetDescr(partrel));
                                 ExecStoreTuple(tuple, slot, InvalidBuffer, true);
                         }
  
@@ -2667,6 +2689,10 @@ CopyFrom(CopyState cstate)
                         {
                                 resultRelInfo = saved_resultRelInfo;
                                 estate->es_result_relation_info = resultRelInfo;
+
+                               /* Switch back to the slot corresponding to the root table */
+                               Assert(oldslot != NULL);
+                               slot = oldslot;
                         }
                 }
         }
@@ -2714,13 +2740,14 @@ CopyFrom(CopyState cstate)
                  * Remember cstate->partition_dispatch_info[0] corresponds to the root
                  * partitioned table, which we must not try to close, because it is
                  * the main target table of COPY that will be closed eventually by
-                * DoCopy().
+                * DoCopy().  Also, tupslot is NULL for the root partitioned table.
                  */
                 for (i = 1; i < cstate->num_dispatch; i++)
                 {
                         PartitionDispatch pd = cstate->partition_dispatch_info[i];
  
                         heap_close(pd->reldesc, NoLock);
+                       ExecDropSingleTupleTableSlot(pd->tupslot);
                 }
                 for (i = 0; i < cstate->num_partitions; i++)
                 {
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c

index a9546106ceca8e438ea5597f527230c5bffe8d61..0d85b151c205746ea8fd1bd9c38a42a4749a7c78 100644 (file)
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -262,6 +262,7 @@ ExecInsert(ModifyTableState *mtstate,
         Relation        resultRelationDesc;
         Oid                     newId;
         List       *recheckIndexes = NIL;
+       TupleTableSlot *oldslot = NULL;
  
         /*
          * get the heap tuple out of the tuple table slot, making sure we have a
@@ -318,7 +319,19 @@ ExecInsert(ModifyTableState *mtstate,
                 map = mtstate->mt_partition_tupconv_maps[leaf_part_index];
                 if (map)
                 {
+                       Relation partrel = resultRelInfo->ri_RelationDesc;
+
                         tuple = do_convert_tuple(tuple, map);
+
+                       /*
+                        * We must use the partition's tuple descriptor from this
+                        * point on, until we're finished dealing with the partition.
+                        * Use the dedicated slot for that.
+                        */
+                       oldslot = slot;
+                       slot = estate->es_partition_tuple_slot;
+                       Assert(slot != NULL);
+                       ExecSetSlotDescriptor(slot, RelationGetDescr(partrel));
                         ExecStoreTuple(tuple, slot, InvalidBuffer, true);
                 }
         }
@@ -566,6 +579,10 @@ ExecInsert(ModifyTableState *mtstate,
         {
                 resultRelInfo = saved_resultRelInfo;
                 estate->es_result_relation_info = resultRelInfo;
+
+               /* Switch back to the slot corresponding to the root table */
+               Assert(oldslot != NULL);
+               slot = oldslot;
         }
  
         /*
@@ -1734,7 +1751,15 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
                 mtstate->mt_partitions = partitions;
                 mtstate->mt_num_partitions = num_partitions;
                 mtstate->mt_partition_tupconv_maps = partition_tupconv_maps;
+
+               /*
+                * Initialize a dedicated slot to manipulate tuples of any given
+                * partition's rowtype.
+                */
+               estate->es_partition_tuple_slot = ExecInitExtraTupleSlot(estate);
         }
+       else
+               estate->es_partition_tuple_slot = NULL;
  
         /*
          * Initialize any WITH CHECK OPTION constraints if needed.
@@ -2058,12 +2083,14 @@ ExecEndModifyTable(ModifyTableState *node)
          * Remember node->mt_partition_dispatch_info[0] corresponds to the root
          * partitioned table, which we must not try to close, because it is the
          * main target table of the query that will be closed by ExecEndPlan().
+        * Also, tupslot is NULL for the root partitioned table.
          */
         for (i = 1; i < node->mt_num_dispatch; i++)
         {
                 PartitionDispatch pd = node->mt_partition_dispatch_info[i];
  
                 heap_close(pd->reldesc, NoLock);
+               ExecDropSingleTupleTableSlot(pd->tupslot);
         }
         for (i = 0; i < node->mt_num_partitions; i++)
         {
diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h

index 21effbf87b47900b0edf3b50c129b6c39545d778..bf38df5d29e52e186d647df530f797fb81797570 100644 (file)
--- a/src/include/catalog/partition.h
+++ b/src/include/catalog/partition.h
@@ -47,6 +47,11 @@ typedef struct PartitionDescData *PartitionDesc;
   *     key                     Partition key information of the table
   *     keystate        Execution state required for expressions in the partition key
   *     partdesc        Partition descriptor of the table
+ *     tupslot         A standalone TupleTableSlot initialized with this table's tuple
+ *                             descriptor
+ *     tupmap          TupleConversionMap to convert from the parent's rowtype to
+ *                             this table's rowtype (when extracting the partition key of a
+ *                             tuple just before routing it through this table)
   *     indexes         Array with partdesc->nparts members (for details on what
   *                             individual members represent, see how they are set in
   *                             RelationGetPartitionDispatchInfo())
@@ -58,6 +63,8 @@ typedef struct PartitionDispatchData
         PartitionKey                    key;
         List                               *keystate;   /* list of ExprState */
         PartitionDesc                   partdesc;
+       TupleTableSlot             *tupslot;
+       TupleConversionMap         *tupmap;
         int                                        *indexes;
  } PartitionDispatchData;
  
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h

index cc0821bcac987bb3d473084fa790c8cf346ee8eb..d43ec56a2b3b40e603038c63a36050445bcaac87 100644 (file)
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -384,6 +384,9 @@ typedef struct EState
         TupleTableSlot *es_trig_oldtup_slot;            /* for TriggerEnabled */
         TupleTableSlot *es_trig_newtup_slot;            /* for TriggerEnabled */
  
+       /* Slot used to manipulate a tuple after it is routed to a partition */
+       TupleTableSlot *es_partition_tuple_slot;
+
         /* Parameter info: */
         ParamListInfo es_param_list_info;       /* values of external params */
         ParamExecData *es_param_exec_vals;      /* values of internal params */
diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out

index 561cefa3c4d6efe62977b3711a52069782649556..49f667b1194daf52f273f7f34ea6f179b4dce5a1 100644 (file)
--- a/src/test/regress/expected/insert.out
+++ b/src/test/regress/expected/insert.out
@@ -300,3 +300,40 @@ drop cascades to table part_null
  drop cascades to table part_ee_ff
  drop cascades to table part_ee_ff1
  drop cascades to table part_ee_ff2
+-- more tests for certain multi-level partitioning scenarios
+create table p (a int, b int) partition by range (a, b);
+create table p1 (b int, a int not null) partition by range (b);
+create table p11 (like p1);
+alter table p11 drop a;
+alter table p11 add a int;
+alter table p11 drop a;
+alter table p11 add a int not null;
+-- attnum for key attribute 'a' is different in p, p1, and p11
+select attrelid::regclass, attname, attnum
+from pg_attribute
+where attname = 'a'
+ and (attrelid = 'p'::regclass
+   or attrelid = 'p1'::regclass
+   or attrelid = 'p11'::regclass);
+ attrelid | attname | attnum 
+----------+---------+--------
+ p        | a       |      1
+ p1       | a       |      2
+ p11      | a       |      4
+(3 rows)
+
+alter table p1 attach partition p11 for values from (2) to (5);
+alter table p attach partition p1 for values from (1, 2) to (1, 10);
+-- check that "(1, 2)" is correctly routed to p11.
+insert into p values (1, 2);
+select tableoid::regclass, * from p;
+ tableoid | a | b 
+----------+---+---
+ p11      | 1 | 2
+(1 row)
+
+-- cleanup
+drop table p cascade;
+NOTICE:  drop cascades to 2 other objects
+DETAIL:  drop cascades to table p1
+drop cascades to table p11
diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql

index 846bb5897a301611d99d8c7259c04dffb0bfa2a1..08dc068de8069c28e569263744ceede7d7da0bd8 100644 (file)
--- a/src/test/regress/sql/insert.sql
+++ b/src/test/regress/sql/insert.sql
@@ -170,3 +170,29 @@ select tableoid::regclass, * from list_parted;
  -- cleanup
  drop table range_parted cascade;
  drop table list_parted cascade;
+
+-- more tests for certain multi-level partitioning scenarios
+create table p (a int, b int) partition by range (a, b);
+create table p1 (b int, a int not null) partition by range (b);
+create table p11 (like p1);
+alter table p11 drop a;
+alter table p11 add a int;
+alter table p11 drop a;
+alter table p11 add a int not null;
+-- attnum for key attribute 'a' is different in p, p1, and p11
+select attrelid::regclass, attname, attnum
+from pg_attribute
+where attname = 'a'
+ and (attrelid = 'p'::regclass
+   or attrelid = 'p1'::regclass
+   or attrelid = 'p11'::regclass);
+
+alter table p1 attach partition p11 for values from (2) to (5);
+alter table p attach partition p1 for values from (1, 2) to (1, 10);
+
+-- check that "(1, 2)" is correctly routed to p11.
+insert into p values (1, 2);
+select tableoid::regclass, * from p;
+
+-- cleanup
+drop table p cascade;
author	Robert Haas <rhaas@postgresql.org>
	Thu, 22 Dec 2016 22:31:52 +0000 (17:31 -0500)
committer	Robert Haas <rhaas@postgresql.org>
	Thu, 22 Dec 2016 22:36:37 +0000 (17:36 -0500)
src/backend/catalog/partition.c		patch \| blob \| history
src/backend/commands/copy.c		patch \| blob \| history
src/backend/executor/nodeModifyTable.c		patch \| blob \| history
src/include/catalog/partition.h		patch \| blob \| history
src/include/nodes/execnodes.h		patch \| blob \| history
src/test/regress/expected/insert.out		patch \| blob \| history
src/test/regress/sql/insert.sql		patch \| blob \| history