From: Robert Haas <rhaas@postgresql.org>
Date: Thu, 22 Dec 2016 22:31:52 +0000 (-0500)
Subject: Fix tuple routing in cases where tuple descriptors don't match.
X-Git-Tag: REL_10_BETA1~1199
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=2ac3ef7a01df859c62d0a02333b646d65eaec5ff;p=postgresql

Fix tuple routing in cases where tuple descriptors don't match.

The previous coding failed to work correctly when we have a
multi-level partitioned hierarchy where tables at successive levels
have different attribute numbers for the partition key attributes.  To
fix, have each PartitionDispatch object store a standalone
TupleTableSlot initialized with the TupleDesc of the corresponding
partitioned table, along with a TupleConversionMap to map tuples from
the its parent's rowtype to own rowtype.  After tuple routing chooses
a leaf partition, we must use the leaf partition's tuple descriptor,
not the root table's.  To that end, a dedicated TupleTableSlot for
tuple routing is now allocated in EState.

Amit Langote
---

diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c
index 9980582b77..fca874752f 100644
--- a/src/backend/catalog/partition.c
+++ b/src/backend/catalog/partition.c
@@ -923,13 +923,19 @@ RelationGetPartitionQual(Relation rel, bool recurse)
 	return generate_partition_qual(rel, recurse);
 }
 
-/* Turn an array of OIDs with N elements into a list */
-#define OID_ARRAY_TO_LIST(arr, N, list) \
+/*
+ * Append OIDs of rel's partitions to the list 'partoids' and for each OID,
+ * append pointer rel to the list 'parents'.
+ */
+#define APPEND_REL_PARTITION_OIDS(rel, partoids, parents) \
 	do\
 	{\
 		int		i;\
-		for (i = 0; i < (N); i++)\
-			(list) = lappend_oid((list), (arr)[i]);\
+		for (i = 0; i < (rel)->rd_partdesc->nparts; i++)\
+		{\
+			(partoids) = lappend_oid((partoids), (rel)->rd_partdesc->oids[i]);\
+			(parents) = lappend((parents), (rel));\
+		}\
 	} while(0)
 
 /*
@@ -944,11 +950,13 @@ PartitionDispatch *
 RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
 								 int *num_parted, List **leaf_part_oids)
 {
-	PartitionDesc rootpartdesc = RelationGetPartitionDesc(rel);
 	PartitionDispatchData **pd;
 	List	   *all_parts = NIL,
-			   *parted_rels;
-	ListCell   *lc;
+			   *all_parents = NIL,
+			   *parted_rels,
+			   *parted_rel_parents;
+	ListCell   *lc1,
+			   *lc2;
 	int			i,
 				k,
 				offset;
@@ -965,10 +973,13 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
 	 */
 	*num_parted = 1;
 	parted_rels = list_make1(rel);
-	OID_ARRAY_TO_LIST(rootpartdesc->oids, rootpartdesc->nparts, all_parts);
-	foreach(lc, all_parts)
+	/* Root partitioned table has no parent, so NULL for parent */
+	parted_rel_parents = list_make1(NULL);
+	APPEND_REL_PARTITION_OIDS(rel, all_parts, all_parents);
+	forboth(lc1, all_parts, lc2, all_parents)
 	{
-		Relation	partrel = heap_open(lfirst_oid(lc), lockmode);
+		Relation	partrel = heap_open(lfirst_oid(lc1), lockmode);
+		Relation	parent = lfirst(lc2);
 		PartitionDesc partdesc = RelationGetPartitionDesc(partrel);
 
 		/*
@@ -979,7 +990,8 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
 		{
 			(*num_parted)++;
 			parted_rels = lappend(parted_rels, partrel);
-			OID_ARRAY_TO_LIST(partdesc->oids, partdesc->nparts, all_parts);
+			parted_rel_parents = lappend(parted_rel_parents, parent);
+			APPEND_REL_PARTITION_OIDS(partrel, all_parts, all_parents);
 		}
 		else
 			heap_close(partrel, NoLock);
@@ -1004,10 +1016,12 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
 										   sizeof(PartitionDispatchData *));
 	*leaf_part_oids = NIL;
 	i = k = offset = 0;
-	foreach(lc, parted_rels)
+	forboth(lc1, parted_rels, lc2, parted_rel_parents)
 	{
-		Relation	partrel = lfirst(lc);
+		Relation	partrel = lfirst(lc1);
+		Relation	parent = lfirst(lc2);
 		PartitionKey partkey = RelationGetPartitionKey(partrel);
+		TupleDesc	 tupdesc = RelationGetDescr(partrel);
 		PartitionDesc partdesc = RelationGetPartitionDesc(partrel);
 		int			j,
 					m;
@@ -1017,6 +1031,27 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
 		pd[i]->key = partkey;
 		pd[i]->keystate = NIL;
 		pd[i]->partdesc = partdesc;
+		if (parent != NULL)
+		{
+			/*
+			 * For every partitioned table other than root, we must store
+			 * a tuple table slot initialized with its tuple descriptor and
+			 * a tuple conversion map to convert a tuple from its parent's
+			 * rowtype to its own. That is to make sure that we are looking
+			 * at the correct row using the correct tuple descriptor when
+			 * computing its partition key for tuple routing.
+			 */
+			pd[i]->tupslot = MakeSingleTupleTableSlot(tupdesc);
+			pd[i]->tupmap = convert_tuples_by_name(RelationGetDescr(parent),
+												   tupdesc,
+								gettext_noop("could not convert row type"));
+		}
+		else
+		{
+			/* Not required for the root partitioned table */
+			pd[i]->tupslot = NULL;
+			pd[i]->tupmap = NULL;
+		}
 		pd[i]->indexes = (int *) palloc(partdesc->nparts * sizeof(int));
 
 		/*
@@ -1610,6 +1645,8 @@ get_partition_for_tuple(PartitionDispatch *pd,
 	{
 		PartitionKey key = parent->key;
 		PartitionDesc partdesc = parent->partdesc;
+		TupleTableSlot *myslot = parent->tupslot;
+		TupleConversionMap *map = parent->tupmap;
 
 		/* Quick exit */
 		if (partdesc->nparts == 0)
@@ -1618,6 +1655,17 @@ get_partition_for_tuple(PartitionDispatch *pd,
 			return -1;
 		}
 
+		if (myslot != NULL)
+		{
+			HeapTuple	tuple = ExecFetchSlotTuple(slot);
+
+			ExecClearTuple(myslot);
+			Assert(map != NULL);
+			tuple = do_convert_tuple(tuple, map);
+			ExecStoreTuple(tuple, myslot, InvalidBuffer, true);
+			slot = myslot;
+		}
+
 		/* Extract partition key from tuple */
 		FormPartitionKeyDatum(parent, slot, estate, values, isnull);
 
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index d5901651db..aa25a23336 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -2435,6 +2435,15 @@ CopyFrom(CopyState cstate)
 	/* Triggers might need a slot as well */
 	estate->es_trig_tuple_slot = ExecInitExtraTupleSlot(estate);
 
+	/*
+	 * Initialize a dedicated slot to manipulate tuples of any given
+	 * partition's rowtype.
+	 */
+	if (cstate->partition_dispatch_info)
+		estate->es_partition_tuple_slot = ExecInitExtraTupleSlot(estate);
+	else
+		estate->es_partition_tuple_slot = NULL;
+
 	/*
 	 * It's more efficient to prepare a bunch of tuples for insertion, and
 	 * insert them in one heap_multi_insert() call, than call heap_insert()
@@ -2484,7 +2493,8 @@ CopyFrom(CopyState cstate)
 
 	for (;;)
 	{
-		TupleTableSlot *slot;
+		TupleTableSlot *slot,
+					   *oldslot = NULL;
 		bool		skip_tuple;
 		Oid			loaded_oid = InvalidOid;
 
@@ -2571,7 +2581,19 @@ CopyFrom(CopyState cstate)
 			map = cstate->partition_tupconv_maps[leaf_part_index];
 			if (map)
 			{
+				Relation	partrel = resultRelInfo->ri_RelationDesc;
+
 				tuple = do_convert_tuple(tuple, map);
+
+				/*
+				 * We must use the partition's tuple descriptor from this
+				 * point on.  Use a dedicated slot from this point on until
+				 * we're finished dealing with the partition.
+				 */
+				oldslot = slot;
+				slot = estate->es_partition_tuple_slot;
+				Assert(slot != NULL);
+				ExecSetSlotDescriptor(slot, RelationGetDescr(partrel));
 				ExecStoreTuple(tuple, slot, InvalidBuffer, true);
 			}
 
@@ -2667,6 +2689,10 @@ CopyFrom(CopyState cstate)
 			{
 				resultRelInfo = saved_resultRelInfo;
 				estate->es_result_relation_info = resultRelInfo;
+
+				/* Switch back to the slot corresponding to the root table */
+				Assert(oldslot != NULL);
+				slot = oldslot;
 			}
 		}
 	}
@@ -2714,13 +2740,14 @@ CopyFrom(CopyState cstate)
 		 * Remember cstate->partition_dispatch_info[0] corresponds to the root
 		 * partitioned table, which we must not try to close, because it is
 		 * the main target table of COPY that will be closed eventually by
-		 * DoCopy().
+		 * DoCopy().  Also, tupslot is NULL for the root partitioned table.
 		 */
 		for (i = 1; i < cstate->num_dispatch; i++)
 		{
 			PartitionDispatch pd = cstate->partition_dispatch_info[i];
 
 			heap_close(pd->reldesc, NoLock);
+			ExecDropSingleTupleTableSlot(pd->tupslot);
 		}
 		for (i = 0; i < cstate->num_partitions; i++)
 		{
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index a9546106ce..0d85b151c2 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -262,6 +262,7 @@ ExecInsert(ModifyTableState *mtstate,
 	Relation	resultRelationDesc;
 	Oid			newId;
 	List	   *recheckIndexes = NIL;
+	TupleTableSlot *oldslot = NULL;
 
 	/*
 	 * get the heap tuple out of the tuple table slot, making sure we have a
@@ -318,7 +319,19 @@ ExecInsert(ModifyTableState *mtstate,
 		map = mtstate->mt_partition_tupconv_maps[leaf_part_index];
 		if (map)
 		{
+			Relation partrel = resultRelInfo->ri_RelationDesc;
+
 			tuple = do_convert_tuple(tuple, map);
+
+			/*
+			 * We must use the partition's tuple descriptor from this
+			 * point on, until we're finished dealing with the partition.
+			 * Use the dedicated slot for that.
+			 */
+			oldslot = slot;
+			slot = estate->es_partition_tuple_slot;
+			Assert(slot != NULL);
+			ExecSetSlotDescriptor(slot, RelationGetDescr(partrel));
 			ExecStoreTuple(tuple, slot, InvalidBuffer, true);
 		}
 	}
@@ -566,6 +579,10 @@ ExecInsert(ModifyTableState *mtstate,
 	{
 		resultRelInfo = saved_resultRelInfo;
 		estate->es_result_relation_info = resultRelInfo;
+
+		/* Switch back to the slot corresponding to the root table */
+		Assert(oldslot != NULL);
+		slot = oldslot;
 	}
 
 	/*
@@ -1734,7 +1751,15 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
 		mtstate->mt_partitions = partitions;
 		mtstate->mt_num_partitions = num_partitions;
 		mtstate->mt_partition_tupconv_maps = partition_tupconv_maps;
+
+		/*
+		 * Initialize a dedicated slot to manipulate tuples of any given
+		 * partition's rowtype.
+		 */
+		estate->es_partition_tuple_slot = ExecInitExtraTupleSlot(estate);
 	}
+	else
+		estate->es_partition_tuple_slot = NULL;
 
 	/*
 	 * Initialize any WITH CHECK OPTION constraints if needed.
@@ -2058,12 +2083,14 @@ ExecEndModifyTable(ModifyTableState *node)
 	 * Remember node->mt_partition_dispatch_info[0] corresponds to the root
 	 * partitioned table, which we must not try to close, because it is the
 	 * main target table of the query that will be closed by ExecEndPlan().
+	 * Also, tupslot is NULL for the root partitioned table.
 	 */
 	for (i = 1; i < node->mt_num_dispatch; i++)
 	{
 		PartitionDispatch pd = node->mt_partition_dispatch_info[i];
 
 		heap_close(pd->reldesc, NoLock);
+		ExecDropSingleTupleTableSlot(pd->tupslot);
 	}
 	for (i = 0; i < node->mt_num_partitions; i++)
 	{
diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h
index 21effbf87b..bf38df5d29 100644
--- a/src/include/catalog/partition.h
+++ b/src/include/catalog/partition.h
@@ -47,6 +47,11 @@ typedef struct PartitionDescData *PartitionDesc;
  *	key			Partition key information of the table
  *	keystate	Execution state required for expressions in the partition key
  *	partdesc	Partition descriptor of the table
+ *	tupslot		A standalone TupleTableSlot initialized with this table's tuple
+ *				descriptor
+ *	tupmap		TupleConversionMap to convert from the parent's rowtype to
+ *				this table's rowtype (when extracting the partition key of a
+ *				tuple just before routing it through this table)
  *	indexes		Array with partdesc->nparts members (for details on what
  *				individual members represent, see how they are set in
  *				RelationGetPartitionDispatchInfo())
@@ -58,6 +63,8 @@ typedef struct PartitionDispatchData
 	PartitionKey			key;
 	List				   *keystate;	/* list of ExprState */
 	PartitionDesc			partdesc;
+	TupleTableSlot		   *tupslot;
+	TupleConversionMap	   *tupmap;
 	int					   *indexes;
 } PartitionDispatchData;
 
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index cc0821bcac..d43ec56a2b 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -384,6 +384,9 @@ typedef struct EState
 	TupleTableSlot *es_trig_oldtup_slot;		/* for TriggerEnabled */
 	TupleTableSlot *es_trig_newtup_slot;		/* for TriggerEnabled */
 
+	/* Slot used to manipulate a tuple after it is routed to a partition */
+	TupleTableSlot *es_partition_tuple_slot;
+
 	/* Parameter info: */
 	ParamListInfo es_param_list_info;	/* values of external params */
 	ParamExecData *es_param_exec_vals;	/* values of internal params */
diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out
index 561cefa3c4..49f667b119 100644
--- a/src/test/regress/expected/insert.out
+++ b/src/test/regress/expected/insert.out
@@ -300,3 +300,40 @@ drop cascades to table part_null
 drop cascades to table part_ee_ff
 drop cascades to table part_ee_ff1
 drop cascades to table part_ee_ff2
+-- more tests for certain multi-level partitioning scenarios
+create table p (a int, b int) partition by range (a, b);
+create table p1 (b int, a int not null) partition by range (b);
+create table p11 (like p1);
+alter table p11 drop a;
+alter table p11 add a int;
+alter table p11 drop a;
+alter table p11 add a int not null;
+-- attnum for key attribute 'a' is different in p, p1, and p11
+select attrelid::regclass, attname, attnum
+from pg_attribute
+where attname = 'a'
+ and (attrelid = 'p'::regclass
+   or attrelid = 'p1'::regclass
+   or attrelid = 'p11'::regclass);
+ attrelid | attname | attnum 
+----------+---------+--------
+ p        | a       |      1
+ p1       | a       |      2
+ p11      | a       |      4
+(3 rows)
+
+alter table p1 attach partition p11 for values from (2) to (5);
+alter table p attach partition p1 for values from (1, 2) to (1, 10);
+-- check that "(1, 2)" is correctly routed to p11.
+insert into p values (1, 2);
+select tableoid::regclass, * from p;
+ tableoid | a | b 
+----------+---+---
+ p11      | 1 | 2
+(1 row)
+
+-- cleanup
+drop table p cascade;
+NOTICE:  drop cascades to 2 other objects
+DETAIL:  drop cascades to table p1
+drop cascades to table p11
diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql
index 846bb5897a..08dc068de8 100644
--- a/src/test/regress/sql/insert.sql
+++ b/src/test/regress/sql/insert.sql
@@ -170,3 +170,29 @@ select tableoid::regclass, * from list_parted;
 -- cleanup
 drop table range_parted cascade;
 drop table list_parted cascade;
+
+-- more tests for certain multi-level partitioning scenarios
+create table p (a int, b int) partition by range (a, b);
+create table p1 (b int, a int not null) partition by range (b);
+create table p11 (like p1);
+alter table p11 drop a;
+alter table p11 add a int;
+alter table p11 drop a;
+alter table p11 add a int not null;
+-- attnum for key attribute 'a' is different in p, p1, and p11
+select attrelid::regclass, attname, attnum
+from pg_attribute
+where attname = 'a'
+ and (attrelid = 'p'::regclass
+   or attrelid = 'p1'::regclass
+   or attrelid = 'p11'::regclass);
+
+alter table p1 attach partition p11 for values from (2) to (5);
+alter table p attach partition p1 for values from (1, 2) to (1, 10);
+
+-- check that "(1, 2)" is correctly routed to p11.
+insert into p values (1, 2);
+select tableoid::regclass, * from p;
+
+-- cleanup
+drop table p cascade;