Allow using the updated tuple while moving it to a different partition.

author Amit Kapila <akapila@postgresql.org>

Thu, 12 Jul 2018 06:47:27 +0000 (12:17 +0530)

committer Amit Kapila <akapila@postgresql.org>

Thu, 12 Jul 2018 06:47:27 +0000 (12:17 +0530)
author Amit Kapila <akapila@postgresql.org>
Thu, 12 Jul 2018 06:47:27 +0000 (12:17 +0530)
committer Amit Kapila <akapila@postgresql.org>
Thu, 12 Jul 2018 06:47:27 +0000 (12:17 +0530)
diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c

index 57519fe8d64751a4ea18d0e43f098272f5a8159d..2436692eb859dc5b84b9b178be375c5e1e62b47d 100644 (file)
--- a/src/backend/commands/trigger.c
+++ b/src/backend/commands/trigger.c
@@ -2726,11 +2726,19 @@ ExecASDeleteTriggers(EState *estate, ResultRelInfo *relinfo,
                                                           false, NULL, NULL, NIL, NULL, transition_capture);
  }
  
+/*
+ * Execute BEFORE ROW DELETE triggers.
+ *
+ * True indicates caller can proceed with the delete.  False indicates caller
+ * need to suppress the delete and additionally if requested, we need to pass
+ * back the concurrently updated tuple if any.
+ */
  bool
  ExecBRDeleteTriggers(EState *estate, EPQState *epqstate,
                                          ResultRelInfo *relinfo,
                                          ItemPointer tupleid,
-                                        HeapTuple fdw_trigtuple)
+                                        HeapTuple fdw_trigtuple,
+                                        TupleTableSlot **epqslot)
  {
         TriggerDesc *trigdesc = relinfo->ri_TrigDesc;
         bool            result = true;
@@ -2747,6 +2755,18 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate,
                                                                            LockTupleExclusive, &newSlot);
                 if (trigtuple == NULL)
                         return false;
+
+               /*
+                * If the tuple was concurrently updated and the caller of this
+                * function requested for the updated tuple, skip the trigger
+                * execution.
+                */
+               if (newSlot != NULL && epqslot != NULL)
+               {
+                       *epqslot = newSlot;
+                       heap_freetuple(trigtuple);
+                       return false;
+               }
         }
         else
                 trigtuple = fdw_trigtuple;
diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c

index fad6df0aeb356b0ecc6aaa00b1e6df0eb9444ce3..9a7dedf5aa22b887e9fd547b4a333f4194b3651e 100644 (file)
--- a/src/backend/executor/execReplication.c
+++ b/src/backend/executor/execReplication.c
@@ -531,7 +531,7 @@ ExecSimpleRelationDelete(EState *estate, EPQState *epqstate,
         {
                 skip_tuple = !ExecBRDeleteTriggers(estate, epqstate, resultRelInfo,
                                                                                    &searchslot->tts_tuple->t_self,
-                                                                                  NULL);
+                                                                                  NULL, NULL);
         }
  
         if (!skip_tuple)
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c

index 7e0b8679717a952822f39d884a62b60012092c7b..779b3d48940850b6df5f7a3b2e3fdc02c62f3f35 100644 (file)
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -609,7 +609,11 @@ ExecInsert(ModifyTableState *mtstate,
   *             foreign table, tupleid is invalid; the FDW has to figure out
   *             which row to delete using data from the planSlot.  oldtuple is
   *             passed to foreign table triggers; it is NULL when the foreign
- *             table has no relevant triggers.
+ *             table has no relevant triggers.  We use tupleDeleted to indicate
+ *             whether the tuple is actually deleted, callers can use it to
+ *             decide whether to continue the operation.  When this DELETE is a
+ *             part of an UPDATE of partition-key, then the slot returned by
+ *             EvalPlanQual() is passed back using output parameter epqslot.
   *
   *             Returns RETURNING result if any, otherwise NULL.
   * ----------------------------------------------------------------
@@ -621,10 +625,11 @@ ExecDelete(ModifyTableState *mtstate,
                    TupleTableSlot *planSlot,
                    EPQState *epqstate,
                    EState *estate,
-                  bool *tupleDeleted,
                    bool processReturning,
                    bool canSetTag,
-                  bool changingPart)
+                  bool changingPart,
+                  bool *tupleDeleted,
+                  TupleTableSlot **epqslot)
  {
         ResultRelInfo *resultRelInfo;
         Relation        resultRelationDesc;
@@ -649,7 +654,7 @@ ExecDelete(ModifyTableState *mtstate,
                 bool            dodelete;
  
                 dodelete = ExecBRDeleteTriggers(estate, epqstate, resultRelInfo,
-                                                                               tupleid, oldtuple);
+                                                                               tupleid, oldtuple, epqslot);
  
                 if (!dodelete)                  /* "do nothing" */
                         return NULL;
@@ -769,19 +774,30 @@ ldelete:;
  
                                 if (!ItemPointerEquals(tupleid, &hufd.ctid))
                                 {
-                                       TupleTableSlot *epqslot;
-
-                                       epqslot = EvalPlanQual(estate,
-                                                                                  epqstate,
-                                                                                  resultRelationDesc,
-                                                                                  resultRelInfo->ri_RangeTableIndex,
-                                                                                  LockTupleExclusive,
-                                                                                  &hufd.ctid,
-                                                                                  hufd.xmax);
-                                       if (!TupIsNull(epqslot))
+                                       TupleTableSlot *my_epqslot;
+
+                                       my_epqslot = EvalPlanQual(estate,
+                                                                                         epqstate,
+                                                                                         resultRelationDesc,
+                                                                                         resultRelInfo->ri_RangeTableIndex,
+                                                                                         LockTupleExclusive,
+                                                                                         &hufd.ctid,
+                                                                                         hufd.xmax);
+                                       if (!TupIsNull(my_epqslot))
                                         {
                                                 *tupleid = hufd.ctid;
-                                               goto ldelete;
+
+                                               /*
+                                                * If requested, skip delete and pass back the updated
+                                                * row.
+                                                */
+                                               if (epqslot)
+                                               {
+                                                       *epqslot = my_epqslot;
+                                                       return NULL;
+                                               }
+                                               else
+                                                       goto ldelete;
                                         }
                                 }
                                 /* tuple already deleted; nothing to do */
@@ -1052,6 +1068,7 @@ lreplace:;
                 {
                         bool            tuple_deleted;
                         TupleTableSlot *ret_slot;
+                       TupleTableSlot *epqslot = NULL;
                         PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
                         int                     map_index;
                         TupleConversionMap *tupconv_map;
@@ -1081,8 +1098,8 @@ lreplace:;
                          * processing. We want to return rows from INSERT.
                          */
                         ExecDelete(mtstate, tupleid, oldtuple, planSlot, epqstate,
-                                          estate, &tuple_deleted, false,
-                                          false /* canSetTag */ , true /* changingPart */ );
+                                          estate, false, false /* canSetTag */ ,
+                                          true /* changingPart */ , &tuple_deleted, &epqslot);
  
                         /*
                          * For some reason if DELETE didn't happen (e.g. trigger prevented
@@ -1105,7 +1122,23 @@ lreplace:;
                          * resurrect it.
                          */
                         if (!tuple_deleted)
-                               return NULL;
+                       {
+                               /*
+                                * epqslot will be typically NULL.  But when ExecDelete()
+                                * finds that another transaction has concurrently updated the
+                                * same row, it re-fetches the row, skips the delete, and
+                                * epqslot is set to the re-fetched tuple slot. In that case,
+                                * we need to do all the checks again.
+                                */
+                               if (TupIsNull(epqslot))
+                                       return NULL;
+                               else
+                               {
+                                       slot = ExecFilterJunk(resultRelInfo->ri_junkFilter, epqslot);
+                                       tuple = ExecMaterializeSlot(slot);
+                                       goto lreplace;
+                               }
+                       }
  
                         /*
                          * Updates set the transition capture map only when a new subplan
@@ -2136,8 +2169,8 @@ ExecModifyTable(PlanState *pstate)
                         case CMD_DELETE:
                                 slot = ExecDelete(node, tupleid, oldtuple, planSlot,
                                                                   &node->mt_epqstate, estate,
-                                                                 NULL, true, node->canSetTag,
-                                                                 false /* changingPart */ );
+                                                                 true, node->canSetTag,
+                                                                 false /* changingPart */ , NULL, NULL);
                                 break;
                         default:
                                 elog(ERROR, "unknown operation");
diff --git a/src/include/commands/trigger.h b/src/include/commands/trigger.h

index a5b8610fa22a1588f0a79fc46de8ca696f73c18f..1031448c1451b7d60e466caa4c5539d2985ef144 100644 (file)
--- a/src/include/commands/trigger.h
+++ b/src/include/commands/trigger.h
@@ -206,7 +206,8 @@ extern bool ExecBRDeleteTriggers(EState *estate,
                                          EPQState *epqstate,
                                          ResultRelInfo *relinfo,
                                          ItemPointer tupleid,
-                                        HeapTuple fdw_trigtuple);
+                                        HeapTuple fdw_trigtuple,
+                                        TupleTableSlot **epqslot);
  extern void ExecARDeleteTriggers(EState *estate,
                                          ResultRelInfo *relinfo,
                                          ItemPointer tupleid,
diff --git a/src/test/isolation/expected/partition-key-update-4.out b/src/test/isolation/expected/partition-key-update-4.out

new file mode 100644 (file)

index 0000000..774a7fa
--- /dev/null
+++ b/src/test/isolation/expected/partition-key-update-4.out
@@ -0,0 +1,60 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s1b s2b s2u1 s1u s2c s1c s1s
+step s1b: BEGIN ISOLATION LEVEL READ COMMITTED;
+step s2b: BEGIN ISOLATION LEVEL READ COMMITTED;
+step s2u1: UPDATE foo SET b = b || ' update2' WHERE a = 1;
+step s1u: UPDATE foo SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; <waiting ...>
+step s2c: COMMIT;
+step s1u: <... completed>
+step s1c: COMMIT;
+step s1s: SELECT tableoid::regclass, * FROM foo ORDER BY a;
+tableoid       a              b              
+
+foo2           2              ABC update2 update1
+
+starting permutation: s1b s2b s2ut1 s1ut s2c s1c s1st s1stl
+step s1b: BEGIN ISOLATION LEVEL READ COMMITTED;
+step s2b: BEGIN ISOLATION LEVEL READ COMMITTED;
+step s2ut1: UPDATE footrg SET b = b || ' update2' WHERE a = 1;
+step s1ut: UPDATE footrg SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; <waiting ...>
+step s2c: COMMIT;
+step s1ut: <... completed>
+step s1c: COMMIT;
+step s1st: SELECT tableoid::regclass, * FROM footrg ORDER BY a;
+tableoid       a              b              
+
+footrg2        2              ABC update2 update1
+step s1stl: SELECT * FROM triglog ORDER BY a;
+a              b              
+
+1              ABC update2 trigger
+
+starting permutation: s1b s2b s2u2 s1u s2c s1c s1s
+step s1b: BEGIN ISOLATION LEVEL READ COMMITTED;
+step s2b: BEGIN ISOLATION LEVEL READ COMMITTED;
+step s2u2: UPDATE foo SET b = 'EFG' WHERE a = 1;
+step s1u: UPDATE foo SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; <waiting ...>
+step s2c: COMMIT;
+step s1u: <... completed>
+step s1c: COMMIT;
+step s1s: SELECT tableoid::regclass, * FROM foo ORDER BY a;
+tableoid       a              b              
+
+foo1           1              EFG            
+
+starting permutation: s1b s2b s2ut2 s1ut s2c s1c s1st s1stl
+step s1b: BEGIN ISOLATION LEVEL READ COMMITTED;
+step s2b: BEGIN ISOLATION LEVEL READ COMMITTED;
+step s2ut2: UPDATE footrg SET b = 'EFG' WHERE a = 1;
+step s1ut: UPDATE footrg SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; <waiting ...>
+step s2c: COMMIT;
+step s1ut: <... completed>
+step s1c: COMMIT;
+step s1st: SELECT tableoid::regclass, * FROM footrg ORDER BY a;
+tableoid       a              b              
+
+footrg1        1              EFG            
+step s1stl: SELECT * FROM triglog ORDER BY a;
+a              b              
+
diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule

index 0e997215a8046ec052d6db688c116f34e5b66e4c..d5594e80e2312b860522fb7e4abc01d5108566d2 100644 (file)
--- a/src/test/isolation/isolation_schedule
+++ b/src/test/isolation/isolation_schedule
@@ -74,4 +74,5 @@ test: predicate-gin-nomatch
  test: partition-key-update-1
  test: partition-key-update-2
  test: partition-key-update-3
+test: partition-key-update-4
  test: plpgsql-toast
diff --git a/src/test/isolation/specs/partition-key-update-4.spec b/src/test/isolation/specs/partition-key-update-4.spec

new file mode 100644 (file)

index 0000000..1d53a7d
--- /dev/null
+++ b/src/test/isolation/specs/partition-key-update-4.spec
@@ -0,0 +1,76 @@
+# Test that a row that ends up in a new partition contains changes made by
+# a concurrent transaction.
+
+setup
+{
+  --
+  -- Setup to test concurrent handling of ExecDelete().
+  --
+  CREATE TABLE foo (a int, b text) PARTITION BY LIST(a);
+  CREATE TABLE foo1 PARTITION OF foo FOR VALUES IN (1);
+  CREATE TABLE foo2 PARTITION OF foo FOR VALUES IN (2);
+  INSERT INTO foo VALUES (1, 'ABC');
+
+  --
+  -- Setup to test concurrent handling of GetTupleForTrigger().
+  --
+  CREATE TABLE footrg (a int, b text) PARTITION BY LIST(a);
+  CREATE TABLE triglog as select * from footrg;
+  CREATE TABLE footrg1 PARTITION OF footrg FOR VALUES IN (1);
+  CREATE TABLE footrg2 PARTITION OF footrg FOR VALUES IN (2);
+  INSERT INTO footrg VALUES (1, 'ABC');
+  CREATE FUNCTION func_footrg() RETURNS TRIGGER AS $$
+  BEGIN
+        OLD.b = OLD.b || ' trigger';
+
+        -- This will verify that the trigger is not run *before* the row is
+        -- refetched by EvalPlanQual. The OLD row should contain the changes made
+        -- by the concurrent session.
+     INSERT INTO triglog select OLD.*;
+
+     RETURN OLD;
+  END $$ LANGUAGE PLPGSQL;
+  CREATE TRIGGER footrg_ondel BEFORE DELETE ON footrg1
+   FOR EACH ROW EXECUTE PROCEDURE func_footrg();
+
+}
+
+teardown
+{
+  DROP TABLE foo;
+  DROP TRIGGER footrg_ondel ON footrg1;
+  DROP FUNCTION func_footrg();
+  DROP TABLE footrg;
+  DROP TABLE triglog;
+}
+
+session "s1"
+step "s1b"      { BEGIN ISOLATION LEVEL READ COMMITTED; }
+step "s1u"   { UPDATE foo SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; }
+step "s1ut"  { UPDATE footrg SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; }
+step "s1s"   { SELECT tableoid::regclass, * FROM foo ORDER BY a; }
+step "s1st"  { SELECT tableoid::regclass, * FROM footrg ORDER BY a; }
+step "s1stl"  { SELECT * FROM triglog ORDER BY a; }
+step "s1c"      { COMMIT; }
+
+session "s2"
+step "s2b"      { BEGIN ISOLATION LEVEL READ COMMITTED; }
+step "s2u1"  { UPDATE foo SET b = b || ' update2' WHERE a = 1; }
+step "s2u2"  { UPDATE foo SET b = 'EFG' WHERE a = 1; }
+step "s2ut1" { UPDATE footrg SET b = b || ' update2' WHERE a = 1; }
+step "s2ut2" { UPDATE footrg SET b = 'EFG' WHERE a = 1; }
+step "s2c"      { COMMIT; }
+
+
+# Session s1 is moving a row into another partition, but is waiting for
+# another session s2 that is updating the original row. The row that ends up
+# in the new partition should contain the changes made by session s2.
+permutation "s1b" "s2b" "s2u1" "s1u" "s2c" "s1c" "s1s"
+
+# Same as above, except, session s1 is waiting in GetTupleTrigger().
+permutation "s1b" "s2b" "s2ut1" "s1ut" "s2c" "s1c" "s1st" "s1stl"
+
+# Below two cases are similar to the above two; except that the session s1
+# fails EvalPlanQual() test, so partition key update does not happen.
+permutation "s1b" "s2b" "s2u2" "s1u" "s2c" "s1c" "s1s"
+permutation "s1b" "s2b" "s2ut2" "s1ut" "s2c" "s1c" "s1st" "s1stl"
author	Amit Kapila <akapila@postgresql.org>
	Thu, 12 Jul 2018 06:47:27 +0000 (12:17 +0530)
committer	Amit Kapila <akapila@postgresql.org>
	Thu, 12 Jul 2018 06:47:27 +0000 (12:17 +0530)
src/backend/commands/trigger.c		patch \| blob \| history
src/backend/executor/execReplication.c		patch \| blob \| history
src/backend/executor/nodeModifyTable.c		patch \| blob \| history
src/include/commands/trigger.h		patch \| blob \| history
src/test/isolation/expected/partition-key-update-4.out	[new file with mode: 0644]	patch \| blob
src/test/isolation/isolation_schedule		patch \| blob \| history
src/test/isolation/specs/partition-key-update-4.spec	[new file with mode: 0644]	patch \| blob