]> granicus.if.org Git - postgresql/commitdiff
Allow using the updated tuple while moving it to a different partition.
authorAmit Kapila <akapila@postgresql.org>
Thu, 12 Jul 2018 06:47:27 +0000 (12:17 +0530)
committerAmit Kapila <akapila@postgresql.org>
Thu, 12 Jul 2018 06:47:27 +0000 (12:17 +0530)
An update that causes the tuple to be moved to a different partition was
missing out on re-constructing the to-be-updated tuple, based on the latest
tuple in the update chain.  Instead, it's simply deleting the latest tuple
and inserting a new tuple in the new partition based on the old tuple.
Commit 2f17844104 didn't consider this case, so some of the updates were
getting lost.

In passing, change the argument order for output parameter in ExecDelete
and add some commentary about it.

Reported-by: Pavan Deolasee
Author: Amit Khandekar, with minor changes by me
Reviewed-by: Dilip Kumar, Amit Kapila and Alvaro Herrera
Backpatch-through: 11
Discussion: https://postgr.es/m/CAJ3gD9fRbEzDqdeDq1jxqZUb47kJn+tQ7=Bcgjc8quqKsDViKQ@mail.gmail.com

src/backend/commands/trigger.c
src/backend/executor/execReplication.c
src/backend/executor/nodeModifyTable.c
src/include/commands/trigger.h
src/test/isolation/expected/partition-key-update-4.out [new file with mode: 0644]
src/test/isolation/isolation_schedule
src/test/isolation/specs/partition-key-update-4.spec [new file with mode: 0644]

index 57519fe8d64751a4ea18d0e43f098272f5a8159d..2436692eb859dc5b84b9b178be375c5e1e62b47d 100644 (file)
@@ -2726,11 +2726,19 @@ ExecASDeleteTriggers(EState *estate, ResultRelInfo *relinfo,
                                                          false, NULL, NULL, NIL, NULL, transition_capture);
 }
 
+/*
+ * Execute BEFORE ROW DELETE triggers.
+ *
+ * True indicates caller can proceed with the delete.  False indicates caller
+ * need to suppress the delete and additionally if requested, we need to pass
+ * back the concurrently updated tuple if any.
+ */
 bool
 ExecBRDeleteTriggers(EState *estate, EPQState *epqstate,
                                         ResultRelInfo *relinfo,
                                         ItemPointer tupleid,
-                                        HeapTuple fdw_trigtuple)
+                                        HeapTuple fdw_trigtuple,
+                                        TupleTableSlot **epqslot)
 {
        TriggerDesc *trigdesc = relinfo->ri_TrigDesc;
        bool            result = true;
@@ -2747,6 +2755,18 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate,
                                                                           LockTupleExclusive, &newSlot);
                if (trigtuple == NULL)
                        return false;
+
+               /*
+                * If the tuple was concurrently updated and the caller of this
+                * function requested for the updated tuple, skip the trigger
+                * execution.
+                */
+               if (newSlot != NULL && epqslot != NULL)
+               {
+                       *epqslot = newSlot;
+                       heap_freetuple(trigtuple);
+                       return false;
+               }
        }
        else
                trigtuple = fdw_trigtuple;
index fad6df0aeb356b0ecc6aaa00b1e6df0eb9444ce3..9a7dedf5aa22b887e9fd547b4a333f4194b3651e 100644 (file)
@@ -531,7 +531,7 @@ ExecSimpleRelationDelete(EState *estate, EPQState *epqstate,
        {
                skip_tuple = !ExecBRDeleteTriggers(estate, epqstate, resultRelInfo,
                                                                                   &searchslot->tts_tuple->t_self,
-                                                                                  NULL);
+                                                                                  NULL, NULL);
        }
 
        if (!skip_tuple)
index 7e0b8679717a952822f39d884a62b60012092c7b..779b3d48940850b6df5f7a3b2e3fdc02c62f3f35 100644 (file)
@@ -609,7 +609,11 @@ ExecInsert(ModifyTableState *mtstate,
  *             foreign table, tupleid is invalid; the FDW has to figure out
  *             which row to delete using data from the planSlot.  oldtuple is
  *             passed to foreign table triggers; it is NULL when the foreign
- *             table has no relevant triggers.
+ *             table has no relevant triggers.  We use tupleDeleted to indicate
+ *             whether the tuple is actually deleted, callers can use it to
+ *             decide whether to continue the operation.  When this DELETE is a
+ *             part of an UPDATE of partition-key, then the slot returned by
+ *             EvalPlanQual() is passed back using output parameter epqslot.
  *
  *             Returns RETURNING result if any, otherwise NULL.
  * ----------------------------------------------------------------
@@ -621,10 +625,11 @@ ExecDelete(ModifyTableState *mtstate,
                   TupleTableSlot *planSlot,
                   EPQState *epqstate,
                   EState *estate,
-                  bool *tupleDeleted,
                   bool processReturning,
                   bool canSetTag,
-                  bool changingPart)
+                  bool changingPart,
+                  bool *tupleDeleted,
+                  TupleTableSlot **epqslot)
 {
        ResultRelInfo *resultRelInfo;
        Relation        resultRelationDesc;
@@ -649,7 +654,7 @@ ExecDelete(ModifyTableState *mtstate,
                bool            dodelete;
 
                dodelete = ExecBRDeleteTriggers(estate, epqstate, resultRelInfo,
-                                                                               tupleid, oldtuple);
+                                                                               tupleid, oldtuple, epqslot);
 
                if (!dodelete)                  /* "do nothing" */
                        return NULL;
@@ -769,19 +774,30 @@ ldelete:;
 
                                if (!ItemPointerEquals(tupleid, &hufd.ctid))
                                {
-                                       TupleTableSlot *epqslot;
-
-                                       epqslot = EvalPlanQual(estate,
-                                                                                  epqstate,
-                                                                                  resultRelationDesc,
-                                                                                  resultRelInfo->ri_RangeTableIndex,
-                                                                                  LockTupleExclusive,
-                                                                                  &hufd.ctid,
-                                                                                  hufd.xmax);
-                                       if (!TupIsNull(epqslot))
+                                       TupleTableSlot *my_epqslot;
+
+                                       my_epqslot = EvalPlanQual(estate,
+                                                                                         epqstate,
+                                                                                         resultRelationDesc,
+                                                                                         resultRelInfo->ri_RangeTableIndex,
+                                                                                         LockTupleExclusive,
+                                                                                         &hufd.ctid,
+                                                                                         hufd.xmax);
+                                       if (!TupIsNull(my_epqslot))
                                        {
                                                *tupleid = hufd.ctid;
-                                               goto ldelete;
+
+                                               /*
+                                                * If requested, skip delete and pass back the updated
+                                                * row.
+                                                */
+                                               if (epqslot)
+                                               {
+                                                       *epqslot = my_epqslot;
+                                                       return NULL;
+                                               }
+                                               else
+                                                       goto ldelete;
                                        }
                                }
                                /* tuple already deleted; nothing to do */
@@ -1052,6 +1068,7 @@ lreplace:;
                {
                        bool            tuple_deleted;
                        TupleTableSlot *ret_slot;
+                       TupleTableSlot *epqslot = NULL;
                        PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
                        int                     map_index;
                        TupleConversionMap *tupconv_map;
@@ -1081,8 +1098,8 @@ lreplace:;
                         * processing. We want to return rows from INSERT.
                         */
                        ExecDelete(mtstate, tupleid, oldtuple, planSlot, epqstate,
-                                          estate, &tuple_deleted, false,
-                                          false /* canSetTag */ , true /* changingPart */ );
+                                          estate, false, false /* canSetTag */ ,
+                                          true /* changingPart */ , &tuple_deleted, &epqslot);
 
                        /*
                         * For some reason if DELETE didn't happen (e.g. trigger prevented
@@ -1105,7 +1122,23 @@ lreplace:;
                         * resurrect it.
                         */
                        if (!tuple_deleted)
-                               return NULL;
+                       {
+                               /*
+                                * epqslot will be typically NULL.  But when ExecDelete()
+                                * finds that another transaction has concurrently updated the
+                                * same row, it re-fetches the row, skips the delete, and
+                                * epqslot is set to the re-fetched tuple slot. In that case,
+                                * we need to do all the checks again.
+                                */
+                               if (TupIsNull(epqslot))
+                                       return NULL;
+                               else
+                               {
+                                       slot = ExecFilterJunk(resultRelInfo->ri_junkFilter, epqslot);
+                                       tuple = ExecMaterializeSlot(slot);
+                                       goto lreplace;
+                               }
+                       }
 
                        /*
                         * Updates set the transition capture map only when a new subplan
@@ -2136,8 +2169,8 @@ ExecModifyTable(PlanState *pstate)
                        case CMD_DELETE:
                                slot = ExecDelete(node, tupleid, oldtuple, planSlot,
                                                                  &node->mt_epqstate, estate,
-                                                                 NULL, true, node->canSetTag,
-                                                                 false /* changingPart */ );
+                                                                 true, node->canSetTag,
+                                                                 false /* changingPart */ , NULL, NULL);
                                break;
                        default:
                                elog(ERROR, "unknown operation");
index a5b8610fa22a1588f0a79fc46de8ca696f73c18f..1031448c1451b7d60e466caa4c5539d2985ef144 100644 (file)
@@ -206,7 +206,8 @@ extern bool ExecBRDeleteTriggers(EState *estate,
                                         EPQState *epqstate,
                                         ResultRelInfo *relinfo,
                                         ItemPointer tupleid,
-                                        HeapTuple fdw_trigtuple);
+                                        HeapTuple fdw_trigtuple,
+                                        TupleTableSlot **epqslot);
 extern void ExecARDeleteTriggers(EState *estate,
                                         ResultRelInfo *relinfo,
                                         ItemPointer tupleid,
diff --git a/src/test/isolation/expected/partition-key-update-4.out b/src/test/isolation/expected/partition-key-update-4.out
new file mode 100644 (file)
index 0000000..774a7fa
--- /dev/null
@@ -0,0 +1,60 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s1b s2b s2u1 s1u s2c s1c s1s
+step s1b: BEGIN ISOLATION LEVEL READ COMMITTED;
+step s2b: BEGIN ISOLATION LEVEL READ COMMITTED;
+step s2u1: UPDATE foo SET b = b || ' update2' WHERE a = 1;
+step s1u: UPDATE foo SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; <waiting ...>
+step s2c: COMMIT;
+step s1u: <... completed>
+step s1c: COMMIT;
+step s1s: SELECT tableoid::regclass, * FROM foo ORDER BY a;
+tableoid       a              b              
+
+foo2           2              ABC update2 update1
+
+starting permutation: s1b s2b s2ut1 s1ut s2c s1c s1st s1stl
+step s1b: BEGIN ISOLATION LEVEL READ COMMITTED;
+step s2b: BEGIN ISOLATION LEVEL READ COMMITTED;
+step s2ut1: UPDATE footrg SET b = b || ' update2' WHERE a = 1;
+step s1ut: UPDATE footrg SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; <waiting ...>
+step s2c: COMMIT;
+step s1ut: <... completed>
+step s1c: COMMIT;
+step s1st: SELECT tableoid::regclass, * FROM footrg ORDER BY a;
+tableoid       a              b              
+
+footrg2        2              ABC update2 update1
+step s1stl: SELECT * FROM triglog ORDER BY a;
+a              b              
+
+1              ABC update2 trigger
+
+starting permutation: s1b s2b s2u2 s1u s2c s1c s1s
+step s1b: BEGIN ISOLATION LEVEL READ COMMITTED;
+step s2b: BEGIN ISOLATION LEVEL READ COMMITTED;
+step s2u2: UPDATE foo SET b = 'EFG' WHERE a = 1;
+step s1u: UPDATE foo SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; <waiting ...>
+step s2c: COMMIT;
+step s1u: <... completed>
+step s1c: COMMIT;
+step s1s: SELECT tableoid::regclass, * FROM foo ORDER BY a;
+tableoid       a              b              
+
+foo1           1              EFG            
+
+starting permutation: s1b s2b s2ut2 s1ut s2c s1c s1st s1stl
+step s1b: BEGIN ISOLATION LEVEL READ COMMITTED;
+step s2b: BEGIN ISOLATION LEVEL READ COMMITTED;
+step s2ut2: UPDATE footrg SET b = 'EFG' WHERE a = 1;
+step s1ut: UPDATE footrg SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; <waiting ...>
+step s2c: COMMIT;
+step s1ut: <... completed>
+step s1c: COMMIT;
+step s1st: SELECT tableoid::regclass, * FROM footrg ORDER BY a;
+tableoid       a              b              
+
+footrg1        1              EFG            
+step s1stl: SELECT * FROM triglog ORDER BY a;
+a              b              
+
index 0e997215a8046ec052d6db688c116f34e5b66e4c..d5594e80e2312b860522fb7e4abc01d5108566d2 100644 (file)
@@ -74,4 +74,5 @@ test: predicate-gin-nomatch
 test: partition-key-update-1
 test: partition-key-update-2
 test: partition-key-update-3
+test: partition-key-update-4
 test: plpgsql-toast
diff --git a/src/test/isolation/specs/partition-key-update-4.spec b/src/test/isolation/specs/partition-key-update-4.spec
new file mode 100644 (file)
index 0000000..1d53a7d
--- /dev/null
@@ -0,0 +1,76 @@
+# Test that a row that ends up in a new partition contains changes made by
+# a concurrent transaction.
+
+setup
+{
+  --
+  -- Setup to test concurrent handling of ExecDelete().
+  --
+  CREATE TABLE foo (a int, b text) PARTITION BY LIST(a);
+  CREATE TABLE foo1 PARTITION OF foo FOR VALUES IN (1);
+  CREATE TABLE foo2 PARTITION OF foo FOR VALUES IN (2);
+  INSERT INTO foo VALUES (1, 'ABC');
+
+  --
+  -- Setup to test concurrent handling of GetTupleForTrigger().
+  --
+  CREATE TABLE footrg (a int, b text) PARTITION BY LIST(a);
+  CREATE TABLE triglog as select * from footrg;
+  CREATE TABLE footrg1 PARTITION OF footrg FOR VALUES IN (1);
+  CREATE TABLE footrg2 PARTITION OF footrg FOR VALUES IN (2);
+  INSERT INTO footrg VALUES (1, 'ABC');
+  CREATE FUNCTION func_footrg() RETURNS TRIGGER AS $$
+  BEGIN
+        OLD.b = OLD.b || ' trigger';
+
+        -- This will verify that the trigger is not run *before* the row is
+        -- refetched by EvalPlanQual. The OLD row should contain the changes made
+        -- by the concurrent session.
+     INSERT INTO triglog select OLD.*;
+
+     RETURN OLD;
+  END $$ LANGUAGE PLPGSQL;
+  CREATE TRIGGER footrg_ondel BEFORE DELETE ON footrg1
+   FOR EACH ROW EXECUTE PROCEDURE func_footrg();
+
+}
+
+teardown
+{
+  DROP TABLE foo;
+  DROP TRIGGER footrg_ondel ON footrg1;
+  DROP FUNCTION func_footrg();
+  DROP TABLE footrg;
+  DROP TABLE triglog;
+}
+
+session "s1"
+step "s1b"      { BEGIN ISOLATION LEVEL READ COMMITTED; }
+step "s1u"   { UPDATE foo SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; }
+step "s1ut"  { UPDATE footrg SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; }
+step "s1s"   { SELECT tableoid::regclass, * FROM foo ORDER BY a; }
+step "s1st"  { SELECT tableoid::regclass, * FROM footrg ORDER BY a; }
+step "s1stl"  { SELECT * FROM triglog ORDER BY a; }
+step "s1c"      { COMMIT; }
+
+session "s2"
+step "s2b"      { BEGIN ISOLATION LEVEL READ COMMITTED; }
+step "s2u1"  { UPDATE foo SET b = b || ' update2' WHERE a = 1; }
+step "s2u2"  { UPDATE foo SET b = 'EFG' WHERE a = 1; }
+step "s2ut1" { UPDATE footrg SET b = b || ' update2' WHERE a = 1; }
+step "s2ut2" { UPDATE footrg SET b = 'EFG' WHERE a = 1; }
+step "s2c"      { COMMIT; }
+
+
+# Session s1 is moving a row into another partition, but is waiting for
+# another session s2 that is updating the original row. The row that ends up
+# in the new partition should contain the changes made by session s2.
+permutation "s1b" "s2b" "s2u1" "s1u" "s2c" "s1c" "s1s"
+
+# Same as above, except, session s1 is waiting in GetTupleTrigger().
+permutation "s1b" "s2b" "s2ut1" "s1ut" "s2c" "s1c" "s1st" "s1stl"
+
+# Below two cases are similar to the above two; except that the session s1
+# fails EvalPlanQual() test, so partition key update does not happen.
+permutation "s1b" "s2b" "s2u2" "s1u" "s2c" "s1c" "s1s"
+permutation "s1b" "s2b" "s2ut2" "s1ut" "s2c" "s1c" "s1st" "s1stl"