]> granicus.if.org Git - postgresql/commitdiff
Fix bug in cancellation of non-exclusive backup to avoid assertion failure.
authorFujii Masao <fujii@postgresql.org>
Mon, 18 Dec 2017 18:46:14 +0000 (03:46 +0900)
committerFujii Masao <fujii@postgresql.org>
Mon, 18 Dec 2017 18:49:25 +0000 (03:49 +0900)
Previously an assertion failure occurred when pg_stop_backup() for
non-exclusive backup was aborted while it's waiting for WAL files to
be archived. This assertion failure happened in do_pg_abort_backup()
which was called when a non-exclusive backup was canceled.
do_pg_abort_backup() assumes that there is at least one non-exclusive
backup running when it's called. But pg_stop_backup() can be canceled
even after it marks the end of non-exclusive backup (e.g.,
during waiting for WAL archiving). This broke the assumption that
do_pg_abort_backup() relies on, and which caused an assertion failure.

This commit changes do_pg_abort_backup() so that it does nothing
when non-exclusive backup has been already marked as completed.
That is, the asssumption is also changed, and do_pg_abort_backup()
now can handle even the case where it's called when there is
no running backup.

Backpatch to 9.6 where SQL-callable non-exclusive backup was added.

Author: Masahiko Sawada and Michael Paquier
Reviewed-By: Robert Haas and Fujii Masao
Discussion: https://www.postgresql.org/message-id/CAD21AoD2L1Fu2c==gnVASMyFAAaq3y-AQ2uEVj-zTCGFFjvmDg@mail.gmail.com

src/backend/access/transam/xlog.c
src/backend/replication/basebackup.c

index 85470f0bd65d24968db3e1e227d3603cbf0272ce..1b084a3a752f27a77adba50234a9ea31a52bdb63 100644 (file)
@@ -10275,13 +10275,20 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
        /*
         * Mark that start phase has correctly finished for an exclusive backup.
         * Session-level locks are updated as well to reflect that state.
+        *
+        * Note that CHECK_FOR_INTERRUPTS() must not occur while updating
+        * backup counters and session-level lock. Otherwise they can be
+        * updated inconsistently, and which might cause do_pg_abort_backup()
+        * to fail.
         */
        if (exclusive)
        {
                WALInsertLockAcquireExclusive();
                XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
-               WALInsertLockRelease();
+
+               /* Set session-level lock */
                sessionBackupState = SESSION_BACKUP_EXCLUSIVE;
+               WALInsertLockRelease();
        }
        else
                sessionBackupState = SESSION_BACKUP_NON_EXCLUSIVE;
@@ -10489,7 +10496,11 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
        }
 
        /*
-        * OK to update backup counters and forcePageWrites
+        * OK to update backup counters, forcePageWrites and session-level lock.
+        *
+        * Note that CHECK_FOR_INTERRUPTS() must not occur while updating them.
+        * Otherwise they can be updated inconsistently, and which might cause
+        * do_pg_abort_backup() to fail.
         */
        WALInsertLockAcquireExclusive();
        if (exclusive)
@@ -10513,11 +10524,20 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
        {
                XLogCtl->Insert.forcePageWrites = false;
        }
-       WALInsertLockRelease();
 
-       /* Clean up session-level lock */
+       /*
+        * Clean up session-level lock.
+        *
+        * You might think that WALInsertLockRelease() can be called
+        * before cleaning up session-level lock because session-level
+        * lock doesn't need to be protected with WAL insertion lock.
+        * But since CHECK_FOR_INTERRUPTS() can occur in it,
+        * session-level lock must be cleaned up before it.
+        */
        sessionBackupState = SESSION_BACKUP_NONE;
 
+       WALInsertLockRelease();
+
        /*
         * Read and parse the START WAL LOCATION line (this code is pretty crude,
         * but we are not expecting any variability in the file format).
@@ -10750,8 +10770,16 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
 void
 do_pg_abort_backup(void)
 {
+       /*
+        * Quick exit if session is not keeping around a non-exclusive backup
+        * already started.
+        */
+       if (sessionBackupState == SESSION_BACKUP_NONE)
+               return;
+
        WALInsertLockAcquireExclusive();
        Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
+       Assert(sessionBackupState == SESSION_BACKUP_NON_EXCLUSIVE);
        XLogCtl->Insert.nonExclusiveBackups--;
 
        if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
index 87a753787fd1bb38c053e6885cb319ad16930a9c..53be5d55e49ca4ea8c30eddc4031a94953335b42 100644 (file)
@@ -140,7 +140,7 @@ perform_base_backup(basebackup_options *opt, DIR *tblspcdir)
         * Once do_pg_start_backup has been called, ensure that any failure causes
         * us to abort the backup so we don't "leak" a backup counter. For this
         * reason, *all* functionality between do_pg_start_backup() and
-        * do_pg_stop_backup() should be inside the error cleanup block!
+        * the end of do_pg_stop_backup() should be inside the error cleanup block!
         */
 
        PG_ENSURE_ERROR_CLEANUP(base_backup_cleanup, (Datum) 0);
@@ -249,10 +249,11 @@ perform_base_backup(basebackup_options *opt, DIR *tblspcdir)
                        else
                                pq_putemptymessage('c');                /* CopyDone */
                }
+
+               endptr = do_pg_stop_backup(labelfile->data, !opt->nowait, &endtli);
        }
        PG_END_ENSURE_ERROR_CLEANUP(base_backup_cleanup, (Datum) 0);
 
-       endptr = do_pg_stop_backup(labelfile->data, !opt->nowait, &endtli);
 
        if (opt->includewal)
        {