]> granicus.if.org Git - postgresql/commitdiff
PANIC on fsync() failure.
authorThomas Munro <tmunro@postgresql.org>
Mon, 19 Nov 2018 00:37:59 +0000 (13:37 +1300)
committerThomas Munro <tmunro@postgresql.org>
Mon, 19 Nov 2018 00:37:59 +0000 (13:37 +1300)
On some operating systems, it doesn't make sense to retry fsync(),
because dirty data cached by the kernel may have been dropped on
write-back failure.  In that case the only remaining copy of the
data is in the WAL.  A subsequent fsync() could appear to succeed,
but not have flushed the data.  That means that a future checkpoint
could apparently complete successfully but have lost data.

Therefore, violently prevent any future checkpoint attempts by
panicking on the first fsync() failure.  Note that we already
did the same for WAL data; this change extends that behavior to
non-temporary data files.

Provide a GUC data_sync_retry to control this new behavior, for
users of operating systems that don't eject dirty data, and possibly
forensic/testing uses.  If it is set to on and the write-back error
was transient, a later checkpoint might genuinely succeed (on a
system that does not throw away buffers on failure); if the error is
permanent, later checkpoints will continue to fail.  The GUC defaults
to off, meaning that we panic.

Back-patch to all supported releases.

There is still a narrow window for error-loss on some operating
systems: if the file is closed and later reopened and a write-back
error occurs in the intervening time, but the inode has the bad
luck to be evicted due to memory pressure before we reopen, we could
miss the error.  A later patch will address that with a scheme
for keeping files with dirty data open at all times, but we judge
that to be too complicated to back-patch.

Author: Craig Ringer, with some adjustments by Thomas Munro
Reported-by: Craig Ringer
Reviewed-by: Robert Haas, Thomas Munro, Andres Freund
Discussion: https://postgr.es/m/20180427222842.in2e4mibx45zdth5%40alap3.anarazel.de

12 files changed:
doc/src/sgml/config.sgml
src/backend/access/heap/rewriteheap.c
src/backend/access/transam/slru.c
src/backend/access/transam/timeline.c
src/backend/access/transam/xlog.c
src/backend/replication/logical/snapbuild.c
src/backend/storage/file/fd.c
src/backend/storage/smgr/md.c
src/backend/utils/cache/relmapper.c
src/backend/utils/misc/guc.c
src/backend/utils/misc/postgresql.conf.sample
src/include/storage/fd.h

index bd282487a83dbc453c8cf2671e4a3d2c2ac928c5..cb529ae83439134da9537d38d35e46d165146e5b 100644 (file)
@@ -8127,6 +8127,38 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-data-sync-retry" xreflabel="data_sync_retry">
+      <term><varname>data_sync_retry</varname> (<type>boolean</type>)
+      <indexterm>
+       <primary><varname>data_sync_retry</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        When set to false, which is the default, <productname>PostgreSQL</productname>
+        will raise a PANIC-level error on failure to flush modified data files
+        to the filesystem.  This causes the database server to crash.
+       </para>
+       <para>
+        On some operating systems, the status of data in the kernel's page
+        cache is unknown after a write-back failure.  In some cases it might
+        have been entirely forgotten, making it unsafe to retry; the second
+        attempt may be reported as successful, when in fact the data has been
+        lost.  In these circumstances, the only way to avoid data loss is to
+        recover from the WAL after any failure is reported, preferably
+        after investigating the root cause of the failure and replacing any
+        faulty hardware.
+       </para>
+       <para>
+        If set to true, <productname>PostgreSQL</productname> will instead
+        report an error but continue to run so that the data flushing
+        operation can be retried in a later checkpoint.  Only set it to true
+        after investigating the operating system's treatment of buffered data
+        in case of write-back failure.
+       </para>
+      </listitem>
+     </varlistentry>
+
     </variablelist>
 
    </sect1>
index 71277889649938ad2330fa35339880adc60da065..36139600377ca9463fc1066d02560fff039e3e4c 100644 (file)
@@ -978,7 +978,7 @@ logical_end_heap_rewrite(RewriteState state)
        while ((src = (RewriteMappingFile *) hash_seq_search(&seq_status)) != NULL)
        {
                if (FileSync(src->vfd, WAIT_EVENT_LOGICAL_REWRITE_SYNC) != 0)
-                       ereport(ERROR,
+                       ereport(data_sync_elevel(ERROR),
                                        (errcode_for_file_access(),
                                         errmsg("could not fsync file \"%s\": %m", src->path)));
                FileClose(src->vfd);
@@ -1199,7 +1199,7 @@ heap_xlog_logical_rewrite(XLogReaderState *r)
         */
        pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_MAPPING_SYNC);
        if (pg_fsync(fd) != 0)
-               ereport(ERROR,
+               ereport(data_sync_elevel(ERROR),
                                (errcode_for_file_access(),
                                 errmsg("could not fsync file \"%s\": %m", path)));
        pgstat_report_wait_end();
@@ -1298,7 +1298,7 @@ CheckPointLogicalRewriteHeap(void)
                         */
                        pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_CHECKPOINT_SYNC);
                        if (pg_fsync(fd) != 0)
-                               ereport(ERROR,
+                               ereport(data_sync_elevel(ERROR),
                                                (errcode_for_file_access(),
                                                 errmsg("could not fsync file \"%s\": %m", path)));
                        pgstat_report_wait_end();
index 1132eef0384a0767b056cfb9c1339e88bdd72869..fad5d363e32ff9d636371ea74f71b1808d9b8ab3 100644 (file)
@@ -928,7 +928,7 @@ SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid)
                                                           path, offset)));
                        break;
                case SLRU_FSYNC_FAILED:
-                       ereport(ERROR,
+                       ereport(data_sync_elevel(ERROR),
                                        (errcode_for_file_access(),
                                         errmsg("could not access status of transaction %u", xid),
                                         errdetail("Could not fsync file \"%s\": %m.",
index 61d36050c34212d5c289e0d9c05ef05613668c5a..70eec5676eb299977590f58140c20c63025fcd24 100644 (file)
@@ -406,7 +406,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
 
        pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_SYNC);
        if (pg_fsync(fd) != 0)
-               ereport(ERROR,
+               ereport(data_sync_elevel(ERROR),
                                (errcode_for_file_access(),
                                 errmsg("could not fsync file \"%s\": %m", tmppath)));
        pgstat_report_wait_end();
@@ -485,7 +485,7 @@ writeTimeLineHistoryFile(TimeLineID tli, char *content, int size)
 
        pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_FILE_SYNC);
        if (pg_fsync(fd) != 0)
-               ereport(ERROR,
+               ereport(data_sync_elevel(ERROR),
                                (errcode_for_file_access(),
                                 errmsg("could not fsync file \"%s\": %m", tmppath)));
        pgstat_report_wait_end();
index e941d7d253dc863cfd25249b657f37c68028a7d8..bc9024847b0a72a0438fd322cf55e543903d6d8c 100644 (file)
@@ -3468,7 +3468,7 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
 
        pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
        if (pg_fsync(fd) != 0)
-               ereport(ERROR,
+               ereport(data_sync_elevel(ERROR),
                                (errcode_for_file_access(),
                                 errmsg("could not fsync file \"%s\": %m", tmppath)));
        pgstat_report_wait_end();
index ea4541819466d42febcd1a8608dbad110c810059..9e96814f370812c341089b738aa9bdedcfeea7f8 100644 (file)
@@ -1629,6 +1629,9 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn)
         * fsync the file before renaming so that even if we crash after this we
         * have either a fully valid file or nothing.
         *
+        * It's safe to just ERROR on fsync() here because we'll retry the whole
+        * operation including the writes.
+        *
         * TODO: Do the fsync() via checkpoints/restartpoints, doing it here has
         * some noticeable overhead since it's performed synchronously during
         * decoding?
index 8dd51f176749c5d1b2adc9993b5a9a3571ee566c..0f8b1bb4501e58b5c67ce07231c85012930dfd82 100644 (file)
@@ -145,6 +145,8 @@ int                 max_files_per_process = 1000;
  */
 int                    max_safe_fds = 32;      /* default if not changed */
 
+/* Whether it is safe to continue running after fsync() fails. */
+bool           data_sync_retry = false;
 
 /* Debugging.... */
 
@@ -442,11 +444,9 @@ pg_flush_data(int fd, off_t offset, off_t nbytes)
                 */
                rc = sync_file_range(fd, offset, nbytes,
                                                         SYNC_FILE_RANGE_WRITE);
-
-               /* don't error out, this is just a performance optimization */
                if (rc != 0)
                {
-                       ereport(WARNING,
+                       ereport(data_sync_elevel(WARNING),
                                        (errcode_for_file_access(),
                                         errmsg("could not flush dirty data: %m")));
                }
@@ -518,7 +518,7 @@ pg_flush_data(int fd, off_t offset, off_t nbytes)
                        rc = msync(p, (size_t) nbytes, MS_ASYNC);
                        if (rc != 0)
                        {
-                               ereport(WARNING,
+                               ereport(data_sync_elevel(WARNING),
                                                (errcode_for_file_access(),
                                                 errmsg("could not flush dirty data: %m")));
                                /* NB: need to fall through to munmap()! */
@@ -574,7 +574,7 @@ pg_flush_data(int fd, off_t offset, off_t nbytes)
 void
 fsync_fname(const char *fname, bool isdir)
 {
-       fsync_fname_ext(fname, isdir, false, ERROR);
+       fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
 }
 
 /*
@@ -1050,7 +1050,8 @@ LruDelete(File file)
         * to leak the FD than to mess up our internal state.
         */
        if (close(vfdP->fd))
-               elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);
+               elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
+                        "could not close file \"%s\": %m", vfdP->fileName);
        vfdP->fd = VFD_CLOSED;
        --nfile;
 
@@ -1754,7 +1755,14 @@ FileClose(File file)
        {
                /* close the file */
                if (close(vfdP->fd))
-                       elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);
+               {
+                       /*
+                        * We may need to panic on failure to close non-temporary files;
+                        * see LruDelete.
+                        */
+                       elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
+                               "could not close file \"%s\": %m", vfdP->fileName);
+               }
 
                --nfile;
                vfdP->fd = VFD_CLOSED;
@@ -3250,6 +3258,9 @@ looks_like_temp_rel_name(const char *name)
  * harmless cases such as read-only files in the data directory, and that's
  * not good either.
  *
+ * Note that if we previously crashed due to a PANIC on fsync(), we'll be
+ * rewriting all changes again during recovery.
+ *
  * Note we assume we're chdir'd into PGDATA to begin with.
  */
 void
@@ -3572,3 +3583,26 @@ MakePGDirectory(const char *directoryName)
 {
        return mkdir(directoryName, pg_dir_create_mode);
 }
+
+/*
+ * Return the passed-in error level, or PANIC if data_sync_retry is off.
+ *
+ * Failure to fsync any data file is cause for immediate panic, unless
+ * data_sync_retry is enabled.  Data may have been written to the operating
+ * system and removed from our buffer pool already, and if we are running on
+ * an operating system that forgets dirty data on write-back failure, there
+ * may be only one copy of the data remaining: in the WAL.  A later attempt to
+ * fsync again might falsely report success.  Therefore we must not allow any
+ * further checkpoints to be attempted.  data_sync_retry can in theory be
+ * enabled on systems known not to drop dirty buffered data on write-back
+ * failure (with the likely outcome that checkpoints will continue to fail
+ * until the underlying problem is fixed).
+ *
+ * Any code that reports a failure from fsync() or related functions should
+ * filter the error level with this function.
+ */
+int
+data_sync_elevel(int elevel)
+{
+       return data_sync_retry ? elevel : PANIC;
+}
index f19aedbdd0e27b561db6b112e3581b5853e56a19..10e0271473f1583588a7646652d58356642dfbd9 100644 (file)
@@ -1039,7 +1039,7 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
                MdfdVec    *v = &reln->md_seg_fds[forknum][segno - 1];
 
                if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
-                       ereport(ERROR,
+                       ereport(data_sync_elevel(ERROR),
                                        (errcode_for_file_access(),
                                         errmsg("could not fsync file \"%s\": %m",
                                                        FilePathName(v->mdfd_vfd))));
@@ -1284,7 +1284,7 @@ mdsync(void)
                                                        bms_join(new_requests, requests);
 
                                                errno = save_errno;
-                                               ereport(ERROR,
+                                               ereport(data_sync_elevel(ERROR),
                                                                (errcode_for_file_access(),
                                                                 errmsg("could not fsync file \"%s\": %m",
                                                                                path)));
@@ -1458,7 +1458,7 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
                                (errmsg("could not forward fsync request because request queue is full")));
 
                if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
-                       ereport(ERROR,
+                       ereport(data_sync_elevel(ERROR),
                                        (errcode_for_file_access(),
                                         errmsg("could not fsync file \"%s\": %m",
                                                        FilePathName(seg->mdfd_vfd))));
index e3a5e4e386e70e1c56e477f998f51ee431656ad5..da3d9e828c0c241b34ace8857246c092e72c17b6 100644 (file)
@@ -799,7 +799,7 @@ write_relmap_file(bool shared, RelMapFile *newmap,
         */
        pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_SYNC);
        if (pg_fsync(fd) != 0)
-               ereport(ERROR,
+               ereport(data_sync_elevel(ERROR),
                                (errcode_for_file_access(),
                                 errmsg("could not fsync relation mapping file \"%s\": %m",
                                                mapfilename)));
index 7497a282bfb84c057e46b81b6b09ddf31f88ead8..9a7a0b045a9322ad27faa516697a722d29bffeee 100644 (file)
@@ -1824,6 +1824,15 @@ static struct config_bool ConfigureNamesBool[] =
                NULL, NULL, NULL
        },
 
+       {
+               {"data_sync_retry", PGC_POSTMASTER, ERROR_HANDLING_OPTIONS,
+                       gettext_noop("Whether to continue running after a failure to sync data files."),
+               },
+               &data_sync_retry,
+               false,
+               NULL, NULL, NULL
+       },
+
        /* End-of-list marker */
        {
                {NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL
index 14527634bab34474c1c452db239741f5bad0eee9..1404946c410ac50fe5ebeea57cc4f9f78e616fae 100644 (file)
 
 #exit_on_error = off                   # terminate session on any error?
 #restart_after_crash = on              # reinitialize after backend crash?
+#data_sync_retry = off                 # retry or panic on failure to fsync data?
 
 
 #------------------------------------------------------------------------------
index 8e7c9728f4b3c3c969d8ec15617854e51df4bb12..4b9d1312c26c71bc74b804b248cd26b3fba7162e 100644 (file)
@@ -51,6 +51,7 @@ typedef int File;
 
 /* GUC parameter */
 extern PGDLLIMPORT int max_files_per_process;
+extern PGDLLIMPORT bool data_sync_retry;
 
 /*
  * This is private to fd.c, but exported for save/restore_backend_variables()
@@ -138,6 +139,7 @@ extern int  durable_rename(const char *oldfile, const char *newfile, int loglevel
 extern int     durable_unlink(const char *fname, int loglevel);
 extern int     durable_link_or_rename(const char *oldfile, const char *newfile, int loglevel);
 extern void SyncDataDirectory(void);
+extern int data_sync_elevel(int elevel);
 
 /* Filename components */
 #define PG_TEMP_FILES_DIR "pgsql_tmp"