PANIC on fsync() failure.

author Thomas Munro <tmunro@postgresql.org>
Mon, 19 Nov 2018 00:40:57 +0000 (13:40 +1300)

committer Thomas Munro <tmunro@postgresql.org>
Mon, 19 Nov 2018 01:26:28 +0000 (14:26 +1300)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml

index 34a2e295047702b84bd45378d4198c27b56d3a74..32830e09cc125b6f4984e63ff1e022107c940895 100644 (file)
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -6829,6 +6829,38 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir'
        </listitem>
       </varlistentry>
  
+     <varlistentry id="guc-data-sync-retry" xreflabel="data_sync_retry">
+      <term><varname>data_sync_retry</varname> (<type>boolean</type>)
+      <indexterm>
+       <primary><varname>data_sync_retry</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        When set to false, which is the default, <productname>PostgreSQL</productname>
+        will raise a PANIC-level error on failure to flush modified data files
+        to the filesystem.  This causes the database server to crash.
+       </para>
+       <para>
+        On some operating systems, the status of data in the kernel's page
+        cache is unknown after a write-back failure.  In some cases it might
+        have been entirely forgotten, making it unsafe to retry; the second
+        attempt may be reported as successful, when in fact the data has been
+        lost.  In these circumstances, the only way to avoid data loss is to
+        recover from the WAL after any failure is reported, preferably
+        after investigating the root cause of the failure and replacing any
+        faulty hardware.
+       </para>
+       <para>
+        If set to true, <productname>PostgreSQL</productname> will instead
+        report an error but continue to run so that the data flushing
+        operation can be retried in a later checkpoint.  Only set it to true
+        after investigating the operating system's treatment of buffered data
+        in case of write-back failure.
+       </para>
+      </listitem>
+     </varlistentry>
+
      </variablelist>
  
     </sect1>
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c

index b330c6c651f81cc65dcb2a4256ac3b2f4f53d68e..5c206c4c294314289a98ac25721ed0716305ee99 100644 (file)
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -984,7 +984,7 @@ logical_end_heap_rewrite(RewriteState state)
         while ((src = (RewriteMappingFile *) hash_seq_search(&seq_status)) != NULL)
         {
                 if (FileSync(src->vfd) != 0)
-                       ereport(ERROR,
+                       ereport(data_sync_elevel(ERROR),
                                         (errcode_for_file_access(),
                                          errmsg("could not fsync file \"%s\": %m", src->path)));
                 FileClose(src->vfd);
@@ -1202,7 +1202,7 @@ heap_xlog_logical_rewrite(XLogRecPtr lsn, XLogRecord *r)
          * doesn't seem worth the trouble.
          */
         if (pg_fsync(fd) != 0)
-               ereport(ERROR,
+               ereport(data_sync_elevel(ERROR),
                                 (errcode_for_file_access(),
                                  errmsg("could not fsync file \"%s\": %m", path)));
  
@@ -1299,7 +1299,7 @@ CheckPointLogicalRewriteHeap(void)
                          * but it's currently not deemed worth the effort.
                          */
                         else if (pg_fsync(fd) != 0)
-                               ereport(ERROR,
+                               ereport(data_sync_elevel(ERROR),
                                                 (errcode_for_file_access(),
                                                  errmsg("could not fsync file \"%s\": %m", path)));
                         CloseTransientFile(fd);
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c

index bf8aa7b9ea4b408eab52cffab72b31a5ddf94082..c26a4e2b69114cfd096ffa3c30b50b1779958941 100644 (file)
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -903,7 +903,7 @@ SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid)
                                                   path, offset)));
                         break;
                 case SLRU_FSYNC_FAILED:
-                       ereport(ERROR,
+                       ereport(data_sync_elevel(ERROR),
                                         (errcode_for_file_access(),
                                          errmsg("could not access status of transaction %u", xid),
                                          errdetail("Could not fsync file \"%s\": %m.",
diff --git a/src/backend/access/transam/timeline.c b/src/backend/access/transam/timeline.c

index 4c1d4336c7afa513c3ece1d092ea5aa0f5af60a7..203e45d9d0ee545a7053894c9f563bbd2100c472 100644 (file)
--- a/src/backend/access/transam/timeline.c
+++ b/src/backend/access/transam/timeline.c
@@ -402,7 +402,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
         }
  
         if (pg_fsync(fd) != 0)
-               ereport(ERROR,
+               ereport(data_sync_elevel(ERROR),
                                 (errcode_for_file_access(),
                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
  
@@ -478,7 +478,7 @@ writeTimeLineHistoryFile(TimeLineID tli, char *content, int size)
         }
  
         if (pg_fsync(fd) != 0)
-               ereport(ERROR,
+               ereport(data_sync_elevel(ERROR),
                                 (errcode_for_file_access(),
                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
  
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c

index 5e3b6caecf5559ad83305bf4162caa4d7c61cdeb..4c888898b94257dd8daad7dea0aa6e1524044417 100644 (file)
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -3388,7 +3388,7 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno)
         }
  
         if (pg_fsync(fd) != 0)
-               ereport(ERROR,
+               ereport(data_sync_elevel(ERROR),
                                 (errcode_for_file_access(),
                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
  
diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c

index 37a3d77a4c907c2b1e324cb8bc1f7a0e80746b98..8ce99f4af969e34bb4ec44c46bf30a52b5120f9d 100644 (file)
--- a/src/backend/replication/logical/snapbuild.c
+++ b/src/backend/replication/logical/snapbuild.c
@@ -1589,6 +1589,9 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn)
          * fsync the file before renaming so that even if we crash after this we
          * have either a fully valid file or nothing.
          *
+        * It's safe to just ERROR on fsync() here because we'll retry the whole
+        * operation including the writes.
+        *
          * TODO: Do the fsync() via checkpoints/restartpoints, doing it here has
          * some noticeable overhead since it's performed synchronously during
          * decoding?
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c

index 66226cbaffe78f75564608850e7f899fa36bd91a..df13a3e367c8d68a98ea83dfbe5f8944ef567e6a 100644 (file)
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -130,6 +130,8 @@ int                 max_files_per_process = 1000;
   */
  int                    max_safe_fds = 32;      /* default if not changed */
  
+/* Whether it is safe to continue running after fsync() fails. */
+bool           data_sync_retry = false;
  
  /* Debugging.... */
  
@@ -423,7 +425,7 @@ pg_flush_data(int fd, off_t offset, off_t amount)
  void
  fsync_fname(const char *fname, bool isdir)
  {
-       fsync_fname_ext(fname, isdir, false, ERROR);
+       fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
  }
  
  /*
@@ -852,7 +854,8 @@ LruDelete(File file)
          * to leak the FD than to mess up our internal state.
          */
         if (close(vfdP->fd))
-               elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);
+               elog(vfdP->fdstate & FD_TEMPORARY ? LOG : data_sync_elevel(LOG),
+                        "could not close file \"%s\": %m", vfdP->fileName);
         vfdP->fd = VFD_CLOSED;
         --nfile;
  
@@ -1331,7 +1334,14 @@ FileClose(File file)
         {
                 /* close the file */
                 if (close(vfdP->fd))
-                       elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);
+               {
+                       /*
+                        * We may need to panic on failure to close non-temporary files;
+                        * see LruDelete.
+                        */
+                       elog(vfdP->fdstate & FD_TEMPORARY ? LOG : data_sync_elevel(LOG),
+                               "could not close file \"%s\": %m", vfdP->fileName);
+               }
  
                 --nfile;
                 vfdP->fd = VFD_CLOSED;
@@ -2697,6 +2707,9 @@ looks_like_temp_rel_name(const char *name)
   * harmless cases such as read-only files in the data directory, and that's
   * not good either.
   *
+ * Note that if we previously crashed due to a PANIC on fsync(), we'll be
+ * rewriting all changes again during recovery.
+ *
   * Note we assume we're chdir'd into PGDATA to begin with.
   */
  void
@@ -2978,3 +2991,26 @@ fsync_parent_path(const char *fname, int elevel)
  
         return 0;
  }
+
+/*
+ * Return the passed-in error level, or PANIC if data_sync_retry is off.
+ *
+ * Failure to fsync any data file is cause for immediate panic, unless
+ * data_sync_retry is enabled.  Data may have been written to the operating
+ * system and removed from our buffer pool already, and if we are running on
+ * an operating system that forgets dirty data on write-back failure, there
+ * may be only one copy of the data remaining: in the WAL.  A later attempt to
+ * fsync again might falsely report success.  Therefore we must not allow any
+ * further checkpoints to be attempted.  data_sync_retry can in theory be
+ * enabled on systems known not to drop dirty buffered data on write-back
+ * failure (with the likely outcome that checkpoints will continue to fail
+ * until the underlying problem is fixed).
+ *
+ * Any code that reports a failure from fsync() or related functions should
+ * filter the error level with this function.
+ */
+int
+data_sync_elevel(int elevel)
+{
+       return data_sync_retry ? elevel : PANIC;
+}
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c

index 7f50fc96f44ec06012a47df235fc303f983a59e6..b910c35bf3f5502a3d3e77da8f6c19c400037cd5 100644 (file)
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -963,7 +963,7 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
         while (v != NULL)
         {
                 if (FileSync(v->mdfd_vfd) < 0)
-                       ereport(ERROR,
+                       ereport(data_sync_elevel(ERROR),
                                         (errcode_for_file_access(),
                                          errmsg("could not fsync file \"%s\": %m",
                                                         FilePathName(v->mdfd_vfd))));
@@ -1206,7 +1206,7 @@ mdsync(void)
                                                         bms_join(new_requests, requests);
  
                                                 errno = save_errno;
-                                               ereport(ERROR,
+                                               ereport(data_sync_elevel(ERROR),
                                                                 (errcode_for_file_access(),
                                                                  errmsg("could not fsync file \"%s\": %m",
                                                                                 path)));
@@ -1380,7 +1380,7 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
                                 (errmsg("could not forward fsync request because request queue is full")));
  
                 if (FileSync(seg->mdfd_vfd) < 0)
-                       ereport(ERROR,
+                       ereport(data_sync_elevel(ERROR),
                                         (errcode_for_file_access(),
                                          errmsg("could not fsync file \"%s\": %m",
                                                         FilePathName(seg->mdfd_vfd))));
diff --git a/src/backend/utils/cache/relmapper.c b/src/backend/utils/cache/relmapper.c

index 95a2689fd4263de4b91b855095f028a1e5742572..334a19d8adb6001450f328c11e30636da8799195 100644 (file)
--- a/src/backend/utils/cache/relmapper.c
+++ b/src/backend/utils/cache/relmapper.c
@@ -796,7 +796,7 @@ write_relmap_file(bool shared, RelMapFile *newmap,
          * CheckPointRelationMap.
          */
         if (pg_fsync(fd) != 0)
-               ereport(ERROR,
+               ereport(data_sync_elevel(ERROR),
                                 (errcode_for_file_access(),
                                  errmsg("could not fsync relation mapping file \"%s\": %m",
                                                 mapfilename)));
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c

index b7e1603f8bdfbd2e98c7dcb6afe1d121d9dee9d6..cbdcdfe0423ef0a03e17e084e58972cf0c0cee81 100644 (file)
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1512,6 +1512,15 @@ static struct config_bool ConfigureNamesBool[] =
                 NULL, NULL, NULL
         },
  
+       {
+               {"data_sync_retry", PGC_POSTMASTER, ERROR_HANDLING_OPTIONS,
+                       gettext_noop("Whether to continue running after a failure to sync data files."),
+               },
+               &data_sync_retry,
+               false,
+               NULL, NULL, NULL
+       },
+
         /* End-of-list marker */
         {
                 {NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample

index 93917b5edd00847e7d6b1a9a04d1bfadc3582b79..092284423e0df3db787715423453594b8f4f19aa 100644 (file)
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -587,6 +587,7 @@
  
  #exit_on_error = off                   # terminate session on any error?
  #restart_after_crash = on              # reinitialize after backend crash?
+#data_sync_retry = off                 # retry instead of PANIC on failure to fsync data?
  
  
  #------------------------------------------------------------------------------
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h

index c286d06d64c2195b76aaed454179449343c0081f..1015e25b04e0b35aacd0be0e57a0c083e24db47f 100644 (file)
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -53,6 +53,7 @@ typedef int File;
  
  /* GUC parameter */
  extern PGDLLIMPORT int max_files_per_process;
+extern PGDLLIMPORT bool data_sync_retry;
  
  /*
   * This is private to fd.c, but exported for save/restore_backend_variables()
@@ -119,6 +120,7 @@ extern void fsync_fname(const char *fname, bool isdir);
  extern int     durable_rename(const char *oldfile, const char *newfile, int loglevel);
  extern int     durable_link_or_rename(const char *oldfile, const char *newfile, int loglevel);
  extern void SyncDataDirectory(void);
+extern int data_sync_elevel(int elevel);
  
  /* Filename components for OpenTemporaryFile */
  #define PG_TEMP_FILES_DIR "pgsql_tmp"
author	Thomas Munro <tmunro@postgresql.org>
	Mon, 19 Nov 2018 00:40:57 +0000 (13:40 +1300)
committer	Thomas Munro <tmunro@postgresql.org>
	Mon, 19 Nov 2018 01:26:28 +0000 (14:26 +1300)
doc/src/sgml/config.sgml		patch | blob | history
src/backend/access/heap/rewriteheap.c		patch | blob | history
src/backend/access/transam/slru.c		patch | blob | history
src/backend/access/transam/timeline.c		patch | blob | history
src/backend/access/transam/xlog.c		patch | blob | history
src/backend/replication/logical/snapbuild.c		patch | blob | history
src/backend/storage/file/fd.c		patch | blob | history
src/backend/storage/smgr/md.c		patch | blob | history
src/backend/utils/cache/relmapper.c		patch | blob | history
src/backend/utils/misc/guc.c		patch | blob | history
src/backend/utils/misc/postgresql.conf.sample		patch | blob | history
src/include/storage/fd.h		patch | blob | history