]> granicus.if.org Git - postgresql/commitdiff
Fix fsync-at-startup code to not treat errors as fatal.
authorTom Lane <tgl@sss.pgh.pa.us>
Thu, 28 May 2015 21:33:03 +0000 (17:33 -0400)
committerTom Lane <tgl@sss.pgh.pa.us>
Thu, 28 May 2015 21:33:03 +0000 (17:33 -0400)
Commit 2ce439f3379aed857517c8ce207485655000fc8e introduced a rather serious
regression, namely that if its scan of the data directory came across any
un-fsync-able files, it would fail and thereby prevent database startup.
Worse yet, symlinks to such files also caused the problem, which meant that
crash restart was guaranteed to fail on certain common installations such
as older Debian.

After discussion, we agreed that (1) failure to start is worse than any
consequence of not fsync'ing is likely to be, therefore treat all errors
in this code as nonfatal; (2) we should not chase symlinks other than
those that are expected to exist, namely pg_xlog/ and tablespace links
under pg_tblspc/.  The latter restriction avoids possibly fsync'ing a
much larger part of the filesystem than intended, if the user has left
random symlinks hanging about in the data directory.

This commit takes care of that and also does some code beautification,
mainly moving the relevant code into fd.c, which seems a much better place
for it than xlog.c, and making sure that the conditional compilation for
the pre_sync_fname pass has something to do with whether pg_flush_data
works.

I also relocated the call site in xlog.c down a few lines; it seems a
bit silly to be doing this before ValidateXLOGDirectoryStructure().

The similar logic in initdb.c ought to be made to match this, but that
change is noncritical and will be dealt with separately.

Back-patch to all active branches, like the prior commit.

Abhijit Menon-Sen and Tom Lane

src/backend/access/transam/xlog.c
src/backend/storage/file/fd.c
src/include/storage/fd.h

index f02e812d70b0761ad8b07f7e8f5fad99aca3326b..663c56c1c9f8d99e77a07d9b419ccf231e8a8d4a 100644 (file)
@@ -628,8 +628,6 @@ static bool read_backup_label(XLogRecPtr *checkPointLoc);
 static void rm_redo_error_callback(void *arg);
 static int     get_sync_bit(int method);
 
-static void fsync_pgdata(char *datadir);
-
 /*
  * Insert an XLOG record having the specified RMID and info bytes,
  * with the body of the record being the data chunk(s) described by
@@ -5925,18 +5923,6 @@ StartupXLOG(void)
                          (errmsg("database system was interrupted; last known up at %s",
                                          str_time(ControlFile->time))));
 
-       /*
-        * If we previously crashed, there might be data which we had written,
-        * intending to fsync it, but which we had not actually fsync'd yet.
-        * Therefore, a power failure in the near future might cause earlier
-        * unflushed writes to be lost, even though more recent data written to
-        * disk from here on would be persisted.  To avoid that, fsync the entire
-        * data directory.
-        */
-       if (ControlFile->state != DB_SHUTDOWNED &&
-               ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
-               fsync_pgdata(data_directory);
-
        /* This is just to allow attaching to startup process with a debugger */
 #ifdef XLOG_REPLAY_DELAY
        if (ControlFile->state != DB_SHUTDOWNED)
@@ -5960,6 +5946,18 @@ StartupXLOG(void)
         */
        RelationCacheInitFileRemove();
 
+       /*
+        * If we previously crashed, there might be data which we had written,
+        * intending to fsync it, but which we had not actually fsync'd yet.
+        * Therefore, a power failure in the near future might cause earlier
+        * unflushed writes to be lost, even though more recent data written to
+        * disk from here on would be persisted.  To avoid that, fsync the entire
+        * data directory.
+        */
+       if (ControlFile->state != DB_SHUTDOWNED &&
+               ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
+               SyncDataDirectory();
+
        /*
         * Initialize on the assumption we want to recover to the same timeline
         * that's active according to pg_control.
@@ -9980,31 +9978,3 @@ CheckForStandbyTrigger(void)
        }
        return false;
 }
-
-/*
- * Issue fsync recursively on PGDATA and all its contents.
- */
-static void
-fsync_pgdata(char *datadir)
-{
-       if (!enableFsync)
-               return;
-
-       /*
-        * If possible, hint to the kernel that we're soon going to fsync
-        * the data directory and its contents.
-        */
-#if defined(HAVE_SYNC_FILE_RANGE) || \
-       (defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
-       walkdir(datadir, pre_sync_fname);
-#endif
-
-       /*
-        * Now we do the fsync()s in the same order.
-        *
-        * It's important to fsync the destination directory itself as individual
-        * file fsyncs don't guarantee that the directory entry for the file is
-        * synced.
-        */
-       walkdir(datadir, fsync_fname);
-}
index 89b108356062296b08419b472dd5907d16e512c6..7aaed00839cc5485ed2405252c2d0a0d5b9b9457 100644 (file)
 #include "utils/resowner.h"
 
 
+/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
+#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
+#define PG_FLUSH_DATA_WORKS 1
+#endif
+
 /*
  * We must leave some file descriptors free for system(), the dynamic loader,
  * and other code that tries to open files without consulting fd.c.  This
@@ -252,10 +257,21 @@ static void FreeVfd(File file);
 
 static int     FileAccess(File file);
 static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
+static struct dirent *ReadDirExtended(DIR *dir, const char *dirname, int elevel);
+
 static void AtProcExit_Files(int code, Datum arg);
 static void CleanupTempFiles(bool isProcExit);
 static void RemovePgTempFilesInDir(const char *tmpdirname);
 
+static void walkdir(const char *path,
+               void (*action) (const char *fname, bool isdir, int elevel),
+               bool process_symlinks,
+               int elevel);
+#ifdef PG_FLUSH_DATA_WORKS
+static void pre_sync_fname(const char *fname, bool isdir, int elevel);
+#endif
+static void fsync_fname_ext(const char *fname, bool isdir, int elevel);
+
 
 /*
  * pg_fsync --- do fsync with or without writethrough
@@ -331,15 +347,24 @@ pg_fdatasync(int fd)
  * pg_flush_data --- advise OS that the data described won't be needed soon
  *
  * Not all platforms have posix_fadvise; treat as noop if not available.
+ * Also, treat as no-op if enableFsync is off; this is because the call isn't
+ * free, and some platforms such as Linux will actually block the requestor
+ * until the write is scheduled.
  */
 int
 pg_flush_data(int fd, off_t offset, off_t amount)
 {
+#ifdef PG_FLUSH_DATA_WORKS
+       if (enableFsync)
+       {
 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
-       return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED);
+               return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED);
 #else
-       return 0;
+#error PG_FLUSH_DATA_WORKS should not have been defined
 #endif
+       }
+#endif
+       return 0;
 }
 
 
@@ -1599,15 +1624,28 @@ TryAgain:
  */
 struct dirent *
 ReadDir(DIR *dir, const char *dirname)
+{
+       return ReadDirExtended(dir, dirname, ERROR);
+}
+
+/*
+ * Alternate version that allows caller to specify the elevel for any
+ * error report.  If elevel < ERROR, returns NULL on any error.
+ */
+static struct dirent *
+ReadDirExtended(DIR *dir, const char *dirname, int elevel)
 {
        struct dirent *dent;
 
        /* Give a generic message for AllocateDir failure, if caller didn't */
        if (dir == NULL)
-               ereport(ERROR,
+       {
+               ereport(elevel,
                                (errcode_for_file_access(),
                                 errmsg("could not open directory \"%s\": %m",
                                                dirname)));
+               return NULL;
+       }
 
        errno = 0;
        if ((dent = readdir(dir)) != NULL)
@@ -1620,7 +1658,7 @@ ReadDir(DIR *dir, const char *dirname)
 #endif
 
        if (errno)
-               ereport(ERROR,
+               ereport(elevel,
                                (errcode_for_file_access(),
                                 errmsg("could not read directory \"%s\": %m",
                                                dirname)));
@@ -2005,54 +2043,121 @@ fsync_fname(char *fname, bool isdir)
        close(fd);
 }
 
+
 /*
- * Hint to the OS that it should get ready to fsync() this file.
+ * Issue fsync recursively on PGDATA and all its contents.
+ *
+ * We fsync regular files and directories wherever they are, but we
+ * follow symlinks only for pg_xlog and immediately under pg_tblspc.
+ * Other symlinks are presumed to point at files we're not responsible
+ * for fsyncing, and might not have privileges to write at all.
+ *
+ * Errors are logged but not considered fatal; that's because this is used
+ * only during database startup, to deal with the possibility that there are
+ * issued-but-unsynced writes pending against the data directory.  We want to
+ * ensure that such writes reach disk before anything that's done in the new
+ * run.  However, aborting on error would result in failure to start for
+ * harmless cases such as read-only files in the data directory, and that's
+ * not good either.
  *
- * Adapted from pre_sync_fname in initdb.c
+ * Note we assume we're chdir'd into PGDATA to begin with.
  */
 void
-pre_sync_fname(char *fname, bool isdir)
+SyncDataDirectory(void)
 {
-       int                     fd;
+       bool            xlog_is_symlink;
 
-       fd = BasicOpenFile(fname, O_RDONLY | PG_BINARY, 0);
+       /* We can skip this whole thing if fsync is disabled. */
+       if (!enableFsync)
+               return;
 
        /*
-        * Some OSs don't allow us to open directories at all (Windows returns
-        * EACCES)
+        * If pg_xlog is a symlink, we'll need to recurse into it separately,
+        * because the first walkdir below will ignore it.
         */
-       if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
-               return;
+       xlog_is_symlink = false;
 
-       if (fd < 0)
-               ereport(FATAL,
-                               (errmsg("could not open file \"%s\": %m",
-                                               fname)));
+#ifndef WIN32
+       {
+               struct stat st;
 
-       pg_flush_data(fd, 0, 0);
+               if (lstat("pg_xlog", &st) < 0)
+                       ereport(LOG,
+                                       (errcode_for_file_access(),
+                                        errmsg("could not stat file \"%s\": %m",
+                                                       "pg_xlog")));
+               else if (S_ISLNK(st.st_mode))
+                       xlog_is_symlink = true;
+       }
+#else
+       if (pgwin32_is_junction("pg_xlog"))
+               xlog_is_symlink = true;
+#endif
 
-       close(fd);
+       /*
+        * If possible, hint to the kernel that we're soon going to fsync the data
+        * directory and its contents.  Errors in this step are even less
+        * interesting than normal, so log them only at DEBUG1.
+        */
+#ifdef PG_FLUSH_DATA_WORKS
+       walkdir(".", pre_sync_fname, false, DEBUG1);
+       if (xlog_is_symlink)
+               walkdir("pg_xlog", pre_sync_fname, false, DEBUG1);
+       walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
+#endif
+
+       /*
+        * Now we do the fsync()s in the same order.
+        *
+        * The main call ignores symlinks, so in addition to specially processing
+        * pg_xlog if it's a symlink, pg_tblspc has to be visited separately with
+        * process_symlinks = true.  Note that if there are any plain directories
+        * in pg_tblspc, they'll get fsync'd twice.  That's not an expected case
+        * so we don't worry about optimizing it.
+        */
+       walkdir(".", fsync_fname_ext, false, LOG);
+       if (xlog_is_symlink)
+               walkdir("pg_xlog", fsync_fname_ext, false, LOG);
+       walkdir("pg_tblspc", fsync_fname_ext, true, LOG);
 }
 
 /*
  * walkdir: recursively walk a directory, applying the action to each
- * regular file and directory (including the named directory itself)
- * and following symbolic links.
+ * regular file and directory (including the named directory itself).
+ *
+ * If process_symlinks is true, the action and recursion are also applied
+ * to regular files and directories that are pointed to by symlinks in the
+ * given directory; otherwise symlinks are ignored.  Symlinks are always
+ * ignored in subdirectories, ie we intentionally don't pass down the
+ * process_symlinks flag to recursive calls.
  *
- * NB: There is another version of walkdir in initdb.c, but that version
- * behaves differently with respect to symbolic links.  Caveat emptor!
+ * Errors are reported at level elevel, which might be ERROR or less.
+ *
+ * See also walkdir in initdb.c, which is a frontend version of this logic.
  */
-void
-walkdir(char *path, void (*action) (char *fname, bool isdir))
+static void
+walkdir(const char *path,
+               void (*action) (const char *fname, bool isdir, int elevel),
+               bool process_symlinks,
+               int elevel)
 {
        DIR                *dir;
        struct dirent *de;
 
        dir = AllocateDir(path);
-       while ((de = ReadDir(dir, path)) != NULL)
+       if (dir == NULL)
+       {
+               ereport(elevel,
+                               (errcode_for_file_access(),
+                                errmsg("could not open directory \"%s\": %m", path)));
+               return;
+       }
+
+       while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
        {
                char            subpath[MAXPGPATH];
                struct stat fst;
+               int                     sret;
 
                CHECK_FOR_INTERRUPTS();
 
@@ -2062,60 +2167,132 @@ walkdir(char *path, void (*action) (char *fname, bool isdir))
 
                snprintf(subpath, MAXPGPATH, "%s/%s", path, de->d_name);
 
-               if (lstat(subpath, &fst) < 0)
-                       ereport(ERROR,
+               if (process_symlinks)
+                       sret = stat(subpath, &fst);
+               else
+                       sret = lstat(subpath, &fst);
+
+               if (sret < 0)
+               {
+                       ereport(elevel,
                                        (errcode_for_file_access(),
                                         errmsg("could not stat file \"%s\": %m", subpath)));
+                       continue;
+               }
 
                if (S_ISREG(fst.st_mode))
-                       (*action) (subpath, false);
+                       (*action) (subpath, false, elevel);
                else if (S_ISDIR(fst.st_mode))
-                       walkdir(subpath, action);
-#ifndef WIN32
-               else if (S_ISLNK(fst.st_mode))
-#else
-               else if (pgwin32_is_junction(subpath))
+                       walkdir(subpath, action, false, elevel);
+       }
+
+       FreeDir(dir);                           /* we ignore any error here */
+
+       /*
+        * It's important to fsync the destination directory itself as individual
+        * file fsyncs don't guarantee that the directory entry for the file is
+        * synced.
+        */
+       (*action) (path, true, elevel);
+}
+
+
+/*
+ * Hint to the OS that it should get ready to fsync() this file.
+ *
+ * Ignores errors trying to open unreadable files, and logs other errors at a
+ * caller-specified level.
+ */
+#ifdef PG_FLUSH_DATA_WORKS
+
+static void
+pre_sync_fname(const char *fname, bool isdir, int elevel)
+{
+       int                     fd;
+
+       fd = BasicOpenFile((char *) fname, O_RDONLY | PG_BINARY, 0);
+
+       if (fd < 0)
+       {
+               if (errno == EACCES || (isdir && errno == EISDIR))
+                       return;
+
+#ifdef ETXTBSY
+               if (errno == ETXTBSY)
+                       return;
 #endif
-               {
-#if defined(HAVE_READLINK) || defined(WIN32)
-                       char            linkpath[MAXPGPATH];
-                       int                     len;
-                       struct stat lst;
-
-                       len = readlink(subpath, linkpath, sizeof(linkpath)-1);
-                       if (len < 0)
-                               ereport(ERROR,
-                                               (errcode_for_file_access(),
-                                                errmsg("could not read symbolic link \"%s\": %m",
-                                                               subpath)));
-
-                       if (len >= sizeof(linkpath)-1)
-                               ereport(ERROR,
-                                               (errmsg("symbolic link \"%s\" target is too long",
-                                                               subpath)));
-
-                       linkpath[len] = '\0';
-
-                       if (lstat(linkpath, &lst) == 0)
-                       {
-                               if (S_ISREG(lst.st_mode))
-                                       (*action) (linkpath, false);
-                               else if (S_ISDIR(lst.st_mode))
-                                       walkdir(subpath, action);
-                       }
-                       else if (errno != ENOENT)
-                               ereport(ERROR,
-                                               (errcode_for_file_access(),
-                                                errmsg("could not stat file \"%s\": %m", linkpath)));
-#else
-                       ereport(WARNING,
-                                       (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                                        errmsg("this platform does not support symbolic links; ignoring \"%s\"",
-                                                       subpath)));
+
+               ereport(elevel,
+                               (errcode_for_file_access(),
+                                errmsg("could not open file \"%s\": %m", fname)));
+               return;
+       }
+
+       (void) pg_flush_data(fd, 0, 0);
+
+       (void) close(fd);
+}
+
+#endif   /* PG_FLUSH_DATA_WORKS */
+
+/*
+ * fsync_fname_ext -- Try to fsync a file or directory
+ *
+ * Ignores errors trying to open unreadable files, or trying to fsync
+ * directories on systems where that isn't allowed/required, and logs other
+ * errors at a caller-specified level.
+ */
+static void
+fsync_fname_ext(const char *fname, bool isdir, int elevel)
+{
+       int                     fd;
+       int                     flags;
+       int                     returncode;
+
+       /*
+        * Some OSs require directories to be opened read-only whereas other
+        * systems don't allow us to fsync files opened read-only; so we need both
+        * cases here.  Using O_RDWR will cause us to fail to fsync files that are
+        * not writable by our userid, but we assume that's OK.
+        */
+       flags = PG_BINARY;
+       if (!isdir)
+               flags |= O_RDWR;
+       else
+               flags |= O_RDONLY;
+
+       /*
+        * Open the file, silently ignoring errors about unreadable files (or
+        * unsupported operations, e.g. opening a directory under Windows), and
+        * logging others.
+        */
+       fd = BasicOpenFile((char *) fname, flags, 0);
+       if (fd < 0)
+       {
+               if (errno == EACCES || (isdir && errno == EISDIR))
+                       return;
+
+#ifdef ETXTBSY
+               if (errno == ETXTBSY)
+                       return;
 #endif
-               }
+
+               ereport(elevel,
+                               (errcode_for_file_access(),
+                                errmsg("could not open file \"%s\": %m", fname)));
+               return;
        }
-       FreeDir(dir);
 
-       (*action) (path, true);
+       returncode = pg_fsync(fd);
+
+       /*
+        * Some OSes don't allow us to fsync directories at all, so we can ignore
+        * those errors. Anything else needs to be logged.
+        */
+       if (returncode != 0 && !(isdir && errno == EBADF))
+               ereport(elevel,
+                               (errcode_for_file_access(),
+                                errmsg("could not fsync file \"%s\": %m", fname)));
+
+       (void) close(fd);
 }
index 3091c9f1c6e1911bbe6d657cf5374154ae74cb6f..1f11e7b7dcaa060957c212624ca0c8ed79825903 100644 (file)
@@ -100,8 +100,7 @@ extern int  pg_fsync_writethrough(int fd);
 extern int     pg_fdatasync(int fd);
 extern int     pg_flush_data(int fd, off_t offset, off_t amount);
 extern void fsync_fname(char *fname, bool isdir);
-extern void pre_sync_fname(char *fname, bool isdir);
-extern void walkdir(char *path, void (*action) (char *fname, bool isdir));
+extern void SyncDataDirectory(void);
 
 /* Filename components for OpenTemporaryFile */
 #define PG_TEMP_FILES_DIR "pgsql_tmp"