granicus.if.org Git - postgresql/commitdiff
Recursively fsync() the data directory after a crash.
author Robert Haas <rhaas@postgresql.org>
Mon, 4 May 2015 16:06:53 +0000 (12:06 -0400)
committer Robert Haas <rhaas@postgresql.org>
Mon, 4 May 2015 18:04:53 +0000 (14:04 -0400)
Otherwise, if there's another crash, some writes from after the first
crash might make it to disk while writes from before the crash fail
to make it to disk.  This could lead to data corruption.

Back-patch to all supported versions.

Abhijit Menon-Sen, reviewed by Andres Freund and slightly revised
by me.

src/backend/access/transam/xlog.c
src/backend/storage/file/fd.c
src/backend/utils/misc/guc.c
src/include/storage/fd.h
src/include/utils/guc.h

index 8104cc8ad8e9123fc3ebb7403ed5b8800ea236bd..f02e812d70b0761ad8b07f7e8f5fad99aca3326b 100644 (file)
@@ -628,6 +628,7 @@ static bool read_backup_label(XLogRecPtr *checkPointLoc);
 static void rm_redo_error_callback(void *arg);
 static int     get_sync_bit(int method);
 
+static void fsync_pgdata(char *datadir);
 
 /*
  * Insert an XLOG record having the specified RMID and info bytes,
@@ -5924,6 +5925,18 @@ StartupXLOG(void)
                          (errmsg("database system was interrupted; last known up at %s",
                                          str_time(ControlFile->time))));
 
+       /*
+        * If we previously crashed, there might be data which we had written,
+        * intending to fsync it, but which we had not actually fsync'd yet.
+        * Therefore, a power failure in the near future might cause earlier
+        * unflushed writes to be lost, even though more recent data written to
+        * disk from here on would be persisted.  To avoid that, fsync the entire
+        * data directory.
+        */
+       if (ControlFile->state != DB_SHUTDOWNED &&
+               ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
+               fsync_pgdata(data_directory);
+
        /* This is just to allow attaching to startup process with a debugger */
 #ifdef XLOG_REPLAY_DELAY
        if (ControlFile->state != DB_SHUTDOWNED)
@@ -9967,3 +9980,31 @@ CheckForStandbyTrigger(void)
        }
        return false;
 }
+
+/*
+ * fsync_pgdata: flush the data directory tree rooted at datadir.
+ *
+ * Nothing is done when enableFsync is false.  Otherwise the tree is walked
+ * with walkdir() up to two times: first (only where the build-time symbols
+ * below are defined) with pre_sync_fname as the action, then with
+ * fsync_fname as the action.
+ */
+static void
+fsync_pgdata(char *datadir)
+{
+       if (enableFsync)
+       {
+               /*
+                * First pass, where available: let the kernel know that an
+                * fsync of the data directory and everything in it is coming.
+                */
+#if defined(HAVE_SYNC_FILE_RANGE) || \
+       (defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
+               walkdir(datadir, pre_sync_fname);
+#endif
+
+               /*
+                * Second pass: the actual fsync()s, visiting entries in the
+                * same order as the first pass.
+                *
+                * The walk also applies the action to each directory itself,
+                * which matters because fsyncing an individual file does not
+                * guarantee that its directory entry has been synced.
+                */
+               walkdir(datadir, fsync_fname);
+       }
+}
index ba80f70e08365993bec333a72b6e1de3c9f5236e..d21739dd454d865cabc76c7e374a4fb2085347bb 100644 (file)
@@ -2004,3 +2004,118 @@ fsync_fname(char *fname, bool isdir)
 
        close(fd);
 }
+
+/*
+ * pre_sync_fname: hint to the OS that it should get ready to fsync() a file.
+ *
+ * fname is the path to open; isdir says whether the caller is passing a
+ * directory.  The path is opened read-only, the descriptor is handed to
+ * pg_flush_data() with offset and amount both zero, and then closed.
+ *
+ * If a directory cannot be opened and errno is EISDIR or EACCES, the
+ * function returns without doing anything.  Any other failure to open the
+ * path is reported at FATAL level, together with the operating system's
+ * reason for the failure.
+ *
+ * Adapted from pre_sync_fname in initdb.c
+ */
+void
+pre_sync_fname(char *fname, bool isdir)
+{
+       int                     fd;
+
+       /*
+        * NOTE(review): this uses the two-argument form of open() directly;
+        * confirm whether that is acceptable on every supported platform or
+        * whether a backend file-opening wrapper should be used instead.
+        */
+       fd = open(fname, O_RDONLY | PG_BINARY);
+
+       /*
+        * Some OSs don't allow us to open directories at all (Windows returns
+        * EACCES)
+        */
+       if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
+               return;
+
+       /*
+        * Nothing has touched errno since the failed open(), so it is still
+        * valid here: attach the matching error code and let %m spell out the
+        * reason, as walkdir() does for its own file-access failures.
+        */
+       if (fd < 0)
+               ereport(FATAL,
+                               (errcode_for_file_access(),
+                                errmsg("could not open file \"%s\" before fsync: %m",
+                                               fname)));
+
+       /* This is only a hint, so its result is not examined. */
+       (void) pg_flush_data(fd, 0, 0);
+
+       close(fd);
+}
+
+/*
+ * walkdir: recursively walk a directory, applying the action to each
+ * regular file and directory (including the named directory itself)
+ * and following symbolic links.
+ *
+ * path is the directory to walk.  action is invoked as action(name, false)
+ * for every regular file found, and as action(dir, true) for every directory
+ * once all of that directory's entries have been processed, so a directory
+ * is always handled after its contents.
+ *
+ * A symbolic link is resolved with stat() on the link's own path.  That way
+ * a relative target is interpreted relative to the directory containing the
+ * link, and a link that points at another link is followed through to the
+ * final file or directory.  A link whose target does not exist (ENOENT) is
+ * skipped silently.  Entries that are neither regular files, directories
+ * nor links are ignored.
+ *
+ * Failure to lstat() an entry, or to stat() a link for any reason other
+ * than ENOENT, is reported at ERROR level.
+ *
+ * NB: There is another version of walkdir in initdb.c, but that version
+ * behaves differently with respect to symbolic links.  Caveat emptor!
+ */
+void
+walkdir(char *path, void (*action) (char *fname, bool isdir))
+{
+       DIR                *dir;
+       struct dirent *de;
+
+       dir = AllocateDir(path);
+       while ((de = ReadDir(dir, path)) != NULL)
+       {
+               char            subpath[MAXPGPATH];
+               struct stat fst;
+
+               CHECK_FOR_INTERRUPTS();
+
+               /* Never descend into the directory itself or its parent. */
+               if (strcmp(de->d_name, ".") == 0 ||
+                       strcmp(de->d_name, "..") == 0)
+                       continue;
+
+               snprintf(subpath, MAXPGPATH, "%s/%s", path, de->d_name);
+
+               /* lstat, not stat: we need to know whether the entry is a link. */
+               if (lstat(subpath, &fst) < 0)
+                       ereport(ERROR,
+                                       (errcode_for_file_access(),
+                                        errmsg("could not stat file \"%s\": %m", subpath)));
+
+               if (S_ISREG(fst.st_mode))
+                       (*action) (subpath, false);
+               else if (S_ISDIR(fst.st_mode))
+                       walkdir(subpath, action);
+#ifndef WIN32
+               else if (S_ISLNK(fst.st_mode))
+#else
+               else if (pgwin32_is_junction(subpath))
+#endif
+               {
+#if defined(HAVE_READLINK) || defined(WIN32)
+                       struct stat tst;
+
+                       /*
+                        * Let the OS resolve the link.  Looking up the raw link
+                        * text ourselves would resolve a relative target against
+                        * the current working directory rather than against the
+                        * link's directory, and would stop at the first hop of a
+                        * chain of links.
+                        */
+                       if (stat(subpath, &tst) == 0)
+                       {
+                               if (S_ISREG(tst.st_mode))
+                                       (*action) (subpath, false);
+                               else if (S_ISDIR(tst.st_mode))
+                                       walkdir(subpath, action);
+                       }
+                       else if (errno != ENOENT)
+                               ereport(ERROR,
+                                               (errcode_for_file_access(),
+                                                errmsg("could not stat file \"%s\": %m", subpath)));
+#else
+                       ereport(WARNING,
+                                       (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                        errmsg("this platform does not support symbolic links; ignoring \"%s\"",
+                                                       subpath)));
+#endif
+               }
+       }
+       FreeDir(dir);
+
+       /* Contents are done; now handle the directory itself. */
+       (*action) (path, true);
+}
index 718de952d037755b360cecf5c50de99919d15df9..c171539209e84dd37ed16b899a09489c916818a2 100644 (file)
@@ -377,6 +377,7 @@ int                 trace_recovery_messages = LOG;
 
 int                    num_temp_buffers = 1000;
 
+char      *data_directory;
 char      *ConfigFileName;
 char      *HbaFileName;
 char      *IdentFileName;
@@ -414,7 +415,6 @@ static char *timezone_string;
 static char *log_timezone_string;
 static char *timezone_abbreviations_string;
 static char *XactIsoLevel_string;
-static char *data_directory;
 static char *custom_variable_classes;
 static int     max_function_args;
 static int     max_index_keys;
index 014e0f866792257fe8f9d48f9fac0381074e56d4..3091c9f1c6e1911bbe6d657cf5374154ae74cb6f 100644 (file)
@@ -100,6 +100,8 @@ extern int  pg_fsync_writethrough(int fd);
 extern int     pg_fdatasync(int fd);
 extern int     pg_flush_data(int fd, off_t offset, off_t amount);
 extern void fsync_fname(char *fname, bool isdir);
+extern void pre_sync_fname(char *fname, bool isdir);
+extern void walkdir(char *path, void (*action) (char *fname, bool isdir));
 
 /* Filename components for OpenTemporaryFile */
 #define PG_TEMP_FILES_DIR "pgsql_tmp"
index 2ec985161250628c26406de16742d2509eb5deb9..f455eef8b850521d83309e29fd5b2b7e11a73744 100644 (file)
@@ -176,6 +176,7 @@ extern int  log_temp_files;
 
 extern int     num_temp_buffers;
 
+extern char *data_directory;
 extern char *ConfigFileName;
 extern char *HbaFileName;
 extern char *IdentFileName;