#ifdef PG_FLUSH_DATA_WORKS
static void pre_sync_fname(const char *fname, bool isdir, int elevel);
#endif
-static void fsync_fname_ext(const char *fname, bool isdir, int elevel);
+static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
+
+static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
+static int fsync_parent_path(const char *fname, int elevel);
/*
* indicate the OS just doesn't allow/require fsyncing directories.
*/
void
-fsync_fname(char *fname, bool isdir)
+fsync_fname(const char *fname, bool isdir)
+{
+ fsync_fname_ext(fname, isdir, false, ERROR); /* ignore_perm = false, elevel = ERROR; result not examined */
+}
+
+/*
+ * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
+ *
+ * This routine ensures that, after returning, the effect of renaming the file
+ * persists in case of a crash. A crash while this routine is running will
+ * leave you with either the pre-existing or the moved file in place of the
+ * new file; no mixed state or truncated files are possible.
+ *
+ * It does so by using fsync on the old filename and the possibly existing
+ * target filename before the rename, and the target file and directory after.
+ * A target that does not exist yet (open fails with ENOENT) is not an error;
+ * any other failure to open or fsync it is reported and the rename is not
+ * attempted.
+ *
+ * Note that rename() cannot be used across arbitrary directories, as they
+ * might not be on the same filesystem. Therefore this routine does not
+ * support renaming across directories.
+ *
+ * Log errors with the caller-specified severity.
+ *
+ * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
+ * valid upon return.
+ */
+int
+durable_rename(const char *oldfile, const char *newfile, int elevel)
{
int fd;
- int returncode;
/*
- * Some OSs require directories to be opened read-only whereas other
- * systems don't allow us to fsync files opened read-only; so we need both
- * cases here
+ * First fsync the old path and the target path (if it exists), to ensure
+ * that they are properly persistent on disk. Syncing the target file is
+ * not strictly necessary, but it makes it easier to reason about crashes;
+ * because it's then guaranteed that either source or target file exists
+ * after a crash.
*/
- if (!isdir)
- fd = OpenTransientFile(fname,
- O_RDWR | PG_BINARY,
- S_IRUSR | S_IWUSR);
+ if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
+ return -1;
+
+ fd = OpenTransientFile((char *) newfile, PG_BINARY | O_RDWR, 0);
+ if (fd < 0)
+ {
+ if (errno != ENOENT) /* a missing target is fine; anything else is an error */
+ {
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", newfile)));
+ return -1;
+ }
+ }
else
- fd = OpenTransientFile(fname,
- O_RDONLY | PG_BINARY,
- S_IRUSR | S_IWUSR);
+ {
+ if (pg_fsync(fd) != 0)
+ {
+ int save_errno;
+
+ /* close file upon error, might not be in transaction context */
+ save_errno = errno;
+ CloseTransientFile(fd);
+ errno = save_errno;
+
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m", newfile)));
+ return -1;
+ }
+ CloseTransientFile(fd);
+ }
+
+ /* Now perform the actual rename. */
+ if (rename(oldfile, newfile) < 0)
+ {
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not rename file \"%s\" to \"%s\": %m",
+ oldfile, newfile)));
+ return -1;
+ }
/*
- * Some OSs don't allow us to open directories at all (Windows returns
- * EACCES)
+ * To guarantee that the rename is persistent, fsync the file under its
+ * new name, and its containing directory.
*/
- if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
- return;
+ if (fsync_fname_ext(newfile, false, false, elevel) != 0)
+ return -1;
- else if (fd < 0)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not open file \"%s\": %m", fname)));
+ if (fsync_parent_path(newfile, elevel) != 0)
+ return -1;
- returncode = pg_fsync(fd);
+ return 0;
+}
+
+/*
+ * durable_link_or_rename -- rename a file in a durable manner.
+ *
+ * Similar to durable_rename(), except that this routine tries (but does not
+ * guarantee) not to overwrite the target file.
+ *
+ * When HAVE_WORKING_LINK is set, the new name is created with link() and the
+ * old name is then removed with unlink(); otherwise plain rename() is used,
+ * with no check for an existing target.
+ *
+ * Note that a crash in an unfortunate moment can leave you with two links to
+ * the target file.
+ *
+ * Log errors with the caller-specified severity.
+ *
+ * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
+ * valid upon return.
+ */
+int
+durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
+{
+ /*
+ * Ensure that, if we crash directly after the rename/link, a file with
+ * valid contents is moved into place.
+ */
+ if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
+ return -1;
- /* Some OSs don't allow us to fsync directories at all */
- if (returncode != 0 && isdir && errno == EBADF)
+#if HAVE_WORKING_LINK
+ if (link(oldfile, newfile) < 0)
{
- CloseTransientFile(fd);
- return;
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not link file \"%s\" to \"%s\": %m",
+ oldfile, newfile)));
+ return -1;
}
-
- if (returncode != 0)
- ereport(ERROR,
+ unlink(oldfile); /* NOTE(review): return value is not checked */
+#else
+ /* XXX: Add racy file existence check? */
+ if (rename(oldfile, newfile) < 0)
+ {
+ ereport(elevel,
(errcode_for_file_access(),
- errmsg("could not fsync file \"%s\": %m", fname)));
+ errmsg("could not rename file \"%s\" to \"%s\": %m",
+ oldfile, newfile)));
+ return -1;
+ }
+#endif
- CloseTransientFile(fd);
-}
+ /*
+ * To make the change persistent in case of an OS crash, both the new
+ * entry and its parent directory need to be flushed.
+ */
+ if (fsync_fname_ext(newfile, false, false, elevel) != 0)
+ return -1;
+
+ /* Same for parent directory */
+ if (fsync_parent_path(newfile, elevel) != 0)
+ return -1;
+ return 0;
+}
/*
* InitFileAccess --- initialize this module during backend startup
* in pg_tblspc, they'll get fsync'd twice. That's not an expected case
* so we don't worry about optimizing it.
*/
- walkdir(".", fsync_fname_ext, false, LOG);
+ walkdir(".", datadir_fsync_fname, false, LOG);
if (xlog_is_symlink)
- walkdir("pg_xlog", fsync_fname_ext, false, LOG);
- walkdir("pg_tblspc", fsync_fname_ext, true, LOG);
+ walkdir("pg_xlog", datadir_fsync_fname, false, LOG);
+ walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
}
/*
#endif /* PG_FLUSH_DATA_WORKS */
+static void
+datadir_fsync_fname(const char *fname, bool isdir, int elevel)
+{
+ /*
+ * We want to silently ignore errors about unreadable files. Pass that
+ * desire on to fsync_fname_ext() via ignore_perm = true. The result of
+ * fsync_fname_ext() is discarded, since this function returns void.
+ */
+ fsync_fname_ext(fname, isdir, true, elevel);
+}
+
/*
* fsync_fname_ext -- Try to fsync a file or directory
*
- * Ignores errors trying to open unreadable files, or trying to fsync
- * directories on systems where that isn't allowed/required, and logs other
- * errors at a caller-specified level.
+ * If ignore_perm is true, ignore errors upon trying to open unreadable
+ * files. Logs other errors at a caller-specified level.
+ *
+ * Returns 0 if the operation succeeded, -1 otherwise.
*/
-static void
-fsync_fname_ext(const char *fname, bool isdir, int elevel)
+static int
+fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
{
int fd;
int flags;
else
flags |= O_RDONLY;
+ fd = OpenTransientFile((char *) fname, flags, 0);
+
/*
- * Open the file, silently ignoring errors about unreadable files (or
- * unsupported operations, e.g. opening a directory under Windows), and
- * logging others.
+ * Some OSs don't allow us to open directories at all (Windows returns
+ * EACCES), so just ignore the error in that case. If desired, also
+ * silently ignore errors about unreadable files. Log others.
*/
- fd = OpenTransientFile((char *) fname, flags, 0);
- if (fd < 0)
+ if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
+ return 0;
+ else if (fd < 0 && ignore_perm && errno == EACCES)
+ return 0;
+ else if (fd < 0)
{
- if (errno == EACCES || (isdir && errno == EISDIR))
- return;
ereport(elevel,
(errcode_for_file_access(),
errmsg("could not open file \"%s\": %m", fname)));
- return;
+ return -1;
}
returncode = pg_fsync(fd);
* those errors. Anything else needs to be logged.
*/
if (returncode != 0 && !(isdir && errno == EBADF))
+ {
+ int save_errno;
+
+ /* close file upon error, might not be in transaction context */
+ save_errno = errno;
+ (void) CloseTransientFile(fd);
+ errno = save_errno;
+
ereport(elevel,
(errcode_for_file_access(),
errmsg("could not fsync file \"%s\": %m", fname)));
+ return -1;
+ }
(void) CloseTransientFile(fd);
+
+ return 0;
+}
+
+/*
+ * fsync_parent_path -- fsync the parent path of a file or directory
+ *
+ * This is aimed at making file operations persistent on disk in case of
+ * an OS crash or power failure.
+ *
+ * The parent is derived from fname with get_parent_directory() and synced
+ * as a directory through fsync_fname_ext(), with elevel passed through for
+ * error reporting.
+ *
+ * Returns 0 if the operation succeeded, -1 otherwise.
+ */
+static int
+fsync_parent_path(const char *fname, int elevel)
+{
+ char parentpath[MAXPGPATH];
+
+ strlcpy(parentpath, fname, MAXPGPATH);
+ get_parent_directory(parentpath);
+
+ /*
+ * get_parent_directory() returns an empty string if the input argument is
+ * just a file name (see comments in path.c), so handle that as being the
+ * current directory.
+ */
+ if (strlen(parentpath) == 0)
+ strlcpy(parentpath, ".", MAXPGPATH);
+
+ if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
+ return -1;
+
+ return 0;
}