From 3a769d8239afdc003c91a56d2d8d5adfadacda5d Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Wed, 7 Nov 2018 18:05:54 +0100 Subject: [PATCH] pg_upgrade: Allow use of file cloning Add another transfer mode --clone to pg_upgrade (besides the existing --link and the default copy), using special file cloning calls. This makes the file transfer faster and more space efficient, achieving speed similar to --link mode without the associated drawbacks. On Linux, file cloning is supported on Btrfs and XFS (if formatted with reflink support). On macOS, file cloning is supported on APFS. Reviewed-by: Michael Paquier --- configure | 2 +- configure.in | 1 + doc/src/sgml/ref/pgupgrade.sgml | 40 +++++++++++--- src/bin/pg_upgrade/check.c | 13 ++++- src/bin/pg_upgrade/file.c | 90 ++++++++++++++++++++++++++++++++ src/bin/pg_upgrade/option.c | 7 +++ src/bin/pg_upgrade/pg_upgrade.h | 6 ++- src/bin/pg_upgrade/relfilenode.c | 44 ++++++++++------ src/include/pg_config.h.in | 3 ++ 9 files changed, 181 insertions(+), 25 deletions(-) diff --git a/configure b/configure index a600fdfac3..c76650aaf4 100755 --- a/configure +++ b/configure @@ -15130,7 +15130,7 @@ fi LIBS_including_readline="$LIBS" LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` -for ac_func in cbrt clock_gettime fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fallocate ppoll pstat pthread_is_threaded_np readlink setproctitle setproctitle_fast setsid shm_open strchrnul symlink sync_file_range utime utimes wcstombs_l +for ac_func in cbrt clock_gettime copyfile fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fallocate ppoll pstat pthread_is_threaded_np readlink setproctitle setproctitle_fast setsid shm_open strchrnul symlink sync_file_range utime utimes wcstombs_l do : as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var" diff --git a/configure.in b/configure.in index 75dbe8e734..8b0b6d1e43 100644 --- 
a/configure.in +++ b/configure.in @@ -1602,6 +1602,7 @@ LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` AC_CHECK_FUNCS(m4_normalize([ cbrt clock_gettime + copyfile fdatasync getifaddrs getpeerucred diff --git a/doc/src/sgml/ref/pgupgrade.sgml b/doc/src/sgml/ref/pgupgrade.sgml index d51146d641..2d722b2e79 100644 --- a/doc/src/sgml/ref/pgupgrade.sgml +++ b/doc/src/sgml/ref/pgupgrade.sgml @@ -182,6 +182,28 @@ display version information, then exit + + --clone + + + Use efficient file cloning (also known as reflinks on + some systems) instead of copying files to the new cluster. This can + result in near-instantaneous copying of the data files, giving the + speed advantages of -k/--link while + leaving the old cluster untouched. + + + + File cloning is only supported on some operating systems and file + systems. If it is selected but not supported, the + pg_upgrade run will error. At present, it + is supported on Linux (kernel 4.5 or later) with Btrfs and XFS (on + file systems created with reflink support, which is not the default + for XFS at this writing), and on macOS with APFS. + + + + @@ -340,7 +362,7 @@ NET STOP postgresql-&majorversion; Always run the pg_upgrade binary of the new server, not the old one. pg_upgrade requires the specification of the old and new cluster's data and executable (bin) directories. You can also specify - user and port values, and whether you want the data files linked + user and port values, and whether you want the data files linked or cloned instead of the default copy behavior. @@ -351,8 +373,12 @@ NET STOP postgresql-&majorversion; once you start the new cluster after the upgrade. Link mode also requires that the old and new cluster data directories be in the same file system. (Tablespaces and pg_wal can be on - different file systems.) See pg_upgrade --help for a full - list of options. + different file systems.) 
+ The clone mode provides the same speed and disk space advantages but will + not leave the old cluster unusable after the upgrade. The clone mode + also requires that the old and new data directories be in the same file + system. The clone mode is only available on certain operating systems + and file systems. @@ -388,8 +414,9 @@ pg_upgrade.exe to perform only the checks, even if the old server is still running. pg_upgrade --check will also outline any manual adjustments you will need to make after the upgrade. If you - are going to be using link mode, you should use the - --link option with --check to enable link-mode-specific checks. + are going to be using link or clone mode, you should use the option + --link or --clone with + --check to enable mode-specific checks. pg_upgrade requires write permission in the current directory. @@ -722,7 +749,8 @@ psql --username=postgres --file=script.sql postgres If you want to use link mode and you do not want your old cluster - to be modified when the new cluster is started, make a copy of the + to be modified when the new cluster is started, consider using the clone mode. + If that is not available, make a copy of the old cluster and upgrade that in link mode. 
To make a valid copy of the old cluster, use rsync to create a dirty copy of the old cluster while the server is running, then shut down
diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c
index 5a78d603dc..555e5dcbba 100644
--- a/src/bin/pg_upgrade/check.c
+++ b/src/bin/pg_upgrade/check.c
@@ -149,8 +149,17 @@ check_new_cluster(void)
 
 	check_loadable_libraries();
 
-	if (user_opts.transfer_mode == TRANSFER_MODE_LINK)
-		check_hard_link();
+	switch (user_opts.transfer_mode)
+	{
+		case TRANSFER_MODE_CLONE:
+			check_file_clone();
+			break;
+		case TRANSFER_MODE_COPY:
+			break;
+		case TRANSFER_MODE_LINK:
+			check_hard_link();
+			break;
+	}
 
 	check_is_install_user(&new_cluster);
 
diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c
index c27cc93dc2..244dd4d88b 100644
--- a/src/bin/pg_upgrade/file.c
+++ b/src/bin/pg_upgrade/file.c
@@ -18,6 +18,13 @@
 
 #include <sys/stat.h>
 #include <fcntl.h>
+#ifdef HAVE_COPYFILE
+#include <copyfile.h>
+#endif
+#ifdef __linux__
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#endif
 
 
 #ifdef WIN32
@@ -25,6 +32,55 @@ static int win32_pghardlink(const char *src, const char *dst);
 #endif
 
 
+/*
+ * cloneFile()
+ *
+ * Clones/reflinks a relation file from src to dst.
+ *
+ * schemaName/relName are relation's SQL name (used for error messages only).
+ *
+ * All failures are reported with pg_fatal().  When the FICLONE ioctl fails,
+ * the partially created dst is unlinked first, so errno is saved beforehand
+ * to keep the reported reason that of the ioctl rather than of unlink().
+ */
+void
+cloneFile(const char *src, const char *dst,
+		  const char *schemaName, const char *relName)
+{
+#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)
+	if (copyfile(src, dst, NULL, COPYFILE_CLONE_FORCE) < 0)
+		pg_fatal("error while cloning relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
+				 schemaName, relName, src, dst, strerror(errno));
+#elif defined(__linux__) && defined(FICLONE)
+	int			src_fd;
+	int			dest_fd;
+
+	if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
+		pg_fatal("error while cloning relation \"%s.%s\": could not open file \"%s\": %s\n",
+				 schemaName, relName, src, strerror(errno));
+
+	if ((dest_fd = open(dst, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+						pg_file_create_mode)) < 0)
+		pg_fatal("error while cloning relation \"%s.%s\": could not create file \"%s\": %s\n",
+				 schemaName, relName, dst, strerror(errno));
+
+	if (ioctl(dest_fd, FICLONE, src_fd) < 0)
+	{
+		int			save_errno = errno;
+
+		unlink(dst);
+		pg_fatal("error while cloning relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
+				 schemaName, relName, src, dst, strerror(save_errno));
+	}
+
+	close(src_fd);
+	close(dest_fd);
+#else
+	pg_fatal("file cloning not supported on this platform\n");
+#endif
+}
+
+
 /*
  * copyFile()
  *
@@ -270,6 +326,61 @@ rewriteVisibilityMap(const char *fromfile, const char *tofile,
 	close(src_fd);
 }
 
+/*
+ * check_file_clone()
+ *
+ * Verifies that a file can be cloned from the old data directory into the
+ * new one by cloning PG_VERSION to a scratch file, which is removed again.
+ * Exits via pg_fatal() if cloning fails or is not supported on this platform.
+ */
+void
+check_file_clone(void)
+{
+	char		existing_file[MAXPGPATH];
+	char		new_link_file[MAXPGPATH];
+
+	snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", old_cluster.pgdata);
+	snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.clonetest", new_cluster.pgdata);
+	unlink(new_link_file);		/* might fail */
+
+#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)
+	if (copyfile(existing_file, new_link_file, NULL, COPYFILE_CLONE_FORCE) < 0)
+		pg_fatal("could not clone file between old and new data directories: %s\n",
+				 strerror(errno));
+#elif defined(__linux__) && defined(FICLONE)
+	{
+		int			src_fd;
+		int			dest_fd;
+
+		if ((src_fd = open(existing_file, O_RDONLY | PG_BINARY, 0)) < 0)
+			pg_fatal("could not open file \"%s\": %s\n",
+					 existing_file, strerror(errno));
+
+		if ((dest_fd = open(new_link_file, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+							pg_file_create_mode)) < 0)
+			pg_fatal("could not create file \"%s\": %s\n",
+					 new_link_file, strerror(errno));
+
+		if (ioctl(dest_fd, FICLONE, src_fd) < 0)
+		{
+			/* save errno: unlink() of the scratch file may overwrite it */
+			int			save_errno = errno;
+
+			unlink(new_link_file);
+			pg_fatal("could not clone file between old and new data directories: %s\n",
+					 strerror(save_errno));
+		}
+
+		close(src_fd);
+		close(dest_fd);
+	}
+#else
+	pg_fatal("file cloning not supported on this platform\n");
+#endif
+
+	unlink(new_link_file);
+}
+
 void
 check_hard_link(void)
 {
diff --git a/src/bin/pg_upgrade/option.c b/src/bin/pg_upgrade/option.c
index 9dbc9225a6..7cc92f2dcb 100644
--- a/src/bin/pg_upgrade/option.c
+++ b/src/bin/pg_upgrade/option.c
@@ -53,6 +53,8 @@ parseCommandLine(int argc, char *argv[])
 		{"retain", no_argument, NULL, 'r'},
 		{"jobs", required_argument, NULL, 'j'},
 		{"verbose", no_argument, NULL, 'v'},
+		{"clone", no_argument, NULL, 1},
+
 		{NULL, 0, NULL, 0}
 	};
 	int option; /* Command line option */
@@ -203,6 +205,10 @@ parseCommandLine(int argc, char *argv[])
 				log_opts.verbose = true;
 				break;
 
+			case 1:
+				user_opts.transfer_mode = TRANSFER_MODE_CLONE;
+				break;
+
 			default:
 				pg_fatal("Try \"%s --help\" for more information.\n",
 						 os_info.progname);
@@ -293,6 +299,7 @@ usage(void)
 	printf(_(" -U, --username=NAME cluster superuser (default \"%s\")\n"), os_info.user);
 	printf(_(" -v, --verbose enable verbose internal logging\n"));
 	printf(_(" -V, --version display version information, then exit\n"));
+	printf(_(" --clone clone instead of copying files to new cluster\n"));
 	printf(_(" -?, --help show this help, then exit\n"));
 	printf(_("\n"
 			 "Before running pg_upgrade you must:\n"
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index f83a3eeb67..51bd211d46 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -230,10 +230,11 @@ typedef struct
 } ControlData;
 
 /*
- * Enumeration to denote link modes
+ * Enumeration to denote transfer modes
  */
 typedef enum
 {
+
TRANSFER_MODE_CLONE, TRANSFER_MODE_COPY, TRANSFER_MODE_LINK } transferMode; @@ -372,12 +373,15 @@ bool pid_lock_file_exists(const char *datadir); /* file.c */ +void cloneFile(const char *src, const char *dst, + const char *schemaName, const char *relName); void copyFile(const char *src, const char *dst, const char *schemaName, const char *relName); void linkFile(const char *src, const char *dst, const char *schemaName, const char *relName); void rewriteVisibilityMap(const char *fromfile, const char *tofile, const char *schemaName, const char *relName); +void check_file_clone(void); void check_hard_link(void); /* fopen_priv() is no longer different from fopen() */ diff --git a/src/bin/pg_upgrade/relfilenode.c b/src/bin/pg_upgrade/relfilenode.c index ed604f26ca..3b16c92a02 100644 --- a/src/bin/pg_upgrade/relfilenode.c +++ b/src/bin/pg_upgrade/relfilenode.c @@ -30,10 +30,18 @@ void transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, char *old_pgdata, char *new_pgdata) { - if (user_opts.transfer_mode == TRANSFER_MODE_LINK) - pg_log(PG_REPORT, "Linking user relation files\n"); - else - pg_log(PG_REPORT, "Copying user relation files\n"); + switch (user_opts.transfer_mode) + { + case TRANSFER_MODE_CLONE: + pg_log(PG_REPORT, "Cloning user relation files\n"); + break; + case TRANSFER_MODE_COPY: + pg_log(PG_REPORT, "Copying user relation files\n"); + break; + case TRANSFER_MODE_LINK: + pg_log(PG_REPORT, "Linking user relation files\n"); + break; + } /* * Transferring files by tablespace is tricky because a single database @@ -250,17 +258,23 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro old_file, new_file); rewriteVisibilityMap(old_file, new_file, map->nspname, map->relname); } - else if (user_opts.transfer_mode == TRANSFER_MODE_COPY) - { - pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\"\n", - old_file, new_file); - copyFile(old_file, new_file, map->nspname, map->relname); - } else - { - pg_log(PG_VERBOSE, "linking 
\"%s\" to \"%s\"\n", - old_file, new_file); - linkFile(old_file, new_file, map->nspname, map->relname); - } + switch (user_opts.transfer_mode) + { + case TRANSFER_MODE_CLONE: + pg_log(PG_VERBOSE, "cloning \"%s\" to \"%s\"\n", + old_file, new_file); + cloneFile(old_file, new_file, map->nspname, map->relname); + break; + case TRANSFER_MODE_COPY: + pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\"\n", + old_file, new_file); + copyFile(old_file, new_file, map->nspname, map->relname); + break; + case TRANSFER_MODE_LINK: + pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"\n", + old_file, new_file); + linkFile(old_file, new_file, map->nspname, map->relname); + } } } diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 5a996e7557..b1f709358e 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -114,6 +114,9 @@ /* Define to 1 if your compiler handles computed gotos. */ #undef HAVE_COMPUTED_GOTO +/* Define to 1 if you have the `copyfile' function. */ +#undef HAVE_COPYFILE + /* Define to 1 if you have the header file. */ #undef HAVE_CRTDEFS_H -- 2.40.0