From 13a8924ecf00b57b118e307f9b01503f07dd3b28 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Sun, 8 Oct 2017 15:25:26 -0400
Subject: [PATCH] Increase distance between flush requests during bulk file
 copies.

copy_file() reads and writes data 64KB at a time (with default BLCKSZ),
and historically has issued a pg_flush_data request after each write.
This turns out to interact really badly with macOS's new APFS file
system: a large file copy takes over 100X longer than it ought to on
APFS, as reported by Brent Dearth.  While that's arguably a macOS bug,
it's not clear whether Apple will do anything about it in the near
future, and in any case experimentation suggests that issuing flushes
a bit less often can be helpful on other platforms too.

Hence, rearrange the logic in copy_file() so that flush requests are
issued once per N writes rather than every time through the loop.
I set the FLUSH_DISTANCE to 32MB on macOS (any less than that still
results in a noticeable speed degradation on APFS), but 1MB elsewhere.
In limited testing on Linux and FreeBSD, this seems slightly faster
than the previous code, and certainly no worse.  It helps noticeably
on macOS even with the older HFS filesystem.

A simpler change would have been to just increase the size of the
copy buffer without changing the loop logic, but that seems likely
to trash the processor cache without really helping much.

Back-patch to 9.6 where we introduced msync() as an implementation
option for pg_flush_data().  The problem seems specific to APFS's
mmap/msync support, so I don't think we need to go further back.

Discussion: https://postgr.es/m/CADkxhTNv-j2jw2g8H57deMeAbfRgYBoLmVuXkC=YCFBXRuCOww@mail.gmail.com
---
 src/backend/storage/file/copydir.c | 38 +++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 8 deletions(-)

diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c
index a964e47f50..ec467d9a86 100644
--- a/src/backend/storage/file/copydir.c
+++ b/src/backend/storage/file/copydir.c
@@ -139,10 +139,24 @@ copy_file(char *fromfile, char *tofile)
 	int			dstfd;
 	int			nbytes;
 	off_t		offset;
+	off_t		flush_offset;
 
-	/* Use palloc to ensure we get a maxaligned buffer */
+	/* Size of copy buffer (read and write requests) */
 #define COPY_BUF_SIZE (8 * BLCKSZ)
 
+	/*
+	 * Size of data flush requests.  It seems beneficial on most platforms to
+	 * do this every 1MB or so.  But macOS, at least with early releases of
+	 * APFS, is really unfriendly to small mmap/msync requests, so there do it
+	 * only every 32MB.
+	 */
+#if defined(__darwin__)
+#define FLUSH_DISTANCE (32 * 1024 * 1024)
+#else
+#define FLUSH_DISTANCE (1024 * 1024)
+#endif
+
+	/* Use palloc to ensure we get a maxaligned buffer */
 	buffer = palloc(COPY_BUF_SIZE);
 
 	/*
@@ -164,11 +178,23 @@ copy_file(char *fromfile, char *tofile)
 	/*
 	 * Do the data copying.
 	 */
+	flush_offset = 0;
 	for (offset = 0;; offset += nbytes)
 	{
 		/* If we got a cancel signal during the copy of the file, quit */
 		CHECK_FOR_INTERRUPTS();
 
+		/*
+		 * We fsync the files later, but during the copy, flush them every so
+		 * often to avoid spamming the cache and hopefully get the kernel to
+		 * start writing them out before the fsync comes.
+		 */
+		if (offset - flush_offset >= FLUSH_DISTANCE)
+		{
+			pg_flush_data(dstfd, flush_offset, offset - flush_offset);
+			flush_offset = offset;
+		}
+
 		nbytes = read(srcfd, buffer, COPY_BUF_SIZE);
 		if (nbytes < 0)
 			ereport(ERROR,
@@ -186,15 +212,11 @@ copy_file(char *fromfile, char *tofile)
 					(errcode_for_file_access(),
 					 errmsg("could not write to file \"%s\": %m", tofile)));
 		}
-
-		/*
-		 * We fsync the files later but first flush them to avoid spamming the
-		 * cache and hopefully get the kernel to start writing them out before
-		 * the fsync comes.
-		 */
-		pg_flush_data(dstfd, offset, nbytes);
 	}
 
+	if (offset > flush_offset)
+		pg_flush_data(dstfd, flush_offset, offset - flush_offset);
+
 	if (CloseTransientFile(dstfd))
 		ereport(ERROR,
 				(errcode_for_file_access(),
-- 
2.40.0