From 7087166a88fe0c04fc6636d0d6d6bea1737fc1fb Mon Sep 17 00:00:00 2001
From: Robert Haas
Date: Fri, 11 Mar 2016 12:28:22 -0500
Subject: [PATCH] pg_upgrade: Convert old visibility map format to new format.

Commit a892234f830e832110f63fc0a2afce2fb21d1584 added a second bit per
page to the visibility map, but pg_upgrade has been unaware of it up
until now. Therefore, a pg_upgrade from an earlier major release of
PostgreSQL to any commit preceding this one and following the one
mentioned above would result in invalid visibility map contents on the
new cluster, very possibly leading to data corruption. This plugs
that hole.

Masahiko Sawada, reviewed by Jeff Janes, Bruce Momjian, Simon Riggs,
Michael Paquier, Andres Freund, me, and others.
---
 src/bin/pg_upgrade/file.c        | 188 +++++++++++++++++++++++++++++++
 src/bin/pg_upgrade/pg_upgrade.h  |   6 ++
 src/bin/pg_upgrade/relfilenode.c |  48 +++++++---
 3 files changed, 231 insertions(+), 11 deletions(-)

diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c
index 2a99a286c6..5d87408ce9 100644
--- a/src/bin/pg_upgrade/file.c
+++ b/src/bin/pg_upgrade/file.c
@@ -9,10 +9,16 @@
 
 #include "postgres_fe.h"
 
+#include "access/visibilitymap.h"
 #include "pg_upgrade.h"
+#include "storage/bufpage.h"
+#include "storage/checksum.h"
+#include "storage/checksum_impl.h"
 
+#include <sys/stat.h>
 #include <fcntl.h>
 
+#define BITS_PER_HEAPBLOCK_OLD 1
 
 
 #ifndef WIN32
@@ -138,6 +144,188 @@ copy_file(const char *srcfile, const char *dstfile, bool force)
 #endif
 
 
+/*
+ * rewriteVisibilityMap()
+ *
+ * In versions of PostgreSQL prior to catversion 201603011, PostgreSQL's
+ * visibility map included one bit per heap page; it now includes two.
+ * When upgrading a cluster from before that time to a current PostgreSQL
+ * version, we could refuse to copy visibility maps from the old cluster
+ * to the new cluster; the next VACUUM would recreate them, but at the
+ * price of scanning the entire table.  So, instead, we rewrite the old
+ * visibility maps in the new format.  That way, the all-visible bit
+ * remains set for the pages for which it was set previously.  The
+ * all-frozen bit is never set by this conversion; we leave that to
+ * VACUUM.
+ *
+ * fromfile is the old-format map and tofile is the file to create; when
+ * force is false, tofile must not already exist (it is opened with
+ * O_EXCL).  Returns NULL on success, or an error message on failure.
+ */
+const char *
+rewriteVisibilityMap(const char *fromfile, const char *tofile, bool force)
+{
+	int			src_fd = -1;
+	int			dst_fd = -1;
+	char		buffer[BLCKSZ];
+	union
+	{
+		char		data[BLCKSZ];
+		double		force_align_d;
+		long		force_align_l;
+	}			new_vmbuf;
+	const char *msg = NULL;
+	ssize_t		bytesRead;
+	off_t		src_filesize;
+	off_t		totalBytesRead = 0;
+	int			rewriteVmBytesPerPage;
+	BlockNumber new_blkno = 0;
+	struct stat statbuf;
+
+	/* Number of old-format map bytes that expand into one new page */
+	rewriteVmBytesPerPage = (BLCKSZ - SizeOfPageHeaderData) / 2;
+
+	if ((fromfile == NULL) || (tofile == NULL))
+		return "Invalid old file or new file";
+
+	if ((src_fd = open(fromfile, O_RDONLY, 0)) < 0)
+		return getErrorText();
+
+	if (fstat(src_fd, &statbuf) != 0)
+	{
+		msg = getErrorText();
+		goto done;
+	}
+
+	if ((dst_fd = open(tofile, O_RDWR | O_CREAT | (force ? 0 : O_EXCL),
+					   S_IRUSR | S_IWUSR)) < 0)
+	{
+		msg = getErrorText();
+		goto done;
+	}
+
+	/* Save old file size */
+	src_filesize = statbuf.st_size;
+
+	/*
+	 * Turn each visibility map page into 2 pages one by one.  Each new page
+	 * has the same page header as the old one.  If the last section of the
+	 * last page is empty, we skip it, mostly to avoid turning one-page
+	 * visibility maps for small relations into two pages needlessly.
+	 */
+	while (totalBytesRead < src_filesize)
+	{
+		char	   *old_cur;
+		char	   *old_break;
+		char	   *old_blkend;
+		PageHeaderData pageheader;
+		bool		old_lastblk;
+
+		/* A read error or a trailing partial page must not pass silently */
+		if ((bytesRead = read(src_fd, buffer, BLCKSZ)) != BLCKSZ)
+		{
+			msg = (bytesRead < 0) ? getErrorText() :
+				"partial page found in old visibility map file";
+			goto done;
+		}
+
+		/*
+		 * Detect the last old page from the input position.  new_blkno
+		 * cannot be used for this: it advances twice per old page.
+		 */
+		totalBytesRead += BLCKSZ;
+		old_lastblk = (totalBytesRead == src_filesize);
+
+		/* Save the page header data */
+		memcpy(&pageheader, buffer, SizeOfPageHeaderData);
+
+		/*
+		 * These old_* variables point into the old visibility map page.
+		 * old_cur is the current position, old_blkend is the end of the
+		 * block, and old_break is where the current new page's share of
+		 * the old page ends.  After each new page is written, old_break
+		 * advances by rewriteVmBytesPerPage bytes.
+		 */
+		old_cur = buffer + SizeOfPageHeaderData;
+		old_blkend = buffer + bytesRead;
+		old_break = old_cur + rewriteVmBytesPerPage;
+
+		while (old_break <= old_blkend)
+		{
+			char	   *new_cur;
+			bool		empty = true;
+			bool		old_lastpart;
+
+			/* Copy page header in advance */
+			memcpy(new_vmbuf.data, &pageheader, SizeOfPageHeaderData);
+
+			/* Rewriting the last part of the last old page? */
+			old_lastpart = old_lastblk && (old_break == old_blkend);
+
+			new_cur = new_vmbuf.data + SizeOfPageHeaderData;
+
+			/* Turn each old map byte into two new map bytes */
+			while (old_cur < old_break)
+			{
+				uint8		byte = *(uint8 *) old_cur;
+				uint16		new_vmbits = 0;
+				int			i;
+
+				/* Generate new format bits while keeping old information */
+				for (i = 0; i < BITS_PER_BYTE; i++)
+				{
+					if (byte & (1 << (BITS_PER_HEAPBLOCK_OLD * i)))
+					{
+						empty = false;
+						new_vmbits |= 1 << (BITS_PER_HEAPBLOCK * i);
+					}
+				}
+
+				/*
+				 * Store the low-order byte first regardless of host byte
+				 * order; a memcpy of the uint16 would swap the two bytes
+				 * on big-endian machines.
+				 */
+				new_cur[0] = (char) (new_vmbits & 0xFF);
+				new_cur[1] = (char) (new_vmbits >> 8);
+
+				old_cur++;
+				new_cur += BITS_PER_HEAPBLOCK;
+			}
+
+			/* Skip an empty final part of the last old page */
+			if (old_lastpart && empty)
+				break;
+
+			/* The page lands in the new cluster, so its setting decides */
+			if (new_cluster.controldata.data_checksum_version != 0)
+				((PageHeader) new_vmbuf.data)->pd_checksum =
+					pg_checksum_page(new_vmbuf.data, new_blkno);
+
+			errno = 0;
+			if (write(dst_fd, new_vmbuf.data, BLCKSZ) != BLCKSZ)
+			{
+				/* write() may fail short without setting errno */
+				if (errno == 0)
+					errno = ENOSPC;
+				msg = getErrorText();
+				goto done;
+			}
+
+			old_break += rewriteVmBytesPerPage;
+			new_blkno++;
+		}
+	}
+
+done:
+	/* msg was captured before closing, so close() cannot clobber errno */
+	if (dst_fd >= 0)
+		close(dst_fd);
+	close(src_fd);
+
+	return msg;
+}
+
 void
 check_hard_link(void)
 {
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index 61228780d4..89beb73955 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -109,6 +109,10 @@ extern char
*output_files[]; */ #define VISIBILITY_MAP_CRASHSAFE_CAT_VER 201107031 +/* + * The format of the visibility map changed with this 9.6 commit. + */ +#define VISIBILITY_MAP_FROZEN_BIT_CAT_VER 201603011 /* * pg_multixact format changed in 9.3 commit 0ac5ad5134f2769ccbaefec73844f85, * ("Improve concurrency of foreign key locking") which also updated catalog @@ -365,6 +369,8 @@ bool pid_lock_file_exists(const char *datadir); const char *copyFile(const char *src, const char *dst, bool force); const char *linkFile(const char *src, const char *dst); +const char *rewriteVisibilityMap(const char *fromfile, const char *tofile, + bool force); void check_hard_link(void); FILE *fopen_priv(const char *path, const char *mode); diff --git a/src/bin/pg_upgrade/relfilenode.c b/src/bin/pg_upgrade/relfilenode.c index b20f073ef7..0c1a8220bb 100644 --- a/src/bin/pg_upgrade/relfilenode.c +++ b/src/bin/pg_upgrade/relfilenode.c @@ -11,12 +11,13 @@ #include "pg_upgrade.h" +#include <sys/stat.h> #include "catalog/pg_class.h" #include "access/transam.h" static void transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace); -static void transfer_relfile(FileNameMap *map, const char *suffix); +static void transfer_relfile(FileNameMap *map, const char *suffix, bool vm_must_add_frozenbit); /* @@ -132,6 +133,7 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) { int mapnum; bool vm_crashsafe_match = true; + bool vm_must_add_frozenbit = false; /* * Do the old and new cluster disagree on the crash-safetiness of the vm @@ -141,13 +143,20 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) new_cluster.controldata.cat_ver >= VISIBILITY_MAP_CRASHSAFE_CAT_VER) vm_crashsafe_match = false; + /* + * Do we need to rewrite visibilitymap? 
+ */ + if (old_cluster.controldata.cat_ver < VISIBILITY_MAP_FROZEN_BIT_CAT_VER && + new_cluster.controldata.cat_ver >= VISIBILITY_MAP_FROZEN_BIT_CAT_VER) + vm_must_add_frozenbit = true; + for (mapnum = 0; mapnum < size; mapnum++) { if (old_tablespace == NULL || strcmp(maps[mapnum].old_tablespace, old_tablespace) == 0) { /* transfer primary file */ - transfer_relfile(&maps[mapnum], ""); + transfer_relfile(&maps[mapnum], "", vm_must_add_frozenbit); /* fsm/vm files added in PG 8.4 */ if (GET_MAJOR_VERSION(old_cluster.major_version) >= 804) @@ -155,9 +164,9 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) /* * Copy/link any fsm and vm files, if they exist */ - transfer_relfile(&maps[mapnum], "_fsm"); + transfer_relfile(&maps[mapnum], "_fsm", vm_must_add_frozenbit); if (vm_crashsafe_match) - transfer_relfile(&maps[mapnum], "_vm"); + transfer_relfile(&maps[mapnum], "_vm", vm_must_add_frozenbit); } } } @@ -167,17 +176,19 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) /* * transfer_relfile() * - * Copy or link file from old cluster to new one. + * Copy or link file from old cluster to new one. If vm_must_add_frozenbit + * is true, visibility map forks are converted and rewritten, even in link + * mode. */ static void -transfer_relfile(FileNameMap *map, const char *type_suffix) +transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_frozenbit) { const char *msg; char old_file[MAXPGPATH]; char new_file[MAXPGPATH]; - int fd; int segno; char extent_suffix[65]; + struct stat statbuf; /* * Now copy/link any related segments as well. Remember, PG breaks large @@ -210,7 +221,7 @@ transfer_relfile(FileNameMap *map, const char *type_suffix) if (type_suffix[0] != '\0' || segno != 0) { /* Did file open fail? */ - if ((fd = open(old_file, O_RDONLY, 0)) == -1) + if (stat(old_file, &statbuf) != 0) { /* File does not exist? 
That's OK, just return */ if (errno == ENOENT) @@ -220,7 +231,10 @@ transfer_relfile(FileNameMap *map, const char *type_suffix) map->nspname, map->relname, old_file, new_file, getErrorText()); } - close(fd); + + /* If file is empty, just return */ + if (statbuf.st_size == 0) + return; } unlink(new_file); @@ -232,7 +246,13 @@ transfer_relfile(FileNameMap *map, const char *type_suffix) { pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\"\n", old_file, new_file); - if ((msg = copyFile(old_file, new_file, true)) != NULL) + /* Rewrite visibility map if needed */ + if (vm_must_add_frozenbit && (strcmp(type_suffix, "_vm") == 0)) + msg = rewriteVisibilityMap(old_file, new_file, true); + else + msg = copyFile(old_file, new_file, true); + + if (msg) pg_fatal("error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", map->nspname, map->relname, old_file, new_file, msg); } @@ -240,7 +260,13 @@ transfer_relfile(FileNameMap *map, const char *type_suffix) { pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"\n", old_file, new_file); - if ((msg = linkFile(old_file, new_file)) != NULL) + /* Rewrite visibility map if needed */ + if (vm_must_add_frozenbit && (strcmp(type_suffix, "_vm") == 0)) + msg = rewriteVisibilityMap(old_file, new_file, true); + else + msg = linkFile(old_file, new_file); + + if (msg) pg_fatal("error while creating link for relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", map->nspname, map->relname, old_file, new_file, msg); } -- 2.40.0