granicus.if.org Git - postgresql/commitdiff
pg_upgrade: Convert old visibility map format to new format.
author: Robert Haas <rhaas@postgresql.org>
Fri, 11 Mar 2016 17:28:22 +0000 (12:28 -0500)
committer: Robert Haas <rhaas@postgresql.org>
Fri, 11 Mar 2016 17:34:20 +0000 (12:34 -0500)
Commit a892234f830e832110f63fc0a2afce2fb21d1584 added a second bit per
page to the visibility map, but pg_upgrade has been unaware of it up
until now.  Therefore, a pg_upgrade from an earlier major release of
PostgreSQL to any commit preceding this one and following the one
mentioned above would result in invalid visibility map contents on the
new cluster, very possibly leading to data corruption.  This plugs
that hole.

Masahiko Sawada, reviewed by Jeff Janes, Bruce Momjian, Simon Riggs,
Michael Paquier, Andres Freund, me, and others.

src/bin/pg_upgrade/file.c
src/bin/pg_upgrade/pg_upgrade.h
src/bin/pg_upgrade/relfilenode.c

index 2a99a286c6dadd119610b6e5a4e0d2a7b28c4980..5d87408ce93572ef4ecf52fa3c6eaf2032a27570 100644 (file)
@@ -9,10 +9,16 @@
 
 #include "postgres_fe.h"
 
+#include "access/visibilitymap.h"
 #include "pg_upgrade.h"
+#include "storage/bufpage.h"
+#include "storage/checksum.h"
+#include "storage/checksum_impl.h"
 
+#include <sys/stat.h>
 #include <fcntl.h>
 
+#define BITS_PER_HEAPBLOCK_OLD 1
 
 
 #ifndef WIN32
@@ -138,6 +144,154 @@ copy_file(const char *srcfile, const char *dstfile, bool force)
 #endif
 
 
+/*
+ * rewriteVisibilityMap()
+ *
+ * Prior to catversion 201603011, PostgreSQL's visibility map had one bit
+ * per heap page; it now has two.  When upgrading from an older cluster we
+ * could refuse to copy visibility maps; the next VACUUM would recreate
+ * them, but at the price of scanning the entire table.  So, instead, we
+ * rewrite the old visibility maps in the new format.  That way, the
+ * all-visible bit remains set for the pages for which it was set
+ * previously.  The all-frozen bit is never set here; we leave that to
+ * VACUUM.  Unless "force" is true, tofile must not already exist.
+ *
+ * Returns NULL on success, or an error message string on failure.
+ */
+const char *
+rewriteVisibilityMap(const char *fromfile, const char *tofile, bool force)
+{
+       int                     src_fd = 0;
+       int                     dst_fd = 0;
+       char            buffer[BLCKSZ];
+       ssize_t         bytesRead;
+       ssize_t         src_filesize;
+       int                     rewriteVmBytesPerPage;
+       BlockNumber new_blkno = 0;
+       struct stat statbuf;
+
+       /* Compute how many old page bytes we need to rewrite a new page */
+       rewriteVmBytesPerPage = (BLCKSZ - SizeOfPageHeaderData) / 2;
+
+       if ((fromfile == NULL) || (tofile == NULL))
+               return "Invalid old file or new file";
+
+       if ((src_fd = open(fromfile, O_RDONLY, 0)) < 0)
+               return getErrorText();
+
+       if (fstat(src_fd, &statbuf) != 0)
+       {
+               close(src_fd);
+               return getErrorText();
+       }
+
+       if ((dst_fd = open(tofile, O_RDWR | O_CREAT | (force ? 0 : O_EXCL), S_IRUSR | S_IWUSR)) < 0)
+       {
+               close(src_fd);
+               return getErrorText();
+       }
+
+       /* Save old file size */
+       src_filesize = statbuf.st_size;
+
+       /*
+        * Turn each visibility map page into 2 pages one by one. Each new page
+        * has the same page header as the old one.  If the last section of the
+        * last page is empty, we skip it, mostly to avoid turning one-page
+        * visibility maps for small relations into two pages needlessly.
+        */
+       while ((bytesRead = read(src_fd, buffer, BLCKSZ)) == BLCKSZ)
+       {
+               char       *old_cur;
+               char       *old_break;
+               char       *old_blkend;
+               PageHeaderData pageheader;
+               bool            old_lastblk = ((BLCKSZ * (new_blkno + 1)) == src_filesize);
+
+               /* Save the page header data */
+               memcpy(&pageheader, buffer, SizeOfPageHeaderData);
+
+               /*
+                * old_cur: current position in the old page.  old_blkend: end of the
+                * old block.  old_break: where in the old page the current new page
+                * ends; it advances rewriteVmBytesPerPage after each page written.
+                * NOTE(review): old_lastblk above tests new_blkno, which counts new
+                * pages written, not old blocks read -- confirm for multi-block maps.
+                */
+               old_cur = buffer + SizeOfPageHeaderData;
+               old_blkend = buffer + bytesRead;
+               old_break = old_cur + rewriteVmBytesPerPage;
+
+               while (old_blkend >= old_break)
+               {
+                       char            new_vmbuf[BLCKSZ];
+                       char       *new_cur = new_vmbuf;
+                       bool            empty = true;
+                       bool            old_lastpart;
+
+                       /* Copy page header in advance */
+                       memcpy(new_vmbuf, &pageheader, SizeOfPageHeaderData);
+
+                       /* Is this the last part of the last old page? */
+                       old_lastpart = old_lastblk && (old_blkend == old_break);
+
+                       new_cur += SizeOfPageHeaderData;
+
+                       /* Process old page bytes one by one, and turn them into new page bytes. */
+                       while (old_break > old_cur)
+                       {
+                               uint16          new_vmbits = 0;
+                               int                     i;
+
+                               /* Generate new format bits while keeping old information */
+                               for (i = 0; i < BITS_PER_BYTE; i++)
+                               {
+                                       uint8           byte = *(uint8 *) old_cur;
+
+                                       if (byte & (1 << (BITS_PER_HEAPBLOCK_OLD * i)))
+                                       {
+                                               empty = false;
+                                               new_vmbits |= 1 << (BITS_PER_HEAPBLOCK * i);
+                                       }
+                               }
+
+                               /* Copy the new bits, in host byte order, to the new-format page */
+                               memcpy(new_cur, &new_vmbits, BITS_PER_HEAPBLOCK);
+
+                               old_cur += BITS_PER_HEAPBLOCK_OLD;
+                               new_cur += BITS_PER_HEAPBLOCK;
+                       }
+
+                       /* If the last part of the old page is empty, skip writing it */
+                       if (old_lastpart && empty)
+                               break;
+
+                       /* Set new checksum for a visibility map page (if enabled) */
+                       if (old_cluster.controldata.data_checksum_version != 0 &&
+                               new_cluster.controldata.data_checksum_version != 0)
+                               ((PageHeader) new_vmbuf)->pd_checksum =
+                                       pg_checksum_page(new_vmbuf, new_blkno);
+
+                       if (write(dst_fd, new_vmbuf, BLCKSZ) != BLCKSZ)
+                       {
+                               close(dst_fd);
+                               close(src_fd);
+                               return getErrorText();
+                       }
+
+                       old_break += rewriteVmBytesPerPage;
+                       new_blkno++;
+               }
+       }
+
+       /* Close files (NOTE(review): short/failed reads go unreported) */
+       close(dst_fd);
+       close(src_fd);
+
+       return NULL;
+
+}
+
 void
 check_hard_link(void)
 {
index 61228780d433282994e0cb3ca7f64341f6909b95..89beb739556e22d80af649424ccfbf787416add2 100644 (file)
@@ -109,6 +109,10 @@ extern char *output_files[];
  */
 #define VISIBILITY_MAP_CRASHSAFE_CAT_VER 201107031
 
+/*
+ * The format of the visibility map changed as of this 9.6 catalog version.
+ */
+#define VISIBILITY_MAP_FROZEN_BIT_CAT_VER 201603011
 /*
  * pg_multixact format changed in 9.3 commit 0ac5ad5134f2769ccbaefec73844f85,
  * ("Improve concurrency of foreign key locking") which also updated catalog
@@ -365,6 +369,8 @@ bool                pid_lock_file_exists(const char *datadir);
 
 const char *copyFile(const char *src, const char *dst, bool force);
 const char *linkFile(const char *src, const char *dst);
+const char *rewriteVisibilityMap(const char *fromfile, const char *tofile,
+                                                                bool force);
 
 void           check_hard_link(void);
 FILE      *fopen_priv(const char *path, const char *mode);
index b20f073ef71c35b63cb7fd0f2a980888caeccfa3..0c1a8220bbfba61b1a9264c1b4ff19a1d14b4c28 100644 (file)
 
 #include "pg_upgrade.h"
 
+#include <sys/stat.h>
 #include "catalog/pg_class.h"
 #include "access/transam.h"
 
 
 static void transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace);
-static void transfer_relfile(FileNameMap *map, const char *suffix);
+static void transfer_relfile(FileNameMap *map, const char *suffix, bool vm_must_add_frozenbit);
 
 
 /*
@@ -132,6 +133,7 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace)
 {
        int                     mapnum;
        bool            vm_crashsafe_match = true;
+       bool            vm_must_add_frozenbit = false;
 
        /*
         * Do the old and new cluster disagree on the crash-safetiness of the vm
@@ -141,13 +143,20 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace)
                new_cluster.controldata.cat_ver >= VISIBILITY_MAP_CRASHSAFE_CAT_VER)
                vm_crashsafe_match = false;
 
+       /*
+        * Do we need to rewrite visibilitymap?
+        */
+       if (old_cluster.controldata.cat_ver < VISIBILITY_MAP_FROZEN_BIT_CAT_VER &&
+               new_cluster.controldata.cat_ver >= VISIBILITY_MAP_FROZEN_BIT_CAT_VER)
+               vm_must_add_frozenbit = true;
+
        for (mapnum = 0; mapnum < size; mapnum++)
        {
                if (old_tablespace == NULL ||
                        strcmp(maps[mapnum].old_tablespace, old_tablespace) == 0)
                {
                        /* transfer primary file */
-                       transfer_relfile(&maps[mapnum], "");
+                       transfer_relfile(&maps[mapnum], "", vm_must_add_frozenbit);
 
                        /* fsm/vm files added in PG 8.4 */
                        if (GET_MAJOR_VERSION(old_cluster.major_version) >= 804)
@@ -155,9 +164,9 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace)
                                /*
                                 * Copy/link any fsm and vm files, if they exist
                                 */
-                               transfer_relfile(&maps[mapnum], "_fsm");
+                               transfer_relfile(&maps[mapnum], "_fsm", vm_must_add_frozenbit);
                                if (vm_crashsafe_match)
-                                       transfer_relfile(&maps[mapnum], "_vm");
+                                       transfer_relfile(&maps[mapnum], "_vm", vm_must_add_frozenbit);
                        }
                }
        }
@@ -167,17 +176,19 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace)
 /*
  * transfer_relfile()
  *
- * Copy or link file from old cluster to new one.
+ * Copy or link file from old cluster to new one.  If vm_must_add_frozenbit
+ * is true, visibility map forks are converted and rewritten, even in link
+ * mode.
  */
 static void
-transfer_relfile(FileNameMap *map, const char *type_suffix)
+transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_frozenbit)
 {
        const char *msg;
        char            old_file[MAXPGPATH];
        char            new_file[MAXPGPATH];
-       int                     fd;
        int                     segno;
        char            extent_suffix[65];
+       struct stat statbuf;
 
        /*
         * Now copy/link any related segments as well. Remember, PG breaks large
@@ -210,7 +221,7 @@ transfer_relfile(FileNameMap *map, const char *type_suffix)
                if (type_suffix[0] != '\0' || segno != 0)
                {
                        /* Did file open fail? */
-                       if ((fd = open(old_file, O_RDONLY, 0)) == -1)
+                       if (stat(old_file, &statbuf) != 0)
                        {
                                /* File does not exist?  That's OK, just return */
                                if (errno == ENOENT)
@@ -220,7 +231,10 @@ transfer_relfile(FileNameMap *map, const char *type_suffix)
                                                         map->nspname, map->relname, old_file, new_file,
                                                         getErrorText());
                        }
-                       close(fd);
+
+                       /* If file is empty, just return */
+                       if (statbuf.st_size == 0)
+                               return;
                }
 
                unlink(new_file);
@@ -232,7 +246,13 @@ transfer_relfile(FileNameMap *map, const char *type_suffix)
                {
                        pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\"\n", old_file, new_file);
 
-                       if ((msg = copyFile(old_file, new_file, true)) != NULL)
+                       /* Rewrite visibility map if needed */
+                       if (vm_must_add_frozenbit && (strcmp(type_suffix, "_vm") == 0))
+                               msg = rewriteVisibilityMap(old_file, new_file, true);
+                       else
+                               msg = copyFile(old_file, new_file, true);
+
+                       if (msg)
                                pg_fatal("error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
                                                 map->nspname, map->relname, old_file, new_file, msg);
                }
@@ -240,7 +260,13 @@ transfer_relfile(FileNameMap *map, const char *type_suffix)
                {
                        pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"\n", old_file, new_file);
 
-                       if ((msg = linkFile(old_file, new_file)) != NULL)
+                       /* Rewrite visibility map if needed */
+                       if (vm_must_add_frozenbit && (strcmp(type_suffix, "_vm") == 0))
+                               msg = rewriteVisibilityMap(old_file, new_file, true);
+                       else
+                               msg = linkFile(old_file, new_file);
+
+                       if (msg)
                                pg_fatal("error while creating link for relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
                                                 map->nspname, map->relname, old_file, new_file, msg);
                }