Add parallel pg_dump option.

author Andrew Dunstan <andrew@dunslane.net>

Sun, 24 Mar 2013 15:27:20 +0000 (11:27 -0400)

committer Andrew Dunstan <andrew@dunslane.net>

Sun, 24 Mar 2013 15:27:20 +0000 (11:27 -0400)
author Andrew Dunstan <andrew@dunslane.net>
Sun, 24 Mar 2013 15:27:20 +0000 (11:27 -0400)
committer Andrew Dunstan <andrew@dunslane.net>
Sun, 24 Mar 2013 15:27:20 +0000 (11:27 -0400)
diff --git a/doc/src/sgml/backup.sgml b/doc/src/sgml/backup.sgml

index c4215bed9860ea2df7f7af012cca406012e9d7c9..e444b1cde3dae530b99011d7c54b8edd887a3d0e 100644 (file)
--- a/doc/src/sgml/backup.sgml
+++ b/doc/src/sgml/backup.sgml
@@ -310,6 +310,24 @@ pg_restore -d <replaceable class="parameter">dbname</replaceable> <replaceable c
      with one of the other two approaches.
     </para>
  
+   <formalpara>
+    <title>Use <application>pg_dump</>'s parallel dump feature.</title>
+    <para>
+     To speed up the dump of a large database, you can use
+     <application>pg_dump</application>'s parallel mode. This will dump
+     multiple tables at the same time. You can control the degree of
+     parallelism with the <command>-j</command> parameter. Parallel dumps
+     are only supported for the "directory" archive format.
+
+<programlisting>
+pg_dump -j <replaceable class="parameter">num</replaceable> -F d -f <replaceable class="parameter">out.dir</replaceable> <replaceable class="parameter">dbname</replaceable>
+</programlisting>
+
+     You can use <command>pg_restore -j</command> to restore a dump in parallel.
+     This will work for any archive of either the "custom" or the "directory"
+     archive format, whether or not it has been created with <command>pg_dump -j</command>.
+    </para>
+   </formalpara>
    </sect2>
   </sect1>
  
diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml

index 1e7544afeb49e58e48fa696822a8730cd2b105e9..34eace35b6e90b649e334b0b113322f75dc79844 100644 (file)
--- a/doc/src/sgml/perform.sgml
+++ b/doc/src/sgml/perform.sgml
@@ -1433,6 +1433,15 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse;
         base backup.
        </para>
       </listitem>
+     <listitem>
+      <para>
+       Experiment with the parallel dump and restore modes of both
+       <application>pg_dump</> and <application>pg_restore</> and find the
+       optimal number of concurrent jobs to use. Dumping and restoring in
+       parallel by means of the <option>-j</> option should give you
+       significantly higher performance than the serial mode.
+      </para>
+     </listitem>
       <listitem>
        <para>
         Consider whether the whole dump should be restored as a single
diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml

index 6d0f214d423a37639d2b596bc729c81365782175..0186ce0938b5edf6c435429135920318b0436c96 100644 (file)
--- a/doc/src/sgml/ref/pg_dump.sgml
+++ b/doc/src/sgml/ref/pg_dump.sgml
@@ -73,10 +73,12 @@ PostgreSQL documentation
     transfer mechanism. <application>pg_dump</application> can be used to
     backup an entire database, then <application>pg_restore</application>
     can be used to examine the archive and/or select which parts of the
-   database are to be restored. The most flexible output file format is
-   the <quote>custom</quote> format (<option>-Fc</option>). It allows
-   for selection and reordering of all archived items, and is compressed
-   by default.
+   database are to be restored. The most flexible output file formats are
+   the <quote>custom</quote> format (<option>-Fc</option>) and the
+   <quote>directory</quote> format (<option>-Fd</option>). They allow
+   for selection and reordering of all archived items, support parallel
+   restoration, and are compressed by default. The <quote>directory</quote>
+   format is the only format that supports parallel dumps.
    </para>
  
    <para>
@@ -251,7 +253,8 @@ PostgreSQL documentation
             can read. A directory format archive can be manipulated with
             standard Unix tools; for example, files in an uncompressed archive
             can be compressed with the <application>gzip</application> tool.
-           This format is compressed by default.
+           This format is compressed by default and also supports parallel
+           dumps.
            </para>
           </listitem>
          </varlistentry>
@@ -285,6 +288,62 @@ PostgreSQL documentation
        </listitem>
       </varlistentry>
  
+     <varlistentry>
+      <term><option>-j <replaceable class="parameter">njobs</replaceable></></term>
+      <term><option>--jobs=<replaceable class="parameter">njobs</replaceable></></term>
+      <listitem>
+       <para>
+        Run the dump in parallel by dumping <replaceable class="parameter">njobs</replaceable>
+        tables simultaneously. This option reduces the time of the dump but it also
+        increases the load on the database server. You can only use this option with the
+        directory output format because this is the only output format where multiple processes
+        can write their data at the same time.
+       </para>
+       <para>
+        <application>pg_dump</> will open <replaceable class="parameter">njobs</replaceable>
+        + 1 connections to the database, so make sure your <xref linkend="guc-max-connections">
+        setting is high enough to accommodate all connections.
+       </para>
+       <para>
+        Requesting exclusive locks on database objects while running a parallel dump could
+        cause the dump to fail. The reason is that the <application>pg_dump</> master process
+        requests shared locks on the objects that the worker processes are going to dump later
+        in order to
+        make sure that nobody deletes them and makes them go away while the dump is running.
+        If another client then requests an exclusive lock on a table, that lock will not be
+        granted but will be queued waiting for the shared lock of the master process to be
+        released. Consequently any other access to the table will not be granted either and
+        will queue after the exclusive lock request. This includes the worker process trying
+        to dump the table. Without any precautions this would be a classic deadlock situation.
+        To detect this conflict, the <application>pg_dump</> worker process requests another
+        shared lock using the <literal>NOWAIT</> option. If the worker process is not granted
+        this shared lock, somebody else must have requested an exclusive lock in the meantime
+        and there is no way to continue with the dump, so <application>pg_dump</> has no choice
+        but to abort the dump.
+       </para>
+       <para>
+        For a consistent backup, the database server needs to support synchronized snapshots,
+        a feature that was introduced in <productname>PostgreSQL</productname> 9.2. With this
+        feature, database clients can ensure they see the same dataset even though they use
+        different connections. <command>pg_dump -j</command> uses multiple database 
+        connections; it connects to the database once with the master process and
+        once again for each worker job. Without the synchronized snapshot feature, the
+        different worker jobs wouldn't be guaranteed to see the same data in each connection,
+        which could lead to an inconsistent backup.
+       </para>
+       <para>
+        If you want to run a parallel dump of a pre-9.2 server, you need to make sure that the
+        database content doesn't change between the time the master connects to the
+        database until the last worker job has connected to the database. The easiest way to
+        do this is to halt any data modifying processes (DDL and DML) accessing the database
+        before starting the backup. You also need to specify the
+        <option>--no-synchronized-snapshots</option> parameter when running
+        <command>pg_dump -j</command> against a pre-9.2 <productname>PostgreSQL</productname>
+        server.
+       </para>
+      </listitem>
+     </varlistentry>
+
       <varlistentry>
        <term><option>-n <replaceable class="parameter">schema</replaceable></option></term>
        <term><option>--schema=<replaceable class="parameter">schema</replaceable></option></term>
@@ -690,6 +749,17 @@ PostgreSQL documentation
        </listitem>
       </varlistentry>
  
+     <varlistentry>
+      <term><option>--no-synchronized-snapshots</></term>
+      <listitem>
+       <para>
+        This option allows running <command>pg_dump -j</> against a pre-9.2
+        server, see the documentation of the <option>-j</option> parameter
+        for more details.
+       </para>
+      </listitem>
+     </varlistentry>
+
       <varlistentry>
        <term><option>--no-tablespaces</option></term>
        <listitem>
@@ -1082,6 +1152,15 @@ CREATE DATABASE foo WITH TEMPLATE template0;
  </screen>
    </para>
  
+  <para>
+   To dump a database into a directory-format archive in parallel with
+   5 worker jobs:
+
+<screen>
+<prompt>$</prompt> <userinput>pg_dump -Fd mydb -j 5 -f dumpdir</userinput>
+</screen>
+  </para>
+
    <para>
     To reload an archive file into a (freshly created) database named
     <literal>newdb</>:
diff --git a/src/bin/pg_dump/Makefile b/src/bin/pg_dump/Makefile

index a6ab39d34782f02246a60f79d7d8884b43e3d67c..6336edc65bd026be960040eb47df3fc51c164657 100644 (file)
--- a/src/bin/pg_dump/Makefile
+++ b/src/bin/pg_dump/Makefile
@@ -19,7 +19,7 @@ include $(top_builddir)/src/Makefile.global
  override CPPFLAGS := -I$(libpq_srcdir) $(CPPFLAGS)
  
  OBJS=  pg_backup_archiver.o pg_backup_db.o pg_backup_custom.o \
-       pg_backup_null.o pg_backup_tar.o \
+       pg_backup_null.o pg_backup_tar.o parallel.o \
         pg_backup_directory.o dumputils.o compress_io.o $(WIN32RES)
  
  KEYWRDOBJS = keywords.o kwlookup.o
diff --git a/src/bin/pg_dump/compress_io.c b/src/bin/pg_dump/compress_io.c

index 768b923ae5fd73ae61956850795c1436c85852ef..0308f66c49f54f717ad1106ddf5bf950ebc67762 100644 (file)
--- a/src/bin/pg_dump/compress_io.c
+++ b/src/bin/pg_dump/compress_io.c
@@ -54,6 +54,7 @@
  
  #include "compress_io.h"
  #include "dumputils.h"
+#include "parallel.h"
  
  /*----------------------
   * Compressor API
@@ -182,6 +183,9 @@ size_t
  WriteDataToArchive(ArchiveHandle *AH, CompressorState *cs,
                                    const void *data, size_t dLen)
  {
+       /* Are we aborting? */
+       checkAborting(AH);
+
         switch (cs->comprAlg)
         {
                 case COMPR_ALG_LIBZ:
@@ -351,6 +355,9 @@ ReadDataFromArchiveZlib(ArchiveHandle *AH, ReadFunc readF)
         /* no minimal chunk size for zlib */
         while ((cnt = readF(AH, &buf, &buflen)))
         {
+               /* Are we aborting? */
+               checkAborting(AH);
+
                 zp->next_in = (void *) buf;
                 zp->avail_in = cnt;
  
@@ -411,6 +418,9 @@ ReadDataFromArchiveNone(ArchiveHandle *AH, ReadFunc readF)
  
         while ((cnt = readF(AH, &buf, &buflen)))
         {
+               /* Are we aborting? */
+               checkAborting(AH);
+
                 ahwrite(buf, 1, cnt, AH);
         }
  
diff --git a/src/bin/pg_dump/dumputils.c b/src/bin/pg_dump/dumputils.c

index 0a09882f5dc44cc39347bf99fce08682a4b24b33..7322f1a8257d8f0abb1325724167945f4516fd35 100644 (file)
--- a/src/bin/pg_dump/dumputils.c
+++ b/src/bin/pg_dump/dumputils.c
@@ -38,6 +38,7 @@ static struct
  }      on_exit_nicely_list[MAX_ON_EXIT_NICELY];
  
  static int     on_exit_nicely_index;
+void           (*on_exit_msg_func) (const char *modulename, const char *fmt, va_list ap) = vwrite_msg;
  
  #define supports_grant_options(version) ((version) >= 70400)
  
@@ -48,11 +49,21 @@ static bool parseAclItem(const char *item, const char *type,
  static char *copyAclUserName(PQExpBuffer output, char *input);
  static void AddAcl(PQExpBuffer aclbuf, const char *keyword,
            const char *subname);
+static PQExpBuffer getThreadLocalPQExpBuffer(void);
  
  #ifdef WIN32
+static void shutdown_parallel_dump_utils(int code, void *unused);
  static bool parallel_init_done = false;
  static DWORD tls_index;
  static DWORD mainThreadId;
+
+/*
+ * Exit callback registered by init_parallel_dump_utils(): releases Winsock,
+ * but only when invoked on the thread that performed the initialization.
+ */
+static void
+shutdown_parallel_dump_utils(int code, void *unused)
+{
+       /* Any other thread leaves the cleanup to the main thread */
+       if (GetCurrentThreadId() != mainThreadId)
+               return;
+
+       WSACleanup();
+}
  #endif
  
  void
@@ -61,23 +72,29 @@ init_parallel_dump_utils(void)
  #ifdef WIN32
         if (!parallel_init_done)
         {
+               WSADATA         wsaData;
+               int                     err;
+
                 tls_index = TlsAlloc();
-               parallel_init_done = true;
                 mainThreadId = GetCurrentThreadId();
+               err = WSAStartup(MAKEWORD(2, 2), &wsaData);
+               if (err != 0)
+               {
+                       fprintf(stderr, _("WSAStartup failed: %d\n"), err);
+                       exit_nicely(1);
+               }
+               on_exit_nicely(shutdown_parallel_dump_utils, NULL);
+               parallel_init_done = true;
         }
  #endif
  }
  
  /*
- *     Quotes input string if it's not a legitimate SQL identifier as-is.
- *
- *     Note that the returned string must be used before calling fmtId again,
- *     since we re-use the same return buffer each time.  Non-reentrant but
- *     reduces memory leakage. (On Windows the memory leakage will be one buffer
- *     per thread, which is at least better than one per call).
+ * Non-reentrant but reduces memory leakage. (On Windows the memory leakage
+ * will be one buffer per thread, which is at least better than one per call).
   */
-const char *
-fmtId(const char *rawid)
+static PQExpBuffer
+getThreadLocalPQExpBuffer(void)
  {
         /*
          * The Tls code goes awry if we use a static var, so we provide for both
@@ -86,9 +103,6 @@ fmtId(const char *rawid)
         static PQExpBuffer s_id_return = NULL;
         PQExpBuffer id_return;
  
-       const char *cp;
-       bool            need_quotes = false;
-
  #ifdef WIN32
         if (parallel_init_done)
                 id_return = (PQExpBuffer) TlsGetValue(tls_index);               /* 0 when not set */
@@ -118,6 +132,23 @@ fmtId(const char *rawid)
  
         }
  
+       return id_return;
+}
+
+/*
+ *     Quotes input string if it's not a legitimate SQL identifier as-is.
+ *
+ *     Note that the returned string must be used before calling fmtId again,
+ *     since we re-use the same return buffer each time.
+ */
+const char *
+fmtId(const char *rawid)
+{
+       PQExpBuffer id_return = getThreadLocalPQExpBuffer();
+
+       const char *cp;
+       bool            need_quotes = false;
+
         /*
          * These checks need to match the identifier production in scan.l. Don't
          * use islower() etc.
@@ -185,6 +216,35 @@ fmtId(const char *rawid)
         return id_return->data;
  }
  
+/*
+ * fmtQualifiedId - format a possibly schema-qualified name the way the
+ * source database expects it.
+ *
+ * As with fmtId, the result must be used before the next call.
+ *
+ * fmtId() returns the data of the buffer handed out by
+ * getThreadLocalPQExpBuffer(), so the name is first assembled in a private
+ * buffer and only copied into the thread-local one after the last fmtId()
+ * call has been made.
+ */
+const char *
+fmtQualifiedId(int remoteVersion, const char *schema, const char *id)
+{
+       PQExpBuffer scratch = createPQExpBuffer();
+       PQExpBuffer result;
+
+       /* Pre-7.3 servers have no schemas, so the qualifier is dropped there */
+       if (remoteVersion >= 70300 && schema && *schema)
+               appendPQExpBuffer(scratch, "%s.", fmtId(schema));
+
+       appendPQExpBuffer(scratch, "%s", fmtId(id));
+
+       /* All fmtId() calls are done; now the shared buffer may be claimed */
+       result = getThreadLocalPQExpBuffer();
+       appendPQExpBuffer(result, "%s", scratch->data);
+
+       destroyPQExpBuffer(scratch);
+
+       return result->data;
+}
  
  /*
   * Convert a string value to an SQL string literal and append it to
@@ -1315,7 +1375,7 @@ exit_horribly(const char *modulename, const char *fmt,...)
         va_list         ap;
  
         va_start(ap, fmt);
-       vwrite_msg(modulename, fmt, ap);
+       on_exit_msg_func(modulename, fmt, ap);
         va_end(ap);
  
         exit_nicely(1);
diff --git a/src/bin/pg_dump/dumputils.h b/src/bin/pg_dump/dumputils.h

index a4b351d03cae74c6792795114034684e6afc136b..90da787c5cf1e37cb716ecebf1a7a4bc3ad1706b 100644 (file)
--- a/src/bin/pg_dump/dumputils.h
+++ b/src/bin/pg_dump/dumputils.h
@@ -29,14 +29,14 @@ typedef enum                                        /* bits returned by set_dump_section */
  
  typedef struct SimpleStringListCell
  {
-    struct SimpleStringListCell *next;
-    char        val[1];         /* VARIABLE LENGTH FIELD */
+       struct SimpleStringListCell *next;
+       char            val[1];                 /* VARIABLE LENGTH FIELD */
  } SimpleStringListCell;
  
  typedef struct SimpleStringList
  {
-    SimpleStringListCell *head;
-    SimpleStringListCell *tail;
+       SimpleStringListCell *head;
+       SimpleStringListCell *tail;
  } SimpleStringList;
  
  
@@ -47,6 +47,8 @@ extern const char *progname;
  
  extern void init_parallel_dump_utils(void);
  extern const char *fmtId(const char *identifier);
+extern const char *fmtQualifiedId(int remoteVersion,
+                          const char *schema, const char *id);
  extern void appendStringLiteral(PQExpBuffer buf, const char *str,
                                         int encoding, bool std_strings);
  extern void appendStringLiteralConn(PQExpBuffer buf, const char *str,
@@ -85,11 +87,12 @@ __attribute__((format(PG_PRINTF_ATTRIBUTE, 2, 0)));
  extern void
  exit_horribly(const char *modulename, const char *fmt,...)
  __attribute__((format(PG_PRINTF_ATTRIBUTE, 2, 3), noreturn));
+extern void (*on_exit_msg_func) (const char *modulename, const char *fmt, va_list ap)
+                       __attribute__((format(PG_PRINTF_ATTRIBUTE, 2, 0)));
  extern void on_exit_nicely(on_exit_nicely_callback function, void *arg);
  extern void exit_nicely(int code) __attribute__((noreturn));
  
  extern void simple_string_list_append(SimpleStringList *list, const char *val);
  extern bool simple_string_list_member(SimpleStringList *list, const char *val);
  
-
  #endif   /* DUMPUTILS_H */
diff --git a/src/bin/pg_dump/parallel.c b/src/bin/pg_dump/parallel.c

new file mode 100644 (file)

index 0000000..dedf431
--- /dev/null
+++ b/src/bin/pg_dump/parallel.c
@@ -0,0 +1,1293 @@
+/*-------------------------------------------------------------------------
+ *
+ * parallel.c
+ *
+ *     Parallel support for the pg_dump archiver
+ *
+ * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *     The author is not responsible for loss or damages that may
+ *     result from its use.
+ *
+ * IDENTIFICATION
+ *             src/bin/pg_dump/parallel.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "pg_backup_db.h"
+
+#include "dumputils.h"
+#include "parallel.h"
+
+#ifndef WIN32
+#include <sys/types.h>
+#include <sys/wait.h>
+#include "signal.h"
+#include <unistd.h>
+#include <fcntl.h>
+#endif
+
+#define PIPE_READ                                                      0
+#define PIPE_WRITE                                                     1
+
+/* file-scope variables */
+#ifdef WIN32
+static unsigned int tMasterThreadId = 0;
+static HANDLE termEvent = INVALID_HANDLE_VALUE;
+static int     pgpipe(int handles[2]);
+static int     piperead(int s, char *buf, int len);
+
+/*
+ * Structure to hold info passed by _beginthreadex() to the function it calls
+ * via its single allowed argument.
+ *
+ * One instance is allocated per worker in ParallelBackupStart() and freed by
+ * init_spawned_worker_win32() after its fields have been copied out.
+ */
+typedef struct
+{
+       ArchiveHandle *AH;              /* master's handle; the thread works on a clone */
+       RestoreOptions *ropt;   /* handed on to SetupWorker() */
+       int                     worker;         /* index of the worker's parallelSlot entry */
+       int                     pipeRead;       /* read end of the pipeMW channel */
+       int                     pipeWrite;      /* write end of the pipeWM channel */
+} WorkerInfo;
+
+#define pipewrite(a,b,c)       send(a,b,c,0)
+#else
+/*
+ * aborting is only ever used in the master, the workers are fine with just
+ * wantAbort.
+ */
+static bool aborting = false;
+static volatile sig_atomic_t wantAbort = 0;
+
+#define pgpipe(a)                      pipe(a)
+#define piperead(a,b,c)                read(a,b,c)
+#define pipewrite(a,b,c)       write(a,b,c)
+#endif
+
+/*
+ * State handed to archive_close_connection() when the program exits.
+ */
+typedef struct ShutdownInformation
+{
+       ParallelState *pstate;  /* set by ParallelBackupStart() when it spawns workers */
+       Archive    *AHX;                /* archive registered via on_exit_close_archive() */
+} ShutdownInformation;
+
+static ShutdownInformation shutdown_info;
+
+static const char *modulename = gettext_noop("parallel archiver");
+
+static ParallelSlot *GetMyPSlot(ParallelState *pstate);
+static void
+parallel_exit_msg_func(const char *modulename,
+                                          const char *fmt, va_list ap)
+__attribute__((format(PG_PRINTF_ATTRIBUTE, 2, 0)));
+static void
+parallel_msg_master(ParallelSlot *slot, const char *modulename,
+                                       const char *fmt, va_list ap)
+__attribute__((format(PG_PRINTF_ATTRIBUTE, 3, 0)));
+static void archive_close_connection(int code, void *arg);
+static void ShutdownWorkersHard(ParallelState *pstate);
+static void WaitForTerminatingWorkers(ParallelState *pstate);
+
+#ifndef WIN32
+static void sigTermHandler(int signum);
+#endif
+static void SetupWorker(ArchiveHandle *AH, int pipefd[2], int worker,
+                       RestoreOptions *ropt);
+static bool HasEveryWorkerTerminated(ParallelState *pstate);
+
+static void lockTableNoWait(ArchiveHandle *AH, TocEntry *te);
+static void WaitForCommands(ArchiveHandle *AH, int pipefd[2]);
+static char *getMessageFromMaster(int pipefd[2]);
+static void sendMessageToMaster(int pipefd[2], const char *str);
+static int     select_loop(int maxFd, fd_set *workerset);
+static char *getMessageFromWorker(ParallelState *pstate,
+                                        bool do_wait, int *worker);
+static void sendMessageToWorker(ParallelState *pstate,
+                                       int worker, const char *str);
+static char *readMessageFromPipe(int fd);
+
+#define messageStartsWith(msg, prefix) \
+       (strncmp(msg, prefix, strlen(prefix)) == 0)
+#define messageEquals(msg, pattern) \
+       (strcmp(msg, pattern) == 0)
+
+/*
+ * Return the ParallelSlot that belongs to the caller, matched by thread id on
+ * Windows and by process id elsewhere. Returns NULL when no slot matches.
+ */
+static ParallelSlot *
+GetMyPSlot(ParallelState *pstate)
+{
+#ifdef WIN32
+       DWORD           self = GetCurrentThreadId();
+#else
+       pid_t           self = getpid();
+#endif
+       int                     idx = 0;
+
+       while (idx < pstate->numWorkers)
+       {
+               ParallelSlot *candidate = &(pstate->parallelSlot[idx]);
+
+#ifdef WIN32
+               if (candidate->threadId == self)
+#else
+               if (candidate->pid == self)
+#endif
+                       return candidate;
+
+               idx++;
+       }
+
+       return NULL;
+}
+
+/*
+ * Message callback installed as on_exit_msg_func by ParallelBackupStart(), so
+ * that exit_horribly() reports its error through it.
+ *
+ * A worker that does exit_horribly() forwards its last words to the master
+ * process. The master then does exit_horribly() with this error message
+ * itself and prints it normally. After printing the message, exit_horribly()
+ * on the master will shut down the remaining worker processes.
+ */
+static void
+parallel_exit_msg_func(const char *modulename, const char *fmt, va_list ap)
+{
+       ParallelSlot *mySlot;
+
+       Assert(shutdown_info.pstate);
+
+       mySlot = GetMyPSlot(shutdown_info.pstate);
+
+       if (mySlot != NULL)
+       {
+               /* We own a slot, so we're a worker: pass the msg to the master */
+               parallel_msg_master(mySlot, modulename, fmt, ap);
+               return;
+       }
+
+       /* No slot of our own: we're the parent, just write the message out */
+       vwrite_msg(modulename, fmt, ap);
+}
+
+/*
+ * Format a worker's error message and send it to the master process,
+ * prefixed with "ERROR ". The text is formatted into a 512-byte buffer.
+ */
+static void
+parallel_msg_master(ParallelSlot *slot, const char *modulename,
+                                       const char *fmt, va_list ap)
+{
+       char            msg[512];
+       const size_t prefixLen = strlen("ERROR ");
+       int                     revPipe[2];
+
+       /* The reverse pipe pair is the worker's channel back to the master */
+       revPipe[PIPE_READ] = slot->pipeRevRead;
+       revPipe[PIPE_WRITE] = slot->pipeRevWrite;
+
+       /* Lay down the tag first, then format the text right behind it */
+       strcpy(msg, "ERROR ");
+       vsnprintf(msg + prefixLen, sizeof(msg) - prefixLen, fmt, ap);
+
+       sendMessageToMaster(revPipe, msg);
+}
+
+/*
+ * Remember AHX in shutdown_info and register archive_close_connection() as
+ * an on_exit_nicely() callback with shutdown_info as its argument.
+ *
+ * pg_dump and pg_restore register the Archive pointer for the exit handler
+ * (called from exit_horribly). This function mainly exists so that we can
+ * keep shutdown_info in file scope only.
+ */
+void
+on_exit_close_archive(Archive *AHX)
+{
+       shutdown_info.AHX = AHX;
+       on_exit_nicely(archive_close_connection, &shutdown_info);
+}
+
+/*
+ * Exit callback that closes archive connections; it handles both the
+ * parallel and the non-parallel case.
+ *
+ * arg points to the ShutdownInformation registered by
+ * on_exit_close_archive(); code is not used.
+ */
+static void
+archive_close_connection(int code, void *arg)
+{
+       ShutdownInformation *info = (ShutdownInformation *) arg;
+       ParallelSlot *mySlot;
+
+       /* Non-parallel case: there is at most our own connection to close */
+       if (!info->pstate)
+       {
+               if (info->AHX)
+                       DisconnectDatabase(info->AHX);
+               return;
+       }
+
+       mySlot = GetMyPSlot(info->pstate);
+
+       /* Worker: close the connection recorded in our slot, if there is one */
+       if (mySlot)
+       {
+               if (mySlot->args->AH)
+                       DisconnectDatabase(&(mySlot->args->AH->public));
+               return;
+       }
+
+       /*
+        * We're the master: We have already printed out the message passed to
+        * exit_horribly() either from the master itself or from a worker
+        * process. Now we need to close our own database connection (only open
+        * during parallel dump but not restore) and shut down the remaining
+        * workers.
+        */
+       DisconnectDatabase(info->AHX);
+
+#ifndef WIN32
+
+       /*
+        * Setting aborting to true switches to best-effort-mode (send/receive
+        * but ignore errors) in communicating with our workers.
+        */
+       aborting = true;
+#endif
+
+       ShutdownWorkersHard(info->pstate);
+}
+
+/*
+ * Bail out via exit_horribly() if termination has been requested.
+ *
+ * If one worker terminates for some reason, we'd like the others to
+ * terminate as well (and not finish with their 70 GB table dump first...).
+ * On UNIX the processes get killed and the signal handler sets wantAbort to
+ * 1. On Windows termEvent is set instead and serves as the signal for
+ * everyone to terminate.
+ */
+void
+checkAborting(ArchiveHandle *AH)
+{
+       bool            abortRequested;
+
+#ifdef WIN32
+       /* Zero timeout: only poll the event, never block on it */
+       abortRequested = (WaitForSingleObject(termEvent, 0) == WAIT_OBJECT_0);
+#else
+       abortRequested = (wantAbort != 0);
+#endif
+
+       if (abortRequested)
+               exit_horribly(modulename, "worker is terminating\n");
+}
+
+/*
+ * Shut down any remaining workers and wait until all of them have
+ * terminated; this has an implicit do_wait == true.
+ *
+ * The fastest way we can make the workers terminate gracefully is when
+ * they are listening for new commands and we just tell them to terminate.
+ *
+ * On UNIX, SIGPIPE is set to be ignored from here on (presumably so that
+ * writing to a pipe whose worker is already gone cannot kill the master).
+ */
+static void
+ShutdownWorkersHard(ParallelState *pstate)
+{
+#ifndef WIN32
+       int                     i;
+
+       signal(SIGPIPE, SIG_IGN);
+
+       /*
+        * Close our write end of the sockets so that the workers know they can
+        * exit. All write ends are closed before the first SIGTERM is sent in
+        * the second loop below.
+        */
+       for (i = 0; i < pstate->numWorkers; i++)
+               closesocket(pstate->parallelSlot[i].pipeWrite);
+
+       for (i = 0; i < pstate->numWorkers; i++)
+               kill(pstate->parallelSlot[i].pid, SIGTERM);
+#else
+       /* The workers monitor this event via checkAborting(). */
+       SetEvent(termEvent);
+#endif
+
+       WaitForTerminatingWorkers(pstate);
+}
+
+/*
+ * Wait for the termination of the processes using the OS-specific method.
+ *
+ * Loops until HasEveryWorkerTerminated() reports true, marking the slot of
+ * each finished worker WRKR_TERMINATED. On UNIX every iteration reaps one
+ * child with wait(); on Windows it waits on the thread handles of the
+ * workers not yet marked as terminated.
+ */
+static void
+WaitForTerminatingWorkers(ParallelState *pstate)
+{
+       while (!HasEveryWorkerTerminated(pstate))
+       {
+               ParallelSlot *slot = NULL;
+               int                     j;
+
+#ifndef WIN32
+               int                     status;
+               pid_t           pid = wait(&status);
+
+               if (pid < 0)
+               {
+                       /* Interrupted by a signal before a child was reaped: retry */
+                       if (errno == EINTR)
+                               continue;
+
+                       /*
+                        * Otherwise (ECHILD) there is no child left to wait for, so all
+                        * workers are gone already. Record that and stop, rather than
+                        * dereferencing a NULL slot below.
+                        */
+                       for (j = 0; j < pstate->numWorkers; j++)
+                               pstate->parallelSlot[j].workerStatus = WRKR_TERMINATED;
+                       break;
+               }
+
+               for (j = 0; j < pstate->numWorkers; j++)
+                       if (pstate->parallelSlot[j].pid == pid)
+                               slot = &(pstate->parallelSlot[j]);
+
+               /* Reaped a child that is not one of our workers: keep waiting */
+               if (slot == NULL)
+                       continue;
+#else
+               uintptr_t       hThread;
+               DWORD           ret;
+               uintptr_t  *lpHandles = pg_malloc(sizeof(HANDLE) * pstate->numWorkers);
+               int                     nrun = 0;
+
+               /* Collect the handles of all workers that are still running */
+               for (j = 0; j < pstate->numWorkers; j++)
+                       if (pstate->parallelSlot[j].workerStatus != WRKR_TERMINATED)
+                       {
+                               lpHandles[nrun] = pstate->parallelSlot[j].hThread;
+                               nrun++;
+                       }
+               ret = WaitForMultipleObjects(nrun, (HANDLE *) lpHandles, false, INFINITE);
+               Assert(ret != WAIT_FAILED);
+               /* The return value identifies which handle got signaled */
+               hThread = lpHandles[ret - WAIT_OBJECT_0];
+
+               for (j = 0; j < pstate->numWorkers; j++)
+                       if (pstate->parallelSlot[j].hThread == hThread)
+                               slot = &(pstate->parallelSlot[j]);
+
+               free(lpHandles);
+#endif
+               Assert(slot);
+
+               slot->workerStatus = WRKR_TERMINATED;
+       }
+       Assert(HasEveryWorkerTerminated(pstate));
+}
+
+#ifndef WIN32
+/*
+ * Signal handling (UNIX only).
+ *
+ * ParallelBackupStart() installs this handler for SIGTERM, SIGINT and
+ * SIGQUIT. It only records the request in wantAbort; checkAborting() acts on
+ * the flag later.
+ */
+static void
+sigTermHandler(int signum)
+{
+       wantAbort = 1;
+}
+#endif
+
+/*
+ * This function is called by both UNIX and Windows variants to set up a
+ * worker process.
+ *
+ * It runs the setup hook stored in the ArchiveHandle, stays in
+ * WaitForCommands() until that returns, and then closes both pipe ends
+ * passed in pipefd.
+ *
+ *     AH              archive handle the worker operates on
+ *     pipefd  the worker's ends of the communication channels
+ *     worker  worker number (not used in this function)
+ *     ropt    options handed on to the setup hook
+ */
+static void
+SetupWorker(ArchiveHandle *AH, int pipefd[2], int worker,
+                       RestoreOptions *ropt)
+{
+       /*
+        * Call the setup worker function that's defined in the ArchiveHandle.
+        *
+        * We get the raw connection only so that we can close it properly when
+        * we shut down. It is only closed that way when the worker is brought
+        * down because of an error.
+        */
+       (AH->SetupWorkerPtr) ((Archive *) AH, ropt);
+
+       Assert(AH->connection != NULL);
+
+       WaitForCommands(AH, pipefd);
+
+       closesocket(pipefd[PIPE_READ]);
+       closesocket(pipefd[PIPE_WRITE]);
+}
+
+#ifdef WIN32
+/*
+ * Thread entry point for a Windows worker, started via _beginthreadex() from
+ * ParallelBackupStart().
+ *
+ * Copies what it needs out of the WorkerInfo and frees it, then runs
+ * SetupWorker() on a clone of the master's ArchiveHandle. The clone is
+ * released once SetupWorker() returns.
+ */
+static unsigned __stdcall
+init_spawned_worker_win32(WorkerInfo *wi)
+{
+       ArchiveHandle *AH;
+       int                     pipefd[2] = {wi->pipeRead, wi->pipeWrite};
+       int                     worker = wi->worker;
+       RestoreOptions *ropt = wi->ropt;
+
+       AH = CloneArchive(wi->AH);
+
+       free(wi);                               /* all fields have been copied out above */
+       SetupWorker(AH, pipefd, worker, ropt);
+
+       DeCloneArchive(AH);
+       _endthreadex(0);
+       return 0;                               /* presumably not reached after _endthreadex() */
+}
+#endif
+
+/*
+ * This function starts the parallel dump or restore by spawning off the
+ * worker processes in both Unix and Windows. For Windows, it creates a number
+ * of threads while it does a fork() on Unix.
+ *
+ * Returns a freshly allocated ParallelState. If AH->public.numWorkers is 1,
+ * no workers are started and the returned state has parallelSlot == NULL.
+ * Otherwise one slot per worker is set up, each with two channels created by
+ * pgpipe(): pipeMW (master writes, worker reads) and pipeWM (worker writes,
+ * master reads).
+ *
+ * Failure to create a channel or to fork ends the program through
+ * exit_horribly().
+ */
+ParallelState *
+ParallelBackupStart(ArchiveHandle *AH, RestoreOptions *ropt)
+{
+       ParallelState *pstate;
+       int                     i;
+       const size_t slotSize = AH->public.numWorkers * sizeof(ParallelSlot);
+
+       Assert(AH->public.numWorkers > 0);
+
+       /* Ensure stdio state is quiesced before forking */
+       fflush(NULL);
+
+       pstate = (ParallelState *) pg_malloc(sizeof(ParallelState));
+
+       pstate->numWorkers = AH->public.numWorkers;
+       pstate->parallelSlot = NULL;
+
+       /* with a single worker there is nothing to spawn */
+       if (AH->public.numWorkers == 1)
+               return pstate;
+
+       pstate->parallelSlot = (ParallelSlot *) pg_malloc(slotSize);
+       memset((void *) pstate->parallelSlot, 0, slotSize);
+
+       /*
+        * Set the pstate in the shutdown_info. The exit handler uses pstate if
+        * set and falls back to AHX otherwise.
+        */
+       shutdown_info.pstate = pstate;
+       on_exit_msg_func = parallel_exit_msg_func;
+
+#ifdef WIN32
+       tMasterThreadId = GetCurrentThreadId();
+       termEvent = CreateEvent(NULL, true, false, "Terminate");
+#else
+       signal(SIGTERM, sigTermHandler);
+       signal(SIGINT, sigTermHandler);
+       signal(SIGQUIT, sigTermHandler);
+#endif
+
+       for (i = 0; i < pstate->numWorkers; i++)
+       {
+#ifdef WIN32
+               WorkerInfo *wi;
+               uintptr_t       handle;
+#else
+               pid_t           pid;
+#endif
+               int                     pipeMW[2],
+                                       pipeWM[2];
+
+               /* one channel per direction: Master->Worker and Worker->Master */
+               if (pgpipe(pipeMW) < 0 || pgpipe(pipeWM) < 0)
+                       exit_horribly(modulename,
+                                                 "Cannot create communication channels: %s\n",
+                                                 strerror(errno));
+
+               pstate->parallelSlot[i].workerStatus = WRKR_IDLE;
+               pstate->parallelSlot[i].args = (ParallelArgs *) pg_malloc(sizeof(ParallelArgs));
+               pstate->parallelSlot[i].args->AH = NULL;
+               pstate->parallelSlot[i].args->te = NULL;
+#ifdef WIN32
+               /* Allocate a new structure for every worker */
+               wi = (WorkerInfo *) pg_malloc(sizeof(WorkerInfo));
+
+               wi->ropt = ropt;
+               wi->worker = i;
+               wi->AH = AH;
+               wi->pipeRead = pstate->parallelSlot[i].pipeRevRead = pipeMW[PIPE_READ];
+               wi->pipeWrite = pstate->parallelSlot[i].pipeRevWrite = pipeWM[PIPE_WRITE];
+
+               /* the thread function frees wi once it has copied its contents */
+               handle = _beginthreadex(NULL, 0, (void *) &init_spawned_worker_win32,
+                                                               wi, 0, &(pstate->parallelSlot[i].threadId));
+               pstate->parallelSlot[i].hThread = handle;
+#else
+               pid = fork();
+               if (pid == 0)
+               {
+                       /* we are the worker */
+                       int                     j;
+                       int                     pipefd[2] = {pipeMW[PIPE_READ], pipeWM[PIPE_WRITE]};
+
+                       /*
+                        * Store the fds for the reverse communication in pstate. Actually
+                        * we only use this in case of an error and don't use pstate
+                        * otherwise in the worker process. On Windows we write to the
+                        * global pstate, in Unix we write to our process-local copy but
+                        * that's also where we'd retrieve this information back from.
+                        */
+                       pstate->parallelSlot[i].pipeRevRead = pipefd[PIPE_READ];
+                       pstate->parallelSlot[i].pipeRevWrite = pipefd[PIPE_WRITE];
+                       pstate->parallelSlot[i].pid = getpid();
+
+                       /*
+                        * Call CloneArchive on Unix as well even though technically we
+                        * don't need to because fork() gives us a copy in our own address
+                        * space already. But CloneArchive resets the state information
+                        * and also clones the database connection (for parallel dump)
+                        * which both seem kinda helpful.
+                        */
+                       pstate->parallelSlot[i].args->AH = CloneArchive(AH);
+
+                       /* close read end of Worker -> Master */
+                       closesocket(pipeWM[PIPE_READ]);
+                       /* close write end of Master -> Worker */
+                       closesocket(pipeMW[PIPE_WRITE]);
+
+                       /*
+                        * Close all inherited fds for communication of the master with
+                        * the other workers.
+                        */
+                       for (j = 0; j < i; j++)
+                       {
+                               closesocket(pstate->parallelSlot[j].pipeRead);
+                               closesocket(pstate->parallelSlot[j].pipeWrite);
+                       }
+
+                       SetupWorker(pstate->parallelSlot[i].args->AH, pipefd, i, ropt);
+
+                       /* the worker process ends here and never returns to the caller */
+                       exit(0);
+               }
+               else if (pid < 0)
+                       /* fork failed */
+                       exit_horribly(modulename,
+                                                 "could not create worker process: %s\n",
+                                                 strerror(errno));
+
+               /* we are the Master, pid > 0 here */
+               Assert(pid > 0);
+
+               /* close read end of Master -> Worker */
+               closesocket(pipeMW[PIPE_READ]);
+               /* close write end of Worker -> Master */
+               closesocket(pipeWM[PIPE_WRITE]);
+
+               pstate->parallelSlot[i].pid = pid;
+#endif
+
+               /* master's ends: read what the worker writes, write what it reads */
+               pstate->parallelSlot[i].pipeRead = pipeWM[PIPE_READ];
+               pstate->parallelSlot[i].pipeWrite = pipeMW[PIPE_WRITE];
+       }
+
+       return pstate;
+}
+
+/*
+ * Shut down the workers started by ParallelBackupStart() and free pstate.
+ *
+ * No explicit terminate message is sent. The master closes its ends of the
+ * communication channels, which makes the workers' next read fail so that
+ * they leave their command loop, and then waits for them in
+ * WaitForTerminatingWorkers(). Every worker must be idle when this is called.
+ *
+ * With a single worker nothing was started, so nothing is done here (pstate
+ * is left untouched in that case).
+ */
+void
+ParallelBackupEnd(ArchiveHandle *AH, ParallelState *pstate)
+{
+       int                     i;
+
+       if (pstate->numWorkers == 1)
+               return;
+
+       Assert(IsEveryWorkerIdle(pstate));
+
+       /* close the sockets so that the workers know they can exit */
+       for (i = 0; i < pstate->numWorkers; i++)
+       {
+               closesocket(pstate->parallelSlot[i].pipeRead);
+               closesocket(pstate->parallelSlot[i].pipeWrite);
+       }
+       WaitForTerminatingWorkers(pstate);
+
+       /*
+        * Remove the pstate again, so the exit handler in the parent will now
+        * again fall back to closing AH->connection (if connected).
+        */
+       shutdown_info.pstate = NULL;
+
+       /*
+        * Release the per-slot argument blocks allocated in
+        * ParallelBackupStart(); all workers are gone by now, so nobody can
+        * still be looking at them.
+        */
+       for (i = 0; i < pstate->numWorkers; i++)
+               free(pstate->parallelSlot[i].args);
+
+       free(pstate->parallelSlot);
+       free(pstate);
+}
+
+
+/*
+ * The sequence is the following (for dump, similar for restore):
+ *
+ * The master process starts the parallel backup in ParallelBackupStart, this
+ * forks the worker processes which enter WaitForCommands().
+ *
+ * The master process dispatches an individual work item to one of the worker
+ * processes in DispatchJobForTocEntry(). It calls
+ * AH->MasterStartParallelItemPtr, a routine of the output format. This
+ * function's arguments are the parent's archive handle AH (containing the full
+ * catalog information), the TocEntry that the worker should work on and a
+ * T_Action act indicating whether this is a backup or a restore item. The
+ * function then converts the TocEntry assignment into a string that is then
+ * sent over to the worker process. In the simplest case that would be
+ * something like "DUMP 1234", with 1234 being the TocEntry id.
+ *
+ * The worker receives the message in the routine pointed to by
+ * WorkerJobDumpPtr or WorkerJobRestorePtr. These are also pointers to
+ * corresponding routines of the respective output format, e.g.
+ * _WorkerJobDumpDirectory().
+ *
+ * Remember that we have forked off the workers only after we have read in the
+ * catalog. That's why our worker processes can also access the catalog
+ * information. Now they re-translate the textual representation to a TocEntry
+ * on their side and do the required action (restore or dump).
+ *
+ * The result is again a textual string that is sent back to the master and is
+ * interpreted by AH->MasterEndParallelItemPtr. This function can update state
+ * or catalog information on the master's side, depending on the reply from
+ * the worker process. In the end it returns status which is 0 for successful
+ * execution.
+ *
+ * ---------------------------------------------------------------------
+ * Master                                                                      Worker
+ *
+ *                                                                                     enters WaitForCommands()
+ * DispatchJobForTocEntry(...te...)
+ *
+ * [ Worker is IDLE ]
+ *
+ * arg = (MasterStartParallelItemPtr)()
+ * send: DUMP arg
+ *                                                                                     receive: DUMP arg
+ *                                                                                     str = (WorkerJobDumpPtr)(arg)
+ * [ Worker is WORKING ]                                       ... gets te from arg ...
+ *                                                                                     ... dump te ...
+ *                                                                                     send: OK DUMP info
+ *
+ * In ListenToWorkers():
+ *
+ * [ Worker is FINISHED ]
+ * receive: OK DUMP info
+ * status = (MasterEndParallelItemPtr)(info)
+ *
+ * In ReapWorkerStatus(&ptr):
+ * *ptr = status;
+ * [ Worker is IDLE ]
+ * ---------------------------------------------------------------------
+ */
+void
+DispatchJobForTocEntry(ArchiveHandle *AH, ParallelState *pstate, TocEntry *te,
+                                          T_Action act)
+{
+       ParallelSlot *slot;
+       char       *request;
+       int                     slotno = GetIdleWorker(pstate);
+
+       /* the caller guarantees that an idle worker is available */
+       Assert(slotno != NO_SLOT);
+
+       /* let the archive format turn the TOC entry into a command string */
+       request = (AH->MasterStartParallelItemPtr) (AH, te, act);
+       sendMessageToWorker(pstate, slotno, request);
+
+       /* remember what this worker is now busy with */
+       slot = &pstate->parallelSlot[slotno];
+       slot->workerStatus = WRKR_WORKING;
+       slot->args->te = te;
+}
+
+/*
+ * Return the index of the lowest-numbered slot whose worker is in the
+ * WRKR_IDLE state, or NO_SLOT if there is none.
+ */
+int
+GetIdleWorker(ParallelState *pstate)
+{
+       int                     slotno = 0;
+
+       while (slotno < pstate->numWorkers)
+       {
+               if (pstate->parallelSlot[slotno].workerStatus == WRKR_IDLE)
+                       return slotno;
+               slotno++;
+       }
+
+       return NO_SLOT;
+}
+
+/*
+ * Report whether all workers have reached the WRKR_TERMINATED state; a
+ * single worker in any other state makes the answer false.
+ */
+static bool
+HasEveryWorkerTerminated(ParallelState *pstate)
+{
+       int                     slotno = 0;
+
+       while (slotno < pstate->numWorkers)
+       {
+               if (pstate->parallelSlot[slotno].workerStatus != WRKR_TERMINATED)
+                       return false;
+               slotno++;
+       }
+
+       return true;
+}
+
+/*
+ * Report whether all workers are currently in the WRKR_IDLE state; a single
+ * worker in any other state makes the answer false.
+ */
+bool
+IsEveryWorkerIdle(ParallelState *pstate)
+{
+       int                     slotno = 0;
+
+       while (slotno < pstate->numWorkers)
+       {
+               if (pstate->parallelSlot[slotno].workerStatus != WRKR_IDLE)
+                       return false;
+               slotno++;
+       }
+
+       return true;
+}
+
+/*
+ * ---------------------------------------------------------------------
+ * One danger of the parallel backup is a possible deadlock:
+ *
+ * 1) Master dumps the schema and locks all tables in ACCESS SHARE mode.
+ * 2) Another process requests an ACCESS EXCLUSIVE lock (which is not granted
+ *       because the master holds a conflicting ACCESS SHARE lock).
+ * 3) The worker process also requests an ACCESS SHARE lock to read the table.
+ *       The worker's not granted that lock but is enqueued behind the ACCESS
+ *       EXCLUSIVE lock request.
+ * ---------------------------------------------------------------------
+ *
+ * To avoid this, the worker requests its ACCESS SHARE lock with NOWAIT before
+ * touching the table. If the lock is not granted immediately, somebody else
+ * must have requested a conflicting lock in the meantime, and we fail the
+ * whole backup right away instead of running into the deadlock.
+ *
+ * The table is identified by te->catalogId.oid; its schema-qualified name is
+ * looked up over AH->connection. Any failure (lookup error, no such relation,
+ * lock not granted) ends the process through exit_horribly().
+ */
+static void
+lockTableNoWait(ArchiveHandle *AH, TocEntry *te)
+{
+       Archive    *AHX = (Archive *) AH;
+       const char *qualId;
+       PQExpBuffer query = createPQExpBuffer();
+       PGresult   *res;
+
+       Assert(AH->format == archDirectory);
+       Assert(strcmp(te->desc, "BLOBS") != 0);
+
+       /* OIDs are unsigned, so they must be formatted with %u, not %d */
+       appendPQExpBuffer(query,
+                                         "SELECT pg_namespace.nspname,"
+                                         "       pg_class.relname "
+                                         "  FROM pg_class "
+                                       "  JOIN pg_namespace on pg_namespace.oid = relnamespace "
+                                         " WHERE pg_class.oid = %u", te->catalogId.oid);
+
+       res = PQexec(AH->connection, query->data);
+
+       /*
+        * Also insist on getting a row back: reading row 0 of an empty result
+        * would not give us a usable relation name.
+        */
+       if (!res || PQresultStatus(res) != PGRES_TUPLES_OK || PQntuples(res) < 1)
+               exit_horribly(modulename,
+                                         "could not get relation name for oid %u: %s\n",
+                                         te->catalogId.oid, PQerrorMessage(AH->connection));
+
+       resetPQExpBuffer(query);
+
+       qualId = fmtQualifiedId(AHX->remoteVersion,
+                                                       PQgetvalue(res, 0, 0),
+                                                       PQgetvalue(res, 0, 1));
+
+       appendPQExpBuffer(query, "LOCK TABLE %s IN ACCESS SHARE MODE NOWAIT",
+                                         qualId);
+       PQclear(res);
+
+       res = PQexec(AH->connection, query->data);
+
+       if (!res || PQresultStatus(res) != PGRES_COMMAND_OK)
+               exit_horribly(modulename,
+                                         "could not obtain lock on relation \"%s\". This "
+                        "usually means that someone requested an ACCESS EXCLUSIVE lock "
+                         "on the table after the pg_dump parent process has gotten the "
+                                         "initial ACCESS SHARE lock on the table.\n", qualId);
+
+       PQclear(res);
+       destroyPQExpBuffer(query);
+}
+
+/*
+ * That's the main routine for the worker.
+ * When it starts up it enters this routine and waits for commands from the
+ * master process. After having processed a command it comes back to here to
+ * wait for the next command.
+ *
+ * There is no explicit terminate command: once no further message can be
+ * read from the master (getMessageFromMaster() returns NULL), the worker
+ * closes its database connection and returns.
+ *
+ * Recognized commands are "DUMP <dumpId>" and "RESTORE <dumpId>"; anything
+ * else ends the process through exit_horribly().
+ */
+static void
+WaitForCommands(ArchiveHandle *AH, int pipefd[2])
+{
+       char       *command;
+       DumpId          dumpId;
+       int                     nBytes;
+       char       *str = NULL;
+       TocEntry   *te;
+
+       for (;;)
+       {
+               if (!(command = getMessageFromMaster(pipefd)))
+               {
+                       PQfinish(AH->connection);
+                       AH->connection = NULL;
+                       return;
+               }
+
+               if (messageStartsWith(command, "DUMP "))
+               {
+                       Assert(AH->format == archDirectory);
+                       sscanf(command + strlen("DUMP "), "%d%n", &dumpId, &nBytes);
+                       Assert(nBytes == strlen(command) - strlen("DUMP "));
+
+                       te = getTocEntryByDumpId(AH, dumpId);
+                       Assert(te != NULL);
+
+                       /*
+                        * Lock the table but with NOWAIT. Note that the parent is already
+                        * holding a lock. If we cannot acquire another ACCESS SHARE MODE
+                        * lock, then somebody else has requested an exclusive lock in the
+                        * meantime.  lockTableNoWait dies in this case to prevent a
+                        * deadlock.
+                        */
+                       if (strcmp(te->desc, "BLOBS") != 0)
+                               lockTableNoWait(AH, te);
+
+                       /*
+                        * The message we return here has been pg_malloc()ed and we are
+                        * responsible for free()ing it.
+                        */
+                       str = (AH->WorkerJobDumpPtr) (AH, te);
+                       Assert(AH->connection != NULL);
+                       sendMessageToMaster(pipefd, str);
+                       free(str);
+               }
+               else if (messageStartsWith(command, "RESTORE "))
+               {
+                       Assert(AH->format == archDirectory || AH->format == archCustom);
+                       Assert(AH->connection != NULL);
+
+                       sscanf(command + strlen("RESTORE "), "%d%n", &dumpId, &nBytes);
+                       Assert(nBytes == strlen(command) - strlen("RESTORE "));
+
+                       te = getTocEntryByDumpId(AH, dumpId);
+                       Assert(te != NULL);
+
+                       /*
+                        * The message we return here has been pg_malloc()ed and we are
+                        * responsible for free()ing it.
+                        */
+                       str = (AH->WorkerJobRestorePtr) (AH, te);
+                       Assert(AH->connection != NULL);
+                       sendMessageToMaster(pipefd, str);
+                       free(str);
+               }
+               else
+                       exit_horribly(modulename,
+                                                 "Unknown command on communication channel: %s\n",
+                                                 command);
+
+               /*
+                * The command buffer was pg_malloc()ed by readMessageFromPipe();
+                * release it so that we do not leak one buffer per work item.
+                */
+               free(command);
+       }
+}
+
+/*
+ * ---------------------------------------------------------------------
+ * Note the status change:
+ *
+ * DispatchJobForTocEntry              WRKR_IDLE -> WRKR_WORKING
+ * ListenToWorkers                             WRKR_WORKING -> WRKR_FINISHED / WRKR_TERMINATED
+ * ReapWorkerStatus                            WRKR_FINISHED -> WRKR_IDLE
+ * ---------------------------------------------------------------------
+ *
+ * Just calling ReapWorkerStatus() when all workers are working might or might
+ * not give you an idle worker because you need to call ListenToWorkers() in
+ * between and only thereafter ReapWorkerStatus(). This is necessary in order
+ * to get and deal with the status (=result) of the worker's execution.
+ */
+void
+ListenToWorkers(ArchiveHandle *AH, ParallelState *pstate, bool do_wait)
+{
+       int                     who;
+       char       *reply = getMessageFromWorker(pstate, do_wait, &who);
+
+       if (reply == NULL)
+       {
+               /* getting nothing is only acceptable if we were asked not to block */
+               if (do_wait)
+                       exit_horribly(modulename, "A worker process died unexpectedly\n");
+               return;
+       }
+
+       if (messageStartsWith(reply, "OK "))
+       {
+               ParallelSlot *slot = &pstate->parallelSlot[who];
+               TocEntry   *item;
+               char       *payload;
+
+               /* the worker is done; hand its answer to the archive format */
+               slot->workerStatus = WRKR_FINISHED;
+               item = slot->args->te;
+
+               if (messageStartsWith(reply, "OK RESTORE "))
+               {
+                       payload = reply + strlen("OK RESTORE ");
+                       slot->status = (AH->MasterEndParallelItemPtr)
+                               (AH, item, payload, ACT_RESTORE);
+               }
+               else if (messageStartsWith(reply, "OK DUMP "))
+               {
+                       payload = reply + strlen("OK DUMP ");
+                       slot->status = (AH->MasterEndParallelItemPtr)
+                               (AH, item, payload, ACT_DUMP);
+               }
+               else
+                       exit_horribly(modulename,
+                                                 "Invalid message received from worker: %s\n", reply);
+       }
+       else if (messageStartsWith(reply, "ERROR "))
+       {
+               /* the worker reported a failure: pass its text on and give up */
+               Assert(AH->format == archDirectory || AH->format == archCustom);
+               pstate->parallelSlot[who].workerStatus = WRKR_TERMINATED;
+               exit_horribly(modulename, "%s", reply + strlen("ERROR "));
+       }
+       else
+               exit_horribly(modulename, "Invalid message received from worker: %s\n", reply);
+
+       /* the message buffer is pg_malloc()ed on every platform */
+       free(reply);
+}
+
+/*
+ * This function is executed in the master process.
+ *
+ * Picks up the result of the first worker found in the WRKR_FINISHED state:
+ * its status is copied to *status, the slot's status is reset to 0, the
+ * worker is marked WRKR_IDLE and its slot index is returned. If no worker is
+ * in the WRKR_FINISHED state, NO_SLOT is returned and *status is untouched.
+ */
+int
+ReapWorkerStatus(ParallelState *pstate, int *status)
+{
+       int                     slotno;
+
+       for (slotno = 0; slotno < pstate->numWorkers; slotno++)
+       {
+               ParallelSlot *slot = &pstate->parallelSlot[slotno];
+
+               if (slot->workerStatus != WRKR_FINISHED)
+                       continue;
+
+               *status = slot->status;
+               slot->status = 0;
+               slot->workerStatus = WRKR_IDLE;
+               return slotno;
+       }
+
+       return NO_SLOT;
+}
+
+/*
+ * This function is executed in the master process.
+ *
+ * Returns only once at least one worker is idle. Results of finished workers
+ * are reaped on the way; a non-zero result ends the program.
+ */
+void
+EnsureIdleWorker(ArchiveHandle *AH, ParallelState *pstate)
+{
+       for (;;)
+       {
+               bool            reaped = false;
+               int                     result;
+
+               /* collect every finished worker, which turns it idle again */
+               while (ReapWorkerStatus(pstate, &result) != NO_SLOT)
+               {
+                       if (result != 0)
+                               exit_horribly(modulename, "Error processing a parallel work item.\n");
+
+                       reaped = true;
+               }
+
+               /*
+                * A reaped worker is idle by definition; otherwise look for one
+                * that has been idle all along.
+                */
+               if (reaped || GetIdleWorker(pstate) != NO_SLOT)
+                       return;
+
+               /*
+                * Everybody is busy: block until some worker reports back, then
+                * go around again to reap it.
+                */
+               ListenToWorkers(AH, pstate, true);
+       }
+}
+
+/*
+ * This function is executed in the master process.
+ *
+ * Blocks until every worker is idle again, reaping results as they come in.
+ * A non-zero result ends the program. Nothing is done if there is no
+ * parallel state or only a single worker.
+ */
+void
+EnsureWorkersFinished(ArchiveHandle *AH, ParallelState *pstate)
+{
+       if (pstate == NULL || pstate->numWorkers == 1)
+               return;
+
+       while (!IsEveryWorkerIdle(pstate))
+       {
+               int                     result;
+
+               if (ReapWorkerStatus(pstate, &result) != NO_SLOT)
+               {
+                       /* got a finished worker: check how its item went */
+                       if (result != 0)
+                               exit_horribly(modulename,
+                                                         "Error processing a parallel work item\n");
+                       continue;
+               }
+
+               /* nothing to reap yet, wait for the next worker to report */
+               ListenToWorkers(AH, pstate, true);
+       }
+}
+
+/*
+ * This function is executed in the worker process.
+ *
+ * Reads one message from the read end of the worker's channel and returns
+ * it; the result is whatever readMessageFromPipe() delivers for that
+ * descriptor.
+ */
+static char *
+getMessageFromMaster(int pipefd[2])
+{
+       int                     readEnd = pipefd[PIPE_READ];
+
+       return readMessageFromPipe(readEnd);
+}
+
+/*
+ * This function is executed in the worker process.
+ *
+ * Writes str, including its terminating zero byte, to the write end of the
+ * worker's channel. An incomplete write ends the process.
+ */
+static void
+sendMessageToMaster(int pipefd[2], const char *str)
+{
+       /* the trailing '\0' is sent as well */
+       int                     nbytes = strlen(str) + 1;
+
+       if (pipewrite(pipefd[PIPE_WRITE], str, nbytes) == nbytes)
+               return;
+
+       exit_horribly(modulename,
+                                 "Error writing to the communication channel: %s\n",
+                                 strerror(errno));
+}
+
+/*
+ * A select loop that repeats calling select until a descriptor in the read
+ * set becomes readable. On Windows we have to check for the termination event
+ * from time to time, on Unix we can just block forever.
+ *
+ * maxFd is the highest descriptor in *workerset. The set is restored from a
+ * saved copy before every select() call. Returns the value of the last
+ * select() call.
+ *
+ * NOTE(review): the Windows branch below wakes up every 250ms but no check of
+ * a termination event is visible in this function - confirm where that
+ * happens.
+ */
+static int
+select_loop(int maxFd, fd_set *workerset)
+{
+       int                     i;
+       fd_set          saveSet = *workerset;
+
+#ifdef WIN32
+       /* should always be the master */
+       Assert(tMasterThreadId == GetCurrentThreadId());
+
+       for (;;)
+       {
+               /*
+                * sleep a quarter of a second before checking if we should terminate.
+                */
+               struct timeval tv = {0, 250000};
+
+               *workerset = saveSet;
+               i = select(maxFd + 1, workerset, NULL, NULL, &tv);
+
+               /* interrupted: just try again */
+               if (i == SOCKET_ERROR && WSAGetLastError() == WSAEINTR)
+                       continue;
+               /* leave on anything but a timeout (i == 0), including errors */
+               if (i)
+                       break;
+       }
+
+#else                                                  /* UNIX */
+
+       for (;;)
+       {
+               *workerset = saveSet;
+               i = select(maxFd + 1, workerset, NULL, NULL, NULL);
+
+               /*
+                * If we Ctrl-C the master process , it's likely that we interrupt
+                * select() here. The signal handler will set wantAbort == true and
+                * the shutdown journey starts from here. Note that we'll come back
+                * here later when we tell all workers to terminate and read their
+                * responses. But then we have aborting set to true.
+                */
+               if (wantAbort && !aborting)
+                       exit_horribly(modulename, "terminated by user\n");
+
+               /* interrupted by some other signal: just try again */
+               if (i < 0 && errno == EINTR)
+                       continue;
+               break;
+       }
+
+#endif
+
+       return i;
+}
+
+
+/*
+ * This function is executed in the master process.
+ *
+ * It returns the next message from the worker on the communication channel,
+ * optionally blocking (do_wait) until it becomes available.
+ *
+ * Workers in the WRKR_TERMINATED state are not listened to. With do_wait
+ * false, NULL is returned at once if no worker has anything to say. The
+ * message of the first readable worker (in slot order) is returned and the
+ * id of that worker is stored in *worker; the message itself may be NULL if
+ * reading from that worker failed.
+ *
+ * A failing select() ends the program.
+ */
+static char *
+getMessageFromWorker(ParallelState *pstate, bool do_wait, int *worker)
+{
+       int                     i;
+       fd_set          workerset;
+       int                     maxFd = -1;
+       struct timeval nowait = {0, 0};
+
+       FD_ZERO(&workerset);
+
+       for (i = 0; i < pstate->numWorkers; i++)
+       {
+               if (pstate->parallelSlot[i].workerStatus == WRKR_TERMINATED)
+                       continue;
+               FD_SET(pstate->parallelSlot[i].pipeRead, &workerset);
+               /* actually WIN32 ignores the first parameter to select()... */
+               if (pstate->parallelSlot[i].pipeRead > maxFd)
+                       maxFd = pstate->parallelSlot[i].pipeRead;
+       }
+
+       if (do_wait)
+       {
+               i = select_loop(maxFd, &workerset);
+               Assert(i != 0);
+       }
+       else
+       {
+               /* zero timeout: only poll */
+               if ((i = select(maxFd + 1, &workerset, NULL, NULL, &nowait)) == 0)
+                       return NULL;
+       }
+
+       /* terminate the message with a newline like all other fatal errors */
+       if (i < 0)
+               exit_horribly(modulename, "Error in ListenToWorkers(): %s\n", strerror(errno));
+
+       for (i = 0; i < pstate->numWorkers; i++)
+       {
+               char       *msg;
+
+               if (!FD_ISSET(pstate->parallelSlot[i].pipeRead, &workerset))
+                       continue;
+
+               msg = readMessageFromPipe(pstate->parallelSlot[i].pipeRead);
+               *worker = i;
+               return msg;
+       }
+
+       /* select() reported a readable descriptor, so we cannot get here */
+       Assert(false);
+       return NULL;
+}
+
+/*
+ * This function is executed in the master process.
+ *
+ * It sends a message to a certain worker on the communication channel.
+ */
+static void
+sendMessageToWorker(ParallelState *pstate, int worker, const char *str)
+{
+       int                     len = strlen(str) + 1;
+
+       if (pipewrite(pstate->parallelSlot[worker].pipeWrite, str, len) != len)
+       {
+               /*
+                * If we're already aborting anyway, don't care if we succeed or not.
+                * The child might have gone already.
+                */
+#ifndef WIN32
+               if (!aborting)
+#endif
+                       exit_horribly(modulename,
+                                                 "Error writing to the communication channel: %s\n",
+                                                 strerror(errno));
+       }
+}
+
+/*
+ * The underlying function to read a message from the communication channel
+ * (fd). Reading blocks or not depending on how piperead() behaves for fd;
+ * this function itself has no non-blocking mode.
+ *
+ * Returns one complete, zero-terminated message in a malloc'ed buffer that
+ * the caller must free(), or NULL if the other side closed the connection
+ * or a read error occurred before a complete message was received.
+ */
+static char *
+readMessageFromPipe(int fd)
+{
+       char       *msg;
+       int                     msgsize,
+                               bufsize;
+       int                     ret;
+
+       /*
+        * The problem here is that we need to deal with several possibilities: we
+        * could receive only a partial message or several messages at once. The
+        * caller expects us to return exactly one message however.
+        *
+        * We could either read in as much as we can and keep track of what we
+        * delivered back to the caller or we just read byte by byte. Once we see
+        * (char) 0, we know that it's the message's end. This would be quite
+        * inefficient for more data but since we are reading only on the command
+        * channel, the performance loss does not seem worth the trouble of
+        * keeping internal states for different file descriptors.
+        */
+       bufsize = 64;                           /* could be any number */
+       msg = (char *) pg_malloc(bufsize);
+
+       msgsize = 0;
+       for (;;)
+       {
+               /* there is always room for the byte we are about to read */
+               Assert(msgsize < bufsize);
+               ret = piperead(fd, msg + msgsize, 1);
+
+               /* worker has closed the connection or another error happened */
+               if (ret <= 0)
+               {
+                       /* don't leak the partial message */
+                       free(msg);
+                       return NULL;
+               }
+
+               Assert(ret == 1);
+
+               if (msg[msgsize] == '\0')
+                       return msg;
+
+               msgsize++;
+               if (msgsize == bufsize)
+               {
+                       char       *bigger;
+
+                       /* could be any number */
+                       bufsize += 16;
+
+                       /*
+                        * realloc() leaves the old block alone when it fails, so go
+                        * through a temporary and bail out instead of writing through
+                        * a NULL pointer.
+                        */
+                       bigger = (char *) realloc(msg, bufsize);
+                       if (bigger == NULL)
+                               exit_horribly(modulename, "out of memory\n");
+                       msg = bigger;
+               }
+       }
+}
+
+#ifdef WIN32
+/*
+ * This is a replacement version of pipe for Win32 which allows returned
+ * handles to be used in select(). Note that read/write calls must be replaced
+ * with recv/send.
+ *
+ * The "pipe" is a TCP connection over the loopback interface: handles[1] is
+ * the connecting socket and handles[0] the accepted one. Returns 0 on
+ * success. On failure a message is written, -1 is returned, no socket is
+ * left open and both handles are INVALID_SOCKET.
+ */
+static int
+pgpipe(int handles[2])
+{
+       SOCKET          s;
+       struct sockaddr_in serv_addr;
+       int                     len = sizeof(serv_addr);
+
+       handles[0] = handles[1] = INVALID_SOCKET;
+
+       if ((s = socket(AF_INET, SOCK_STREAM, 0)) == INVALID_SOCKET)
+       {
+               write_msg(modulename, "pgpipe could not create socket: %d\n",
+                                 WSAGetLastError());
+               return -1;
+       }
+
+       /* bind the listening socket to any free port on the loopback address */
+       memset((void *) &serv_addr, 0, sizeof(serv_addr));
+       serv_addr.sin_family = AF_INET;
+       serv_addr.sin_port = htons(0);
+       serv_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+       if (bind(s, (SOCKADDR *) & serv_addr, len) == SOCKET_ERROR)
+       {
+               write_msg(modulename, "pgpipe could not bind: %d\n",
+                                 WSAGetLastError());
+               closesocket(s);
+               return -1;
+       }
+       if (listen(s, 1) == SOCKET_ERROR)
+       {
+               write_msg(modulename, "pgpipe could not listen: %d\n",
+                                 WSAGetLastError());
+               closesocket(s);
+               return -1;
+       }
+
+       /* find out which port we were given so that we can connect to it */
+       if (getsockname(s, (SOCKADDR *) & serv_addr, &len) == SOCKET_ERROR)
+       {
+               write_msg(modulename, "pgpipe could not getsockname: %d\n",
+                                 WSAGetLastError());
+               closesocket(s);
+               return -1;
+       }
+       if ((handles[1] = socket(PF_INET, SOCK_STREAM, 0)) == INVALID_SOCKET)
+       {
+               write_msg(modulename, "pgpipe could not create socket 2: %d\n",
+                                 WSAGetLastError());
+               closesocket(s);
+               return -1;
+       }
+
+       if (connect(handles[1], (SOCKADDR *) & serv_addr, len) == SOCKET_ERROR)
+       {
+               write_msg(modulename, "pgpipe could not connect socket: %d\n",
+                                 WSAGetLastError());
+               /* don't leak the second socket either */
+               closesocket(handles[1]);
+               handles[1] = INVALID_SOCKET;
+               closesocket(s);
+               return -1;
+       }
+       if ((handles[0] = accept(s, (SOCKADDR *) & serv_addr, &len)) == INVALID_SOCKET)
+       {
+               write_msg(modulename, "pgpipe could not accept socket: %d\n",
+                                 WSAGetLastError());
+               closesocket(handles[1]);
+               handles[1] = INVALID_SOCKET;
+               closesocket(s);
+               return -1;
+       }
+
+       /* the listening socket has done its job */
+       closesocket(s);
+       return 0;
+}
+
+/*
+ * Win32 counterpart of a pipe read: receives up to len bytes from socket s
+ * into buf and returns what recv() returned, except that a connection reset
+ * is reported as 0.
+ */
+static int
+piperead(int s, char *buf, int len)
+{
+       int                     nread = recv(s, buf, len, 0);
+
+       /* EOF on the pipe! (win32 socket based implementation) */
+       if (nread < 0 && WSAGetLastError() == WSAECONNRESET)
+               return 0;
+
+       return nread;
+}
+
+#endif
diff --git a/src/bin/pg_dump/parallel.h b/src/bin/pg_dump/parallel.h

new file mode 100644 (file)

index 0000000..858b2a0
--- /dev/null
+++ b/src/bin/pg_dump/parallel.h
@@ -0,0 +1,85 @@
+/*-------------------------------------------------------------------------
+ *
+ * parallel.h
+ *
+ *     Parallel support header file for the pg_dump archiver
+ *
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *     The author is not responsible for loss or damages that may
+ *     result from its use.
+ *
+ * IDENTIFICATION
+ *             src/bin/pg_dump/parallel.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "pg_backup_db.h"
+
+struct _archiveHandle;
+struct _tocEntry;
+
+typedef enum                   /* per-slot state, kept in ParallelSlot.workerStatus */
+{
+       WRKR_TERMINATED = 0,    /* explicitly the zero value */
+       WRKR_IDLE,              /* presumably what GetIdleWorker() looks for -- confirm in parallel.c */
+       WRKR_WORKING,
+       WRKR_FINISHED           /* presumably result pending for ReapWorkerStatus() -- confirm */
+}      T_WorkerStatus;
+
+typedef enum T_Action          /* kind of job given to DispatchJobForTocEntry() */
+{
+       ACT_DUMP,               /* used by WriteDataChunks() */
+       ACT_RESTORE             /* used by restore_toc_entries_parallel(); no trailing comma: C89 forbids it */
+}      T_Action;
+
+/* Arguments needed for a worker (a process on Unix, a thread on Windows) */
+typedef struct ParallelArgs
+{
+       struct _archiveHandle *AH;      /* presumably the archive handle the worker uses -- confirm */
+       struct _tocEntry *te;           /* presumably the TOC entry the worker processes -- confirm */
+}      ParallelArgs;
+
+/* State for each parallel activity slot */
+typedef struct ParallelSlot
+{
+       ParallelArgs *args;             /* AH/te pair for this slot (see ParallelArgs) */
+       T_WorkerStatus workerStatus;    /* one of the WRKR_* values */
+       int                     status;         /* NOTE(review): presumably the value handed back by ReapWorkerStatus() -- confirm */
+       int                     pipeRead;       /* NOTE(review): pipeRead/pipeWrite and the "Rev" pair look like */
+       int                     pipeWrite;      /* the two directions of master<->worker messaging; which end */
+       int                     pipeRevRead;    /* belongs to which side is not visible in this header -- */
+       int                     pipeRevWrite;   /* confirm against parallel.c */
+#ifdef WIN32
+       uintptr_t       hThread;        /* WIN32 builds track the worker by thread handle ... */
+       unsigned int threadId;          /* ... and thread id */
+#else
+       pid_t           pid;            /* non-WIN32 builds track the worker by process id */
+#endif
+} ParallelSlot;
+
+#define NO_SLOT (-1)
+
+typedef struct ParallelState   /* handle passed to the worker-management functions declared below */
+{
+       int                     numWorkers;     /* WriteDataChunks() dispatches to workers only when this is > 1 */
+       ParallelSlot *parallelSlot;     /* presumably an array of numWorkers slots -- confirm in parallel.c */
+} ParallelState;
+
+extern int     GetIdleWorker(ParallelState *pstate);
+extern bool IsEveryWorkerIdle(ParallelState *pstate);
+extern void ListenToWorkers(struct _archiveHandle * AH, ParallelState *pstate, bool do_wait);
+extern int     ReapWorkerStatus(ParallelState *pstate, int *status);
+extern void EnsureIdleWorker(struct _archiveHandle * AH, ParallelState *pstate);
+extern void EnsureWorkersFinished(struct _archiveHandle * AH, ParallelState *pstate);
+
+extern ParallelState *ParallelBackupStart(struct _archiveHandle * AH,
+                                       RestoreOptions *ropt);
+extern void DispatchJobForTocEntry(struct _archiveHandle * AH,
+                                          ParallelState *pstate,
+                                          struct _tocEntry * te, T_Action act);
+extern void ParallelBackupEnd(struct _archiveHandle * AH, ParallelState *pstate);
+
+extern void checkAborting(struct _archiveHandle * AH);
diff --git a/src/bin/pg_dump/pg_backup.h b/src/bin/pg_dump/pg_backup.h

index 473670ddd3748a5e99f354143dc4eb2d568a55c2..b456f959692ba07c15302f183cc04849f18d1098 100644 (file)
--- a/src/bin/pg_dump/pg_backup.h
+++ b/src/bin/pg_dump/pg_backup.h
@@ -82,9 +82,14 @@ struct Archive
         int                     minRemoteVersion;               /* allowable range */
         int                     maxRemoteVersion;
  
+       int                     numWorkers;             /* number of parallel processes */
+       char       *sync_snapshot_id;           /* sync snapshot id for parallel
+                                                                                * operation */
+
         /* info needed for string escaping */
         int                     encoding;               /* libpq code for client_encoding */
         bool            std_strings;    /* standard_conforming_strings */
+       char       *use_role;           /* Issue SET ROLE to this */
  
         /* error handling */
         bool            exit_on_error;  /* whether to exit on SQL errors... */
@@ -142,11 +147,12 @@ typedef struct _restoreOptions
         int                     suppressDumpWarnings;   /* Suppress output of WARNING entries
                                                                                  * to stderr */
         bool            single_txn;
-       int                     number_of_jobs;
  
         bool       *idWanted;           /* array showing which dump IDs to emit */
  } RestoreOptions;
  
+typedef void (*SetupWorkerPtr) (Archive *AH, RestoreOptions *ropt);
+
  /*
   * Main archiver interface.
   */
@@ -189,7 +195,8 @@ extern Archive *OpenArchive(const char *FileSpec, const ArchiveFormat fmt);
  
  /* Create a new archive */
  extern Archive *CreateArchive(const char *FileSpec, const ArchiveFormat fmt,
-                         const int compression, ArchiveMode mode);
+                         const int compression, ArchiveMode mode,
+                         SetupWorkerPtr setupDumpWorker);
  
  /* The --list option */
  extern void PrintTOCSummary(Archive *AH, RestoreOptions *ropt);
diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c

index 19d12788d9d0d202adbc116a30c81da6fcfa609e..3c2671bb2d5e050de3e90377facd005f2a18f0f9 100644 (file)
--- a/src/bin/pg_dump/pg_backup_archiver.c
+++ b/src/bin/pg_dump/pg_backup_archiver.c
@@ -22,8 +22,10 @@
  
  #include "pg_backup_db.h"
  #include "dumputils.h"
+#include "parallel.h"
  
  #include <ctype.h>
+#include <fcntl.h>
  #include <unistd.h>
  #include <sys/stat.h>
  #include <sys/types.h>
@@ -35,72 +37,6 @@
  
  #include "libpq/libpq-fs.h"
  
-/*
- * Special exit values from worker children.  We reserve 0 for normal
- * success; 1 and other small values should be interpreted as crashes.
- */
-#define WORKER_CREATE_DONE             10
-#define WORKER_INHIBIT_DATA            11
-#define WORKER_IGNORED_ERRORS  12
-
-/*
- * Unix uses exit to return result from worker child, so function is void.
- * Windows thread result comes via function return.
- */
-#ifndef WIN32
-#define parallel_restore_result void
-#else
-#define parallel_restore_result DWORD
-#endif
-
-/* IDs for worker children are either PIDs or thread handles */
-#ifndef WIN32
-#define thandle pid_t
-#else
-#define thandle HANDLE
-#endif
-
-typedef struct ParallelStateEntry
-{
-#ifdef WIN32
-       unsigned int threadId;
-#else
-       pid_t           pid;
-#endif
-       ArchiveHandle *AH;
-} ParallelStateEntry;
-
-typedef struct ParallelState
-{
-       int                     numWorkers;
-       ParallelStateEntry *pse;
-} ParallelState;
-
-/* Arguments needed for a worker child */
-typedef struct _restore_args
-{
-       ArchiveHandle *AH;
-       TocEntry   *te;
-       ParallelStateEntry *pse;
-} RestoreArgs;
-
-/* State for each parallel activity slot */
-typedef struct _parallel_slot
-{
-       thandle         child_id;
-       RestoreArgs *args;
-} ParallelSlot;
-
-typedef struct ShutdownInformation
-{
-       ParallelState *pstate;
-       Archive    *AHX;
-} ShutdownInformation;
-
-static ShutdownInformation shutdown_info;
-
-#define NO_SLOT (-1)
-
  #define TEXT_DUMP_HEADER "--\n-- PostgreSQL database dump\n--\n\n"
  #define TEXT_DUMPALL_HEADER "--\n-- PostgreSQL database cluster dump\n--\n\n"
  
@@ -116,7 +52,7 @@ static const char *modulename = gettext_noop("archiver");
  
  
  static ArchiveHandle *_allocAH(const char *FileSpec, const ArchiveFormat fmt,
-                const int compression, ArchiveMode mode);
+        const int compression, ArchiveMode mode, SetupWorkerPtr setupWorkerPtr);
  static void _getObjectDescription(PQExpBuffer buf, TocEntry *te,
                                           ArchiveHandle *AH);
  static void _printTocEntry(ArchiveHandle *AH, TocEntry *te, RestoreOptions *ropt, bool isData, bool acl_pass);
@@ -136,7 +72,6 @@ static bool _tocEntryIsACL(TocEntry *te);
  static void _disableTriggersIfNecessary(ArchiveHandle *AH, TocEntry *te, RestoreOptions *ropt);
  static void _enableTriggersIfNecessary(ArchiveHandle *AH, TocEntry *te, RestoreOptions *ropt);
  static void buildTocEntryArrays(ArchiveHandle *AH);
-static TocEntry *getTocEntryByDumpId(ArchiveHandle *AH, DumpId id);
  static void _moveBefore(ArchiveHandle *AH, TocEntry *pos, TocEntry *te);
  static int     _discoverArchiveFormat(ArchiveHandle *AH);
  
@@ -149,21 +84,19 @@ static void RestoreOutput(ArchiveHandle *AH, OutputContext savedContext);
  
  static int restore_toc_entry(ArchiveHandle *AH, TocEntry *te,
                                   RestoreOptions *ropt, bool is_parallel);
-static void restore_toc_entries_parallel(ArchiveHandle *AH);
-static thandle spawn_restore(RestoreArgs *args);
-static thandle reap_child(ParallelSlot *slots, int n_slots, int *work_status);
-static bool work_in_progress(ParallelSlot *slots, int n_slots);
-static int     get_next_slot(ParallelSlot *slots, int n_slots);
+static void restore_toc_entries_prefork(ArchiveHandle *AH);
+static void restore_toc_entries_parallel(ArchiveHandle *AH, ParallelState *pstate,
+                                                        TocEntry *pending_list);
+static void restore_toc_entries_postfork(ArchiveHandle *AH, TocEntry *pending_list);
  static void par_list_header_init(TocEntry *l);
  static void par_list_append(TocEntry *l, TocEntry *te);
  static void par_list_remove(TocEntry *te);
  static TocEntry *get_next_work_item(ArchiveHandle *AH,
                                    TocEntry *ready_list,
-                                  ParallelSlot *slots, int n_slots);
-static parallel_restore_result parallel_restore(RestoreArgs *args);
+                                  ParallelState *pstate);
  static void mark_work_done(ArchiveHandle *AH, TocEntry *ready_list,
-                          thandle worker, int status,
-                          ParallelSlot *slots, int n_slots);
+                          int worker, int status,
+                          ParallelState *pstate);
  static void fix_dependencies(ArchiveHandle *AH);
  static bool has_lock_conflicts(TocEntry *te1, TocEntry *te2);
  static void repoint_table_dependencies(ArchiveHandle *AH);
@@ -172,14 +105,6 @@ static void reduce_dependencies(ArchiveHandle *AH, TocEntry *te,
                                         TocEntry *ready_list);
  static void mark_create_done(ArchiveHandle *AH, TocEntry *te);
  static void inhibit_data_for_failed_table(ArchiveHandle *AH, TocEntry *te);
-static ArchiveHandle *CloneArchive(ArchiveHandle *AH);
-static void DeCloneArchive(ArchiveHandle *AH);
-
-static void setProcessIdentifier(ParallelStateEntry *pse, ArchiveHandle *AH);
-static void unsetProcessIdentifier(ParallelStateEntry *pse);
-static ParallelStateEntry *GetMyPSEntry(ParallelState *pstate);
-static void archive_close_connection(int code, void *arg);
-
  
  /*
   *     Wrapper functions.
@@ -189,15 +114,28 @@ static void archive_close_connection(int code, void *arg);
   *
   */
  
+/*
+ * The dump worker setup needs lots of knowledge of the internals of pg_dump,
+ * so it's defined in pg_dump.c and passed into CreateArchive. The restore
+ * worker setup doesn't need to know anything much, so it's defined here.
+ */
+static void
+setupRestoreWorker(Archive *AHX, RestoreOptions *ropt)
+{
+       ArchiveHandle *AH = (ArchiveHandle *) AHX;
+
+       (AH->ReopenPtr) (AH);   /* the only setup step; ropt is not used here */
+}
+
  
  /* Create a new archive */
  /* Public */
  Archive *
  CreateArchive(const char *FileSpec, const ArchiveFormat fmt,
-                         const int compression, ArchiveMode mode)
+        const int compression, ArchiveMode mode, SetupWorkerPtr setupDumpWorker)
  
  {
-       ArchiveHandle *AH = _allocAH(FileSpec, fmt, compression, mode);
+       ArchiveHandle *AH = _allocAH(FileSpec, fmt, compression, mode, setupDumpWorker);
  
         return (Archive *) AH;
  }
@@ -207,7 +145,7 @@ CreateArchive(const char *FileSpec, const ArchiveFormat fmt,
  Archive *
  OpenArchive(const char *FileSpec, const ArchiveFormat fmt)
  {
-       ArchiveHandle *AH = _allocAH(FileSpec, fmt, 0, archModeRead);
+       ArchiveHandle *AH = _allocAH(FileSpec, fmt, 0, archModeRead, setupRestoreWorker);
  
         return (Archive *) AH;
  }
@@ -311,7 +249,7 @@ RestoreArchive(Archive *AHX)
         /*
          * If we're going to do parallel restore, there are some restrictions.
          */
-       parallel_mode = (ropt->number_of_jobs > 1 && ropt->useDB);
+       parallel_mode = (AH->public.numWorkers > 1 && ropt->useDB);
         if (parallel_mode)
         {
                 /* We haven't got round to making this work for all archive formats */
@@ -499,7 +437,25 @@ RestoreArchive(Archive *AHX)
          * In parallel mode, turn control over to the parallel-restore logic.
          */
         if (parallel_mode)
-               restore_toc_entries_parallel(AH);
+       {
+               ParallelState *pstate;
+               TocEntry        pending_list;
+
+               par_list_header_init(&pending_list);
+
+               /* This runs PRE_DATA items and then disconnects from the database */
+               restore_toc_entries_prefork(AH);
+               Assert(AH->connection == NULL);
+
+               /* ParallelBackupStart() will actually fork the processes */
+               pstate = ParallelBackupStart(AH, ropt);
+               restore_toc_entries_parallel(AH, pstate, &pending_list);
+               ParallelBackupEnd(AH, pstate);
+
+               /* reconnect the master and see if we missed something */
+               restore_toc_entries_postfork(AH, &pending_list);
+               Assert(AH->connection != NULL);
+       }
         else
         {
                 for (te = AH->toc->next; te != AH->toc; te = te->next)
@@ -558,7 +514,7 @@ static int
  restore_toc_entry(ArchiveHandle *AH, TocEntry *te,
                                   RestoreOptions *ropt, bool is_parallel)
  {
-       int                     retval = 0;
+       int                     status = WORKER_OK;
         teReqs          reqs;
         bool            defnDumped;
  
@@ -611,7 +567,7 @@ restore_toc_entry(ArchiveHandle *AH, TocEntry *te,
                                 if (ropt->noDataForFailedTables)
                                 {
                                         if (is_parallel)
-                                               retval = WORKER_INHIBIT_DATA;
+                                               status = WORKER_INHIBIT_DATA;
                                         else
                                                 inhibit_data_for_failed_table(AH, te);
                                 }
@@ -626,7 +582,7 @@ restore_toc_entry(ArchiveHandle *AH, TocEntry *te,
                                  * just set the return value.
                                  */
                                 if (is_parallel)
-                                       retval = WORKER_CREATE_DONE;
+                                       status = WORKER_CREATE_DONE;
                                 else
                                         mark_create_done(AH, te);
                         }
@@ -744,7 +700,10 @@ restore_toc_entry(ArchiveHandle *AH, TocEntry *te,
                 }
         }
  
-       return retval;
+       if (AH->public.n_errors > 0 && status == WORKER_OK)
+               status = WORKER_IGNORED_ERRORS;
+
+       return status;
  }
  
  /*
@@ -1634,7 +1593,7 @@ buildTocEntryArrays(ArchiveHandle *AH)
         }
  }
  
-static TocEntry *
+TocEntry *
  getTocEntryByDumpId(ArchiveHandle *AH, DumpId id)
  {
         /* build index arrays if we didn't already */
@@ -2018,7 +1977,7 @@ _discoverArchiveFormat(ArchiveHandle *AH)
   */
  static ArchiveHandle *
  _allocAH(const char *FileSpec, const ArchiveFormat fmt,
-                const int compression, ArchiveMode mode)
+         const int compression, ArchiveMode mode, SetupWorkerPtr setupWorkerPtr)
  {
         ArchiveHandle *AH;
  
@@ -2100,6 +2059,8 @@ _allocAH(const char *FileSpec, const ArchiveFormat fmt,
         }
  #endif
  
+       AH->SetupWorkerPtr = setupWorkerPtr;
+
         if (fmt == archUnknown)
                 AH->format = _discoverArchiveFormat(AH);
         else
@@ -2132,50 +2093,66 @@ _allocAH(const char *FileSpec, const ArchiveFormat fmt,
         return AH;
  }
  
-
  void
-WriteDataChunks(ArchiveHandle *AH)
+WriteDataChunks(ArchiveHandle *AH, ParallelState *pstate)
  {
         TocEntry   *te;
-       StartDataPtr startPtr;
-       EndDataPtr      endPtr;
  
         for (te = AH->toc->next; te != AH->toc; te = te->next)
         {
-               if (te->dataDumper != NULL && (te->reqs & REQ_DATA) != 0)
-               {
-                       AH->currToc = te;
-                       /* printf("Writing data for %d (%x)\n", te->id, te); */
-
-                       if (strcmp(te->desc, "BLOBS") == 0)
-                       {
-                               startPtr = AH->StartBlobsPtr;
-                               endPtr = AH->EndBlobsPtr;
-                       }
-                       else
-                       {
-                               startPtr = AH->StartDataPtr;
-                               endPtr = AH->EndDataPtr;
-                       }
+               if (!te->dataDumper)
+                       continue;
  
-                       if (startPtr != NULL)
-                               (*startPtr) (AH, te);
+               if ((te->reqs & REQ_DATA) == 0)
+                       continue;
  
+               if (pstate && pstate->numWorkers > 1)
+               {
                         /*
-                        * printf("Dumper arg for %d is %x\n", te->id, te->dataDumperArg);
+                        * If we are in a parallel backup, then we are always the master
+                        * process.
                          */
+                       EnsureIdleWorker(AH, pstate);
+                       Assert(GetIdleWorker(pstate) != NO_SLOT);
+                       DispatchJobForTocEntry(AH, pstate, te, ACT_DUMP);
+               }
+               else
+                       WriteDataChunksForTocEntry(AH, te);
+       }
+       EnsureWorkersFinished(AH, pstate);
+}
  
-                       /*
-                        * The user-provided DataDumper routine needs to call
-                        * AH->WriteData
-                        */
-                       (*te->dataDumper) ((Archive *) AH, te->dataDumperArg);
+void
+WriteDataChunksForTocEntry(ArchiveHandle *AH, TocEntry *te)
+{
+       StartDataPtr startPtr;
+       EndDataPtr      endPtr;
  
-                       if (endPtr != NULL)
-                               (*endPtr) (AH, te);
-                       AH->currToc = NULL;
-               }
+       AH->currToc = te;
+
+       if (strcmp(te->desc, "BLOBS") == 0)
+       {
+               startPtr = AH->StartBlobsPtr;
+               endPtr = AH->EndBlobsPtr;
         }
+       else
+       {
+               startPtr = AH->StartDataPtr;
+               endPtr = AH->EndDataPtr;
+       }
+
+       if (startPtr != NULL)
+               (*startPtr) (AH, te);
+
+       /*
+        * The user-provided DataDumper routine needs to call AH->WriteData
+        */
+       (*te->dataDumper) ((Archive *) AH, te->dataDumperArg);
+
+       if (endPtr != NULL)
+               (*endPtr) (AH, te);
+
+       AH->currToc = NULL;
  }
  
  void
@@ -2911,7 +2888,7 @@ _getObjectDescription(PQExpBuffer buf, TocEntry *te, ArchiveHandle *AH)
         const char *type = te->desc;
  
         /* Use ALTER TABLE for views and sequences */
-       if (strcmp(type, "VIEW") == 0 || strcmp(type, "SEQUENCE") == 0||
+       if (strcmp(type, "VIEW") == 0 || strcmp(type, "SEQUENCE") == 0 ||
                 strcmp(type, "MATERIALIZED VIEW") == 0)
                 type = "TABLE";
  
@@ -3404,67 +3381,6 @@ dumpTimestamp(ArchiveHandle *AH, const char *msg, time_t tim)
                 ahprintf(AH, "-- %s %s\n\n", msg, buf);
  }
  
-static void
-setProcessIdentifier(ParallelStateEntry *pse, ArchiveHandle *AH)
-{
-#ifdef WIN32
-       pse->threadId = GetCurrentThreadId();
-#else
-       pse->pid = getpid();
-#endif
-       pse->AH = AH;
-}
-
-static void
-unsetProcessIdentifier(ParallelStateEntry *pse)
-{
-#ifdef WIN32
-       pse->threadId = 0;
-#else
-       pse->pid = 0;
-#endif
-       pse->AH = NULL;
-}
-
-static ParallelStateEntry *
-GetMyPSEntry(ParallelState *pstate)
-{
-       int                     i;
-
-       for (i = 0; i < pstate->numWorkers; i++)
-#ifdef WIN32
-               if (pstate->pse[i].threadId == GetCurrentThreadId())
-#else
-               if (pstate->pse[i].pid == getpid())
-#endif
-                       return &(pstate->pse[i]);
-
-       return NULL;
-}
-
-static void
-archive_close_connection(int code, void *arg)
-{
-       ShutdownInformation *si = (ShutdownInformation *) arg;
-
-       if (si->pstate)
-       {
-               ParallelStateEntry *entry = GetMyPSEntry(si->pstate);
-
-               if (entry != NULL && entry->AH)
-                       DisconnectDatabase(&(entry->AH->public));
-       }
-       else if (si->AHX)
-               DisconnectDatabase(si->AHX);
-}
-
-void
-on_exit_close_archive(Archive *AHX)
-{
-       shutdown_info.AHX = AHX;
-       on_exit_nicely(archive_close_connection, &shutdown_info);
-}
-
  /*
   * Main engine for parallel restore.
   *
@@ -3477,30 +3393,13 @@ on_exit_close_archive(Archive *AHX)
   * RestoreArchive).
   */
  static void
-restore_toc_entries_parallel(ArchiveHandle *AH)
+restore_toc_entries_prefork(ArchiveHandle *AH)
  {
         RestoreOptions *ropt = AH->ropt;
-       int                     n_slots = ropt->number_of_jobs;
-       ParallelSlot *slots;
-       int                     work_status;
-       int                     next_slot;
         bool            skipped_some;
-       TocEntry        pending_list;
-       TocEntry        ready_list;
         TocEntry   *next_work_item;
-       thandle         ret_child;
-       TocEntry   *te;
-       ParallelState *pstate;
-       int                     i;
-
-       ahlog(AH, 2, "entering restore_toc_entries_parallel\n");
  
-       slots = (ParallelSlot *) pg_malloc0(n_slots * sizeof(ParallelSlot));
-       pstate = (ParallelState *) pg_malloc(sizeof(ParallelState));
-       pstate->pse = (ParallelStateEntry *) pg_malloc0(n_slots * sizeof(ParallelStateEntry));
-       pstate->numWorkers = ropt->number_of_jobs;
-       for (i = 0; i < pstate->numWorkers; i++)
-               unsetProcessIdentifier(&(pstate->pse[i]));
+       ahlog(AH, 2, "entering restore_toc_entries_prefork\n");
  
         /* Adjust dependency information */
         fix_dependencies(AH);
@@ -3509,7 +3408,7 @@ restore_toc_entries_parallel(ArchiveHandle *AH)
          * Do all the early stuff in a single connection in the parent. There's no
          * great point in running it in parallel, in fact it will actually run
          * faster in a single connection because we avoid all the connection and
-        * setup overhead.  Also, pre-9.2 pg_dump versions were not very good
+        * setup overhead.      Also, pre-9.2 pg_dump versions were not very good
          * about showing all the dependencies of SECTION_PRE_DATA items, so we do
          * not risk trying to process them out-of-order.
          *
@@ -3561,12 +3460,6 @@ restore_toc_entries_parallel(ArchiveHandle *AH)
          */
         DisconnectDatabase(&AH->public);
  
-       /*
-        * Set the pstate in the shutdown_info. The exit handler uses pstate if
-        * set and falls back to AHX otherwise.
-        */
-       shutdown_info.pstate = pstate;
-
         /* blow away any transient state from the old connection */
         if (AH->currUser)
                 free(AH->currUser);
@@ -3578,17 +3471,42 @@ restore_toc_entries_parallel(ArchiveHandle *AH)
                 free(AH->currTablespace);
         AH->currTablespace = NULL;
         AH->currWithOids = -1;
+}
+
+/*
+ * Main engine for parallel restore.
+ *
+ * Work is done in three phases.
+ * First we process all SECTION_PRE_DATA tocEntries, in a single connection,
+ * just as for a standard restore. This is done in restore_toc_entries_prefork().
+ * Second we process the remaining non-ACL steps in parallel worker children
+ * (threads on Windows, processes on Unix); these fork off and set up their
+ * connections before we call restore_toc_entries_parallel().
+ * Finally we process all the ACL entries in a single connection (that happens
+ * back in RestoreArchive).
+ */
+static void
+restore_toc_entries_parallel(ArchiveHandle *AH, ParallelState *pstate,
+                                                        TocEntry *pending_list)
+{
+       int                     work_status;
+       bool            skipped_some;
+       TocEntry        ready_list;
+       TocEntry   *next_work_item;
+       int                     ret_child;
+
+       ahlog(AH, 2, "entering restore_toc_entries_parallel\n");
  
         /*
-        * Initialize the lists of pending and ready items.  After this setup, the
-        * pending list is everything that needs to be done but is blocked by one
-        * or more dependencies, while the ready list contains items that have no
-        * remaining dependencies.      Note: we don't yet filter out entries that
-        * aren't going to be restored.  They might participate in dependency
+        * Initialize the list of ready items; the list of pending items has
+        * already been initialized by the caller.      After this setup, the pending
+        * list is everything that needs to be done but is blocked by one or more
+        * dependencies, while the ready list contains items that have no
+        * remaining dependencies. Note: we don't yet filter out entries that
+        * aren't going to be restored. They might participate in dependency
          * chains connecting entries that should be restored, so we treat them as
          * live until we actually process them.
          */
-       par_list_header_init(&pending_list);
         par_list_header_init(&ready_list);
         skipped_some = false;
         for (next_work_item = AH->toc->next; next_work_item != AH->toc; next_work_item = next_work_item->next)
@@ -3613,7 +3531,7 @@ restore_toc_entries_parallel(ArchiveHandle *AH)
                 }
  
                 if (next_work_item->depCount > 0)
-                       par_list_append(&pending_list, next_work_item);
+                       par_list_append(pending_list, next_work_item);
                 else
                         par_list_append(&ready_list, next_work_item);
         }
@@ -3627,9 +3545,8 @@ restore_toc_entries_parallel(ArchiveHandle *AH)
  
         ahlog(AH, 1, "entering main parallel loop\n");
  
-       while ((next_work_item = get_next_work_item(AH, &ready_list,
-                                                                                               slots, n_slots)) != NULL ||
-                  work_in_progress(slots, n_slots))
+       while ((next_work_item = get_next_work_item(AH, &ready_list, pstate)) != NULL ||
+                  !IsEveryWorkerIdle(pstate))
         {
                 if (next_work_item != NULL)
                 {
@@ -3647,62 +3564,72 @@ restore_toc_entries_parallel(ArchiveHandle *AH)
                                 continue;
                         }
  
-                       if ((next_slot = get_next_slot(slots, n_slots)) != NO_SLOT)
-                       {
-                               /* There is work still to do and a worker slot available */
-                               thandle         child;
-                               RestoreArgs *args;
+                       ahlog(AH, 1, "launching item %d %s %s\n",
+                                 next_work_item->dumpId,
+                                 next_work_item->desc, next_work_item->tag);
  
-                               ahlog(AH, 1, "launching item %d %s %s\n",
-                                         next_work_item->dumpId,
-                                         next_work_item->desc, next_work_item->tag);
+                       par_list_remove(next_work_item);
  
-                               par_list_remove(next_work_item);
-
-                               /* this memory is dealloced in mark_work_done() */
-                               args = pg_malloc(sizeof(RestoreArgs));
-                               args->AH = CloneArchive(AH);
-                               args->te = next_work_item;
-                               args->pse = &pstate->pse[next_slot];
+                       Assert(GetIdleWorker(pstate) != NO_SLOT);
+                       DispatchJobForTocEntry(AH, pstate, next_work_item, ACT_RESTORE);
+               }
+               else
+                       /* at least one child is working and we have nothing ready. */
+                       Assert(!IsEveryWorkerIdle(pstate));
  
-                               /* run the step in a worker child */
-                               child = spawn_restore(args);
+               for (;;)
+               {
+                       int                     nTerm = 0;
  
-                               slots[next_slot].child_id = child;
-                               slots[next_slot].args = args;
+                       /*
+                        * In order to reduce dependencies as soon as possible and
+                        * especially to reap the status of workers who are working on
+                        * items that pending items depend on, we do a non-blocking check
+                        * for ended workers first.
+                        *
+                        * However, if we do not have any other work items currently that
+                        * workers can work on, we do not busy-loop here but instead
+                        * really wait for at least one worker to terminate. Hence we call
+                        * ListenToWorkers(..., ..., do_wait = true) in this case.
+                        */
+                       ListenToWorkers(AH, pstate, !next_work_item);
  
-                               continue;
+                       while ((ret_child = ReapWorkerStatus(pstate, &work_status)) != NO_SLOT)
+                       {
+                               nTerm++;
+                               mark_work_done(AH, &ready_list, ret_child, work_status, pstate);
                         }
-               }
  
-               /*
-                * If we get here there must be work being done.  Either there is no
-                * work available to schedule (and work_in_progress returned true) or
-                * there are no slots available.  So we wait for a worker to finish,
-                * and process the result.
-                */
-               ret_child = reap_child(slots, n_slots, &work_status);
+                       /*
+                        * We need to make sure that we have an idle worker before
+                        * re-running the loop. If nTerm > 0 we already have that (quick
+                        * check).
+                        */
+                       if (nTerm > 0)
+                               break;
  
-               if (WIFEXITED(work_status))
-               {
-                       mark_work_done(AH, &ready_list,
-                                                  ret_child, WEXITSTATUS(work_status),
-                                                  slots, n_slots);
-               }
-               else
-               {
-                       exit_horribly(modulename, "worker process crashed: status %d\n",
-                                                 work_status);
+                       /* if nobody terminated, explicitly check for an idle worker */
+                       if (GetIdleWorker(pstate) != NO_SLOT)
+                               break;
+
+                       /*
+                        * If we have no idle worker, read the result of one or more
+                        * workers and loop the loop to call ReapWorkerStatus() on them.
+                        */
+                       ListenToWorkers(AH, pstate, true);
                 }
         }
  
         ahlog(AH, 1, "finished main parallel loop\n");
+}
  
-       /*
-        * Remove the pstate again, so the exit handler will now fall back to
-        * closing AH->connection again.
-        */
-       shutdown_info.pstate = NULL;
+static void
+restore_toc_entries_postfork(ArchiveHandle *AH, TocEntry *pending_list)
+{
+       RestoreOptions *ropt = AH->ropt;
+       TocEntry   *te;
+
+       ahlog(AH, 2, "entering restore_toc_entries_postfork\n");
  
         /*
          * Now reconnect the single parent connection.
@@ -3718,7 +3645,7 @@ restore_toc_entries_parallel(ArchiveHandle *AH)
          * dependencies, or some other pathological condition. If so, do it in the
          * single parent connection.
          */
-       for (te = pending_list.par_next; te != &pending_list; te = te->par_next)
+       for (te = pending_list->par_next; te != pending_list; te = te->par_next)
         {
                 ahlog(AH, 1, "processing missed item %d %s %s\n",
                           te->dumpId, te->desc, te->tag);
@@ -3728,121 +3655,6 @@ restore_toc_entries_parallel(ArchiveHandle *AH)
         /* The ACLs will be handled back in RestoreArchive. */
  }
  
-/*
- * create a worker child to perform a restore step in parallel
- */
-static thandle
-spawn_restore(RestoreArgs *args)
-{
-       thandle         child;
-
-       /* Ensure stdio state is quiesced before forking */
-       fflush(NULL);
-
-#ifndef WIN32
-       child = fork();
-       if (child == 0)
-       {
-               /* in child process */
-               parallel_restore(args);
-               exit_horribly(modulename,
-                                         "parallel_restore should not return\n");
-       }
-       else if (child < 0)
-       {
-               /* fork failed */
-               exit_horribly(modulename,
-                                         "could not create worker process: %s\n",
-                                         strerror(errno));
-       }
-#else
-       child = (HANDLE) _beginthreadex(NULL, 0, (void *) parallel_restore,
-                                                                       args, 0, NULL);
-       if (child == 0)
-               exit_horribly(modulename,
-                                         "could not create worker thread: %s\n",
-                                         strerror(errno));
-#endif
-
-       return child;
-}
-
-/*
- *     collect status from a completed worker child
- */
-static thandle
-reap_child(ParallelSlot *slots, int n_slots, int *work_status)
-{
-#ifndef WIN32
-       /* Unix is so much easier ... */
-       return wait(work_status);
-#else
-       static HANDLE *handles = NULL;
-       int                     hindex,
-                               snum,
-                               tnum;
-       thandle         ret_child;
-       DWORD           res;
-
-       /* first time around only, make space for handles to listen on */
-       if (handles == NULL)
-               handles = (HANDLE *) pg_malloc0(n_slots * sizeof(HANDLE));
-
-       /* set up list of handles to listen to */
-       for (snum = 0, tnum = 0; snum < n_slots; snum++)
-               if (slots[snum].child_id != 0)
-                       handles[tnum++] = slots[snum].child_id;
-
-       /* wait for one to finish */
-       hindex = WaitForMultipleObjects(tnum, handles, false, INFINITE);
-
-       /* get handle of finished thread */
-       ret_child = handles[hindex - WAIT_OBJECT_0];
-
-       /* get the result */
-       GetExitCodeThread(ret_child, &res);
-       *work_status = res;
-
-       /* dispose of handle to stop leaks */
-       CloseHandle(ret_child);
-
-       return ret_child;
-#endif
-}
-
-/*
- * are we doing anything now?
- */
-static bool
-work_in_progress(ParallelSlot *slots, int n_slots)
-{
-       int                     i;
-
-       for (i = 0; i < n_slots; i++)
-       {
-               if (slots[i].child_id != 0)
-                       return true;
-       }
-       return false;
-}
-
-/*
- * find the first free parallel slot (if any).
- */
-static int
-get_next_slot(ParallelSlot *slots, int n_slots)
-{
-       int                     i;
-
-       for (i = 0; i < n_slots; i++)
-       {
-               if (slots[i].child_id == 0)
-                       return i;
-       }
-       return NO_SLOT;
-}
-
-
  /*
   * Check if te1 has an exclusive lock requirement for an item that te2 also
   * requires, whether or not te2's requirement is for an exclusive lock.
@@ -3916,7 +3728,7 @@ par_list_remove(TocEntry *te)
   */
  static TocEntry *
  get_next_work_item(ArchiveHandle *AH, TocEntry *ready_list,
-                                  ParallelSlot *slots, int n_slots)
+                                  ParallelState *pstate)
  {
         bool            pref_non_data = false;  /* or get from AH->ropt */
         TocEntry   *data_te = NULL;
@@ -3931,11 +3743,11 @@ get_next_work_item(ArchiveHandle *AH, TocEntry *ready_list,
         {
                 int                     count = 0;
  
-               for (k = 0; k < n_slots; k++)
-                       if (slots[k].args->te != NULL &&
-                               slots[k].args->te->section == SECTION_DATA)
+               for (k = 0; k < pstate->numWorkers; k++)
+                       if (pstate->parallelSlot[k].args->te != NULL &&
+                               pstate->parallelSlot[k].args->te->section == SECTION_DATA)
                                 count++;
-               if (n_slots == 0 || count * 4 < n_slots)
+               if (pstate->numWorkers == 0 || count * 4 < pstate->numWorkers)
                         pref_non_data = false;
         }
  
@@ -3951,13 +3763,13 @@ get_next_work_item(ArchiveHandle *AH, TocEntry *ready_list,
                  * that a currently running item also needs lock on, or vice versa. If
                  * so, we don't want to schedule them together.
                  */
-               for (i = 0; i < n_slots && !conflicts; i++)
+               for (i = 0; i < pstate->numWorkers && !conflicts; i++)
                 {
                         TocEntry   *running_te;
  
-                       if (slots[i].args == NULL)
+                       if (pstate->parallelSlot[i].workerStatus != WRKR_WORKING)
                                 continue;
-                       running_te = slots[i].args->te;
+                       running_te = pstate->parallelSlot[i].args->te;
  
                         if (has_lock_conflicts(te, running_te) ||
                                 has_lock_conflicts(running_te, te))
@@ -3992,63 +3804,29 @@ get_next_work_item(ArchiveHandle *AH, TocEntry *ready_list,
  /*
   * Restore a single TOC item in parallel with others
   *
- * this is the procedure run as a thread (Windows) or a
- * separate process (everything else).
+ * this is run in the worker, i.e. in a thread (Windows) or a separate process
+ * (everything else). A worker process executes several such work items during
+ * a parallel backup or restore. Once we terminate here and report back that
+ * our work is finished, the master process will assign us a new work item.
   */
-static parallel_restore_result
-parallel_restore(RestoreArgs *args)
+int
+parallel_restore(ParallelArgs * args)
  {
         ArchiveHandle *AH = args->AH;
         TocEntry   *te = args->te;
         RestoreOptions *ropt = AH->ropt;
-       int                     retval;
-
-       setProcessIdentifier(args->pse, AH);
-
-       /*
-        * Close and reopen the input file so we have a private file pointer that
-        * doesn't stomp on anyone else's file pointer, if we're actually going to
-        * need to read from the file. Otherwise, just close it except on Windows,
-        * where it will possibly be needed by other threads.
-        *
-        * Note: on Windows, since we are using threads not processes, the reopen
-        * call *doesn't* close the original file pointer but just open a new one.
-        */
-       if (te->section == SECTION_DATA)
-               (AH->ReopenPtr) (AH);
-#ifndef WIN32
-       else
-               (AH->ClosePtr) (AH);
-#endif
-
-       /*
-        * We need our own database connection, too
-        */
-       ConnectDatabase((Archive *) AH, ropt->dbname,
-                                       ropt->pghost, ropt->pgport, ropt->username,
-                                       ropt->promptPassword);
+       int                     status;
  
         _doSetFixedOutputState(AH);
  
-       /* Restore the TOC item */
-       retval = restore_toc_entry(AH, te, ropt, true);
-
-       /* And clean up */
-       DisconnectDatabase((Archive *) AH);
-       unsetProcessIdentifier(args->pse);
+       Assert(AH->connection != NULL);
  
-       /* If we reopened the file, we are done with it, so close it now */
-       if (te->section == SECTION_DATA)
-               (AH->ClosePtr) (AH);
+       AH->public.n_errors = 0;
  
-       if (retval == 0 && AH->public.n_errors)
-               retval = WORKER_IGNORED_ERRORS;
+       /* Restore the TOC item */
+       status = restore_toc_entry(AH, te, ropt, true);
  
-#ifndef WIN32
-       exit(retval);
-#else
-       return retval;
-#endif
+       return status;
  }
  
  
@@ -4060,25 +3838,12 @@ parallel_restore(RestoreArgs *args)
   */
  static void
  mark_work_done(ArchiveHandle *AH, TocEntry *ready_list,
-                          thandle worker, int status,
-                          ParallelSlot *slots, int n_slots)
+                          int worker, int status,
+                          ParallelState *pstate)
  {
         TocEntry   *te = NULL;
-       int                     i;
-
-       for (i = 0; i < n_slots; i++)
-       {
-               if (slots[i].child_id == worker)
-               {
-                       slots[i].child_id = 0;
-                       te = slots[i].args->te;
-                       DeCloneArchive(slots[i].args->AH);
-                       free(slots[i].args);
-                       slots[i].args = NULL;
  
-                       break;
-               }
-       }
+       te = pstate->parallelSlot[worker].args->te;
  
         if (te == NULL)
                 exit_horribly(modulename, "could not find slot of finished worker\n");
@@ -4179,8 +3944,8 @@ fix_dependencies(ArchiveHandle *AH)
         /*
          * Count the incoming dependencies for each item.  Also, it is possible
          * that the dependencies list items that are not in the archive at all
-        * (that should not happen in 9.2 and later, but is highly likely in
-        * older archives).  Subtract such items from the depCounts.
+        * (that should not happen in 9.2 and later, but is highly likely in older
+        * archives).  Subtract such items from the depCounts.
          */
         for (te = AH->toc->next; te != AH->toc; te = te->next)
         {
@@ -4377,16 +4142,13 @@ inhibit_data_for_failed_table(ArchiveHandle *AH, TocEntry *te)
         }
  }
  
-
  /*
   * Clone and de-clone routines used in parallel restoration.
   *
   * Enough of the structure is cloned to ensure that there is no
   * conflict between different threads each with their own clone.
- *
- * These could be public, but no need at present.
   */
-static ArchiveHandle *
+ArchiveHandle *
  CloneArchive(ArchiveHandle *AH)
  {
         ArchiveHandle *clone;
@@ -4412,9 +4174,60 @@ CloneArchive(ArchiveHandle *AH)
         /* clone has its own error count, too */
         clone->public.n_errors = 0;
  
+       /*
+        * Connect our new clone object to the database: In parallel restore the
+        * parent is already disconnected, because we can connect the worker
+        * processes independently to the database (no snapshot sync required). In
+        * parallel backup we clone the parent's existing connection.
+        */
+       if (AH->mode == archModeRead)
+       {
+               RestoreOptions *ropt = AH->ropt;
+
+               Assert(AH->connection == NULL);
+               /* this also sets clone->connection */
+               ConnectDatabase((Archive *) clone, ropt->dbname,
+                                               ropt->pghost, ropt->pgport, ropt->username,
+                                               ropt->promptPassword);
+       }
+       else
+       {
+               char       *dbname;
+               char       *pghost;
+               char       *pgport;
+               char       *username;
+               const char *encname;
+
+               Assert(AH->connection != NULL);
+
+               /*
+                * Even though we are technically accessing the parent's database
+                * object here, these functions are fine to be called like that
+                * because they all just return a pointer and do not actually send/receive
+                * any data to/from the database.
+                */
+               dbname = PQdb(AH->connection);
+               pghost = PQhost(AH->connection);
+               pgport = PQport(AH->connection);
+               username = PQuser(AH->connection);
+               encname = pg_encoding_to_char(AH->public.encoding);
+
+               /* this also sets clone->connection */
+               ConnectDatabase((Archive *) clone, dbname, pghost, pgport, username, TRI_NO);
+
+               /*
+                * Set the same encoding, whatever we set here is what we got from
+                * pg_encoding_to_char(), so we really shouldn't run into an error
+                * setting that very same value. Also see the comment in
+                * SetupConnection().
+                */
+               PQsetClientEncoding(clone->connection, encname);
+       }
+
         /* Let the format-specific code have a chance too */
         (clone->ClonePtr) (clone);
  
+       Assert(clone->connection != NULL);
         return clone;
  }
  
@@ -4423,7 +4236,7 @@ CloneArchive(ArchiveHandle *AH)
   *
   * Note: we assume any clone-local connection was already closed.
   */
-static void
+void
  DeCloneArchive(ArchiveHandle *AH)
  {
         /* Clear format-specific state */
diff --git a/src/bin/pg_dump/pg_backup_archiver.h b/src/bin/pg_dump/pg_backup_archiver.h

index 8859bd9776e33a7943c6b5f00951033a4f035371..2f9434efbccf31b9e9b5490d0cb151fa7b7b9eee 100644 (file)
--- a/src/bin/pg_dump/pg_backup_archiver.h
+++ b/src/bin/pg_dump/pg_backup_archiver.h
@@ -100,8 +100,21 @@ typedef z_stream *z_streamp;
  #define K_OFFSET_POS_SET 2
  #define K_OFFSET_NO_DATA 3
  
+/*
+ * Special exit values from worker children.  We reserve 0 for normal
+ * success; 1 and other small values should be interpreted as crashes.
+ */
+#define WORKER_OK                                        0
+#define WORKER_CREATE_DONE                       10
+#define WORKER_INHIBIT_DATA                      11
+#define WORKER_IGNORED_ERRORS            12
+
  struct _archiveHandle;
  struct _tocEntry;
+struct _restoreList;
+struct ParallelArgs;
+struct ParallelState;
+enum T_Action;
  
  typedef void (*ClosePtr) (struct _archiveHandle * AH);
  typedef void (*ReopenPtr) (struct _archiveHandle * AH);
@@ -129,6 +142,13 @@ typedef void (*PrintTocDataPtr) (struct _archiveHandle * AH, struct _tocEntry *
  typedef void (*ClonePtr) (struct _archiveHandle * AH);
  typedef void (*DeClonePtr) (struct _archiveHandle * AH);
  
+typedef char *(*WorkerJobRestorePtr) (struct _archiveHandle * AH, struct _tocEntry * te);
+typedef char *(*WorkerJobDumpPtr) (struct _archiveHandle * AH, struct _tocEntry * te);
+typedef char *(*MasterStartParallelItemPtr) (struct _archiveHandle * AH, struct _tocEntry * te,
+                                                                                                                enum T_Action act);
+typedef int (*MasterEndParallelItemPtr) (struct _archiveHandle * AH, struct _tocEntry * te,
+                                                                                const char *str, enum T_Action act);
+
  typedef size_t (*CustomOutPtr) (struct _archiveHandle * AH, const void *buf, size_t len);
  
  typedef enum
@@ -227,6 +247,13 @@ typedef struct _archiveHandle
         StartBlobPtr StartBlobPtr;
         EndBlobPtr EndBlobPtr;
  
+       MasterStartParallelItemPtr MasterStartParallelItemPtr;
+       MasterEndParallelItemPtr MasterEndParallelItemPtr;
+
+       SetupWorkerPtr SetupWorkerPtr;
+       WorkerJobDumpPtr WorkerJobDumpPtr;
+       WorkerJobRestorePtr WorkerJobRestorePtr;
+
         ClonePtr ClonePtr;                      /* Clone format-specific fields */
         DeClonePtr DeClonePtr;          /* Clean up cloned fields */
  
@@ -236,6 +263,7 @@ typedef struct _archiveHandle
         char       *archdbname;         /* DB name *read* from archive */
         enum trivalue promptPassword;
         char       *savedPassword;      /* password for ropt->username, if known */
+       char       *use_role;
         PGconn     *connection;
         int                     connectToDB;    /* Flag to indicate if direct DB connection is
                                                                  * required */
@@ -327,6 +355,7 @@ typedef struct _tocEntry
         int                     nLockDeps;              /* number of such dependencies */
  } TocEntry;
  
+extern int     parallel_restore(struct ParallelArgs * args);
  extern void on_exit_close_archive(Archive *AHX);
  
  extern void warn_or_exit_horribly(ArchiveHandle *AH, const char *modulename, const char *fmt,...) __attribute__((format(PG_PRINTF_ATTRIBUTE, 3, 4)));
@@ -337,9 +366,13 @@ extern void WriteHead(ArchiveHandle *AH);
  extern void ReadHead(ArchiveHandle *AH);
  extern void WriteToc(ArchiveHandle *AH);
  extern void ReadToc(ArchiveHandle *AH);
-extern void WriteDataChunks(ArchiveHandle *AH);
+extern void WriteDataChunks(ArchiveHandle *AH, struct ParallelState *pstate);
+extern void WriteDataChunksForTocEntry(ArchiveHandle *AH, TocEntry *te);
+extern ArchiveHandle *CloneArchive(ArchiveHandle *AH);
+extern void DeCloneArchive(ArchiveHandle *AH);
  
  extern teReqs TocIDRequired(ArchiveHandle *AH, DumpId id);
+TocEntry   *getTocEntryByDumpId(ArchiveHandle *AH, DumpId id);
  extern bool checkSeek(FILE *fp);
  
  #define appendStringLiteralAHX(buf,str,AH) \
diff --git a/src/bin/pg_dump/pg_backup_custom.c b/src/bin/pg_dump/pg_backup_custom.c

index 7081598baaaa63514beca620d81b8e21aa2412f1..c2e94ca084a3470cd31ccf83da8088ce9b40a472 100644 (file)
--- a/src/bin/pg_dump/pg_backup_custom.c
+++ b/src/bin/pg_dump/pg_backup_custom.c
@@ -26,6 +26,7 @@
  
  #include "compress_io.h"
  #include "dumputils.h"
+#include "parallel.h"
  
  /*--------
   * Routines in the format interface
@@ -59,6 +60,10 @@ static void _LoadBlobs(ArchiveHandle *AH, bool drop);
  static void _Clone(ArchiveHandle *AH);
  static void _DeClone(ArchiveHandle *AH);
  
+static char *_MasterStartParallelItem(ArchiveHandle *AH, TocEntry *te, T_Action act);
+static int     _MasterEndParallelItem(ArchiveHandle *AH, TocEntry *te, const char *str, T_Action act);
+char      *_WorkerJobRestoreCustom(ArchiveHandle *AH, TocEntry *te);
+
  typedef struct
  {
         CompressorState *cs;
@@ -127,6 +132,13 @@ InitArchiveFmt_Custom(ArchiveHandle *AH)
         AH->ClonePtr = _Clone;
         AH->DeClonePtr = _DeClone;
  
+       AH->MasterStartParallelItemPtr = _MasterStartParallelItem;
+       AH->MasterEndParallelItemPtr = _MasterEndParallelItem;
+
+       /* no parallel dump in the custom archive, only parallel restore */
+       AH->WorkerJobDumpPtr = NULL;
+       AH->WorkerJobRestorePtr = _WorkerJobRestoreCustom;
+
         /* Set up a private area. */
         ctx = (lclContext *) pg_malloc0(sizeof(lclContext));
         AH->formatData = (void *) ctx;
@@ -698,7 +710,7 @@ _CloseArchive(ArchiveHandle *AH)
                 tpos = ftello(AH->FH);
                 WriteToc(AH);
                 ctx->dataStart = _getFilePos(AH, ctx);
-               WriteDataChunks(AH);
+               WriteDataChunks(AH, NULL);
  
                 /*
                  * If possible, re-write the TOC in order to update the data offset
@@ -796,6 +808,80 @@ _DeClone(ArchiveHandle *AH)
         free(ctx);
  }
  
+/*
+ * This function is executed in the worker of a parallel restore for the
+ * custom format archive and restores the given TOC entry.
+ */
+char *
+_WorkerJobRestoreCustom(ArchiveHandle *AH, TocEntry *te)
+{
+       /*
+        * Short fixed-size string + some IDs so far.  This needs to be malloc'ed
+        * instead of static because we work with threads on Windows.
+        */
+       const int       buflen = 64;
+       char       *buf = (char *) pg_malloc(buflen);
+       ParallelArgs pargs;
+       int                     status;
+
+       pargs.AH = AH;
+       pargs.te = te;
+
+       status = parallel_restore(&pargs);
+
+       snprintf(buf, buflen, "OK RESTORE %d %d %d", te->dumpId, status,
+                        status == WORKER_IGNORED_ERRORS ? AH->public.n_errors : 0);
+
+       return buf;
+}
+
+/*
+ * This function is executed in the parent process. Depending on the desired
+ * action (dump or restore) it creates a string that is understood by the
+ * _WorkerJobDump /_WorkerJobRestore functions of the dump format.
+ */
+static char *
+_MasterStartParallelItem(ArchiveHandle *AH, TocEntry *te, T_Action act)
+{
+       /*
+        * A static char is okay here, even on Windows because we call this
+        * function only from one process (the master).
+        */
+       static char buf[64];            /* short fixed-size string + number */
+
+       /* no parallel dump in the custom archive format */
+       Assert(act == ACT_RESTORE);
+
+       snprintf(buf, sizeof(buf), "RESTORE %d", te->dumpId);
+
+       return buf;
+}
+
+/*
+ * This function is executed in the parent process. It analyzes the response of
+ * the _WorkerJobDump / _WorkerJobRestore functions of the dump format.
+ */
+static int
+_MasterEndParallelItem(ArchiveHandle *AH, TocEntry *te, const char *str, T_Action act)
+{
+       DumpId          dumpId;
+       int                     nBytes,
+                               status,
+                               n_errors;
+
+       /* no parallel dump in the custom archive */
+       Assert(act == ACT_RESTORE);
+
+       sscanf(str, "%u %u %u%n", &dumpId, &status, &n_errors, &nBytes);
+
+       Assert(nBytes == strlen(str));
+       Assert(dumpId == te->dumpId);
+
+       AH->public.n_errors += n_errors;
+
+       return status;
+}
+
  /*--------------------------------------------------
   * END OF FORMAT CALLBACKS
   *--------------------------------------------------
diff --git a/src/bin/pg_dump/pg_backup_db.c b/src/bin/pg_dump/pg_backup_db.c

index 4c4f24f7d5abd46e72b33cf6ef068f3933a0f1b6..544d01a4ddc69b1678304e21ca1a6ff22d94647b 100644 (file)
--- a/src/bin/pg_dump/pg_backup_db.c
+++ b/src/bin/pg_dump/pg_backup_db.c
@@ -309,12 +309,30 @@ ConnectDatabase(Archive *AHX,
         PQsetNoticeProcessor(AH->connection, notice_processor, NULL);
  }
  
+/*
+ * Close the connection to the database and also cancel off the query if we
+ * have one running.
+ */
  void
  DisconnectDatabase(Archive *AHX)
  {
         ArchiveHandle *AH = (ArchiveHandle *) AHX;
+       PGcancel   *cancel;
+       char            errbuf[1];
+
+       if (!AH->connection)
+               return;
  
-       PQfinish(AH->connection);       /* noop if AH->connection is NULL */
+       if (PQtransactionStatus(AH->connection) == PQTRANS_ACTIVE)
+       {
+               if ((cancel = PQgetCancel(AH->connection)))
+               {
+                       PQcancel(cancel, errbuf, sizeof(errbuf));
+                       PQfreeCancel(cancel);
+               }
+       }
+
+       PQfinish(AH->connection);
         AH->connection = NULL;
  }
  
diff --git a/src/bin/pg_dump/pg_backup_directory.c b/src/bin/pg_dump/pg_backup_directory.c

index 5b71ebaeebc20bedc5185fc3bea0c9dd274a5863..66151f584b05a1a656c36a8df48c1709cc5419bc 100644 (file)
--- a/src/bin/pg_dump/pg_backup_directory.c
+++ b/src/bin/pg_dump/pg_backup_directory.c
@@ -35,6 +35,7 @@
  
  #include "compress_io.h"
  #include "dumputils.h"
+#include "parallel.h"
  
  #include <dirent.h>
  #include <sys/stat.h>
@@ -50,6 +51,7 @@ typedef struct
         cfp                *dataFH;                     /* currently open data file */
  
         cfp                *blobsTocFH;         /* file handle for blobs.toc */
+       ParallelState *pstate;          /* for parallel backup / restore */
  } lclContext;
  
  typedef struct
@@ -70,6 +72,7 @@ static int    _ReadByte(ArchiveHandle *);
  static size_t _WriteBuf(ArchiveHandle *AH, const void *buf, size_t len);
  static size_t _ReadBuf(ArchiveHandle *AH, void *buf, size_t len);
  static void _CloseArchive(ArchiveHandle *AH);
+static void _ReopenArchive(ArchiveHandle *AH);
  static void _PrintTocData(ArchiveHandle *AH, TocEntry *te, RestoreOptions *ropt);
  
  static void _WriteExtraToc(ArchiveHandle *AH, TocEntry *te);
@@ -82,8 +85,17 @@ static void _EndBlob(ArchiveHandle *AH, TocEntry *te, Oid oid);
  static void _EndBlobs(ArchiveHandle *AH, TocEntry *te);
  static void _LoadBlobs(ArchiveHandle *AH, RestoreOptions *ropt);
  
-static char *prependDirectory(ArchiveHandle *AH, const char *relativeFilename);
+static void _Clone(ArchiveHandle *AH);
+static void _DeClone(ArchiveHandle *AH);
  
+static char *_MasterStartParallelItem(ArchiveHandle *AH, TocEntry *te, T_Action act);
+static int _MasterEndParallelItem(ArchiveHandle *AH, TocEntry *te,
+                                          const char *str, T_Action act);
+static char *_WorkerJobRestoreDirectory(ArchiveHandle *AH, TocEntry *te);
+static char *_WorkerJobDumpDirectory(ArchiveHandle *AH, TocEntry *te);
+
+static void setFilePath(ArchiveHandle *AH, char *buf,
+                       const char *relativeFilename);
  
  /*
   *     Init routine required by ALL formats. This is a global routine
@@ -110,7 +122,7 @@ InitArchiveFmt_Directory(ArchiveHandle *AH)
         AH->WriteBufPtr = _WriteBuf;
         AH->ReadBufPtr = _ReadBuf;
         AH->ClosePtr = _CloseArchive;
-       AH->ReopenPtr = NULL;
+       AH->ReopenPtr = _ReopenArchive;
         AH->PrintTocDataPtr = _PrintTocData;
         AH->ReadExtraTocPtr = _ReadExtraToc;
         AH->WriteExtraTocPtr = _WriteExtraToc;
@@ -121,8 +133,14 @@ InitArchiveFmt_Directory(ArchiveHandle *AH)
         AH->EndBlobPtr = _EndBlob;
         AH->EndBlobsPtr = _EndBlobs;
  
-       AH->ClonePtr = NULL;
-       AH->DeClonePtr = NULL;
+       AH->ClonePtr = _Clone;
+       AH->DeClonePtr = _DeClone;
+
+       AH->WorkerJobRestorePtr = _WorkerJobRestoreDirectory;
+       AH->WorkerJobDumpPtr = _WorkerJobDumpDirectory;
+
+       AH->MasterStartParallelItemPtr = _MasterStartParallelItem;
+       AH->MasterEndParallelItemPtr = _MasterEndParallelItem;
  
         /* Set up our private context */
         ctx = (lclContext *) pg_malloc0(sizeof(lclContext));
@@ -146,16 +164,41 @@ InitArchiveFmt_Directory(ArchiveHandle *AH)
  
         if (AH->mode == archModeWrite)
         {
-               if (mkdir(ctx->directory, 0700) < 0)
+               struct stat st;
+               bool            is_empty = false;
+
+               /* we accept an empty existing directory */
+               if (stat(ctx->directory, &st) == 0 && S_ISDIR(st.st_mode))
+               {
+                       DIR                *dir = opendir(ctx->directory);
+
+                       if (dir)
+                       {
+                               struct dirent *d;
+
+                               is_empty = true;
+                               while ((d = readdir(dir)))
+                               {
+                                       if (strcmp(d->d_name, ".") != 0 && strcmp(d->d_name, "..") != 0)
+                                       {
+                                               is_empty = false;
+                                               break;
+                                       }
+                               }
+                               closedir(dir);
+                       }
+               }
+
+               if (!is_empty && mkdir(ctx->directory, 0700) < 0)
                         exit_horribly(modulename, "could not create directory \"%s\": %s\n",
                                                   ctx->directory, strerror(errno));
         }
         else
         {                                                       /* Read Mode */
-               char       *fname;
+               char            fname[MAXPGPATH];
                 cfp                *tocFH;
  
-               fname = prependDirectory(AH, "toc.dat");
+               setFilePath(AH, fname, "toc.dat");
  
                 tocFH = cfopen_read(fname, PG_BINARY_R);
                 if (tocFH == NULL)
@@ -281,9 +324,9 @@ _StartData(ArchiveHandle *AH, TocEntry *te)
  {
         lclTocEntry *tctx = (lclTocEntry *) te->formatData;
         lclContext *ctx = (lclContext *) AH->formatData;
-       char       *fname;
+       char            fname[MAXPGPATH];
  
-       fname = prependDirectory(AH, tctx->filename);
+       setFilePath(AH, fname, tctx->filename);
  
         ctx->dataFH = cfopen_write(fname, PG_BINARY_W, AH->compression);
         if (ctx->dataFH == NULL)
@@ -308,6 +351,9 @@ _WriteData(ArchiveHandle *AH, const void *data, size_t dLen)
         if (dLen == 0)
                 return 0;
  
+       /* Are we aborting? */
+       checkAborting(AH);
+
         return cfwrite(data, dLen, ctx->dataFH);
  }
  
@@ -375,8 +421,9 @@ _PrintTocData(ArchiveHandle *AH, TocEntry *te, RestoreOptions *ropt)
                 _LoadBlobs(AH, ropt);
         else
         {
-               char       *fname = prependDirectory(AH, tctx->filename);
+               char            fname[MAXPGPATH];
  
+               setFilePath(AH, fname, tctx->filename);
                 _PrintFileData(AH, fname, ropt);
         }
  }
@@ -386,12 +433,12 @@ _LoadBlobs(ArchiveHandle *AH, RestoreOptions *ropt)
  {
         Oid                     oid;
         lclContext *ctx = (lclContext *) AH->formatData;
-       char       *fname;
+       char            fname[MAXPGPATH];
         char            line[MAXPGPATH];
  
         StartRestoreBlobs(AH);
  
-       fname = prependDirectory(AH, "blobs.toc");
+       setFilePath(AH, fname, "blobs.toc");
  
         ctx->blobsTocFH = cfopen_read(fname, PG_BINARY_R);
  
@@ -474,6 +521,9 @@ _WriteBuf(ArchiveHandle *AH, const void *buf, size_t len)
         lclContext *ctx = (lclContext *) AH->formatData;
         size_t          res;
  
+       /* Are we aborting? */
+       checkAborting(AH);
+
         res = cfwrite(buf, len, ctx->dataFH);
         if (res != len)
                 exit_horribly(modulename, "could not write to output file: %s\n",
@@ -518,7 +568,12 @@ _CloseArchive(ArchiveHandle *AH)
         if (AH->mode == archModeWrite)
         {
                 cfp                *tocFH;
-               char       *fname = prependDirectory(AH, "toc.dat");
+               char            fname[MAXPGPATH];
+
+               setFilePath(AH, fname, "toc.dat");
+
+               /* this will actually fork the processes for a parallel backup */
+               ctx->pstate = ParallelBackupStart(AH, NULL);
  
                 /* The TOC is always created uncompressed */
                 tocFH = cfopen_write(fname, PG_BINARY_W, 0);
@@ -539,11 +594,25 @@ _CloseArchive(ArchiveHandle *AH)
                 if (cfclose(tocFH) != 0)
                         exit_horribly(modulename, "could not close TOC file: %s\n",
                                                   strerror(errno));
-               WriteDataChunks(AH);
+               WriteDataChunks(AH, ctx->pstate);
+
+               ParallelBackupEnd(AH, ctx->pstate);
         }
         AH->FH = NULL;
  }
  
+/*
+ * Reopen the archive's file handle.
+ */
+static void
+_ReopenArchive(ArchiveHandle *AH)
+{
+       /*
+        * Our TOC is in memory, our data files are opened by each child anyway as
+        * they are separate. We support reopening the archive by just doing
+        * nothing.
+        */
+}
  
  /*
   * BLOB support
@@ -560,9 +629,9 @@ static void
  _StartBlobs(ArchiveHandle *AH, TocEntry *te)
  {
         lclContext *ctx = (lclContext *) AH->formatData;
-       char       *fname;
+       char            fname[MAXPGPATH];
  
-       fname = prependDirectory(AH, "blobs.toc");
+       setFilePath(AH, fname, "blobs.toc");
  
         /* The blob TOC file is never compressed */
         ctx->blobsTocFH = cfopen_write(fname, "ab", 0);
@@ -627,12 +696,16 @@ _EndBlobs(ArchiveHandle *AH, TocEntry *te)
         ctx->blobsTocFH = NULL;
  }
  
-
-static char *
-prependDirectory(ArchiveHandle *AH, const char *relativeFilename)
+/*
+ * Gets a relative file name and prepends the output directory, writing the
+ * result to buf. The caller needs to make sure that buf is MAXPGPATH bytes
+ * big. Can't use a static char[MAXPGPATH] inside the function because we run
+ * multithreaded on Windows.
+ */
+static void
+setFilePath(ArchiveHandle *AH, char *buf, const char *relativeFilename)
  {
         lclContext *ctx = (lclContext *) AH->formatData;
-       static char buf[MAXPGPATH];
         char       *dname;
  
         dname = ctx->directory;
@@ -643,6 +716,157 @@ prependDirectory(ArchiveHandle *AH, const char *relativeFilename)
         strcpy(buf, dname);
         strcat(buf, "/");
         strcat(buf, relativeFilename);
+}
+
+/*
+ * Give a cloned archive handle its own copy of the format-specific context.
+ *
+ * On entry AH->formatData still points at the context of the handle that was
+ * cloned. On return it points at a freshly pg_malloc'ed, byte-for-byte copy
+ * of that context; _DeClone() frees that copy again.
+ */
+static void
+_Clone(ArchiveHandle *AH)
+{
+       lclContext *parent = (lclContext *) AH->formatData;
+       lclContext *copy = (lclContext *) pg_malloc(sizeof(lclContext));
+
+       memcpy(copy, parent, sizeof(lclContext));
+       AH->formatData = copy;
+
+       /*
+        * Note: no private lo_buf is set up because at most one BLOBS entry is
+        * expected per archive, so no parallelism is possible there.  Likewise,
+        * TOC-entry-local state isn't an issue because any one TOC entry is
+        * touched by just one worker child.
+        */
+
+       /*
+        * No separate ParallelState (pstate) is created for the clone either;
+        * only the master process ever writes to it.
+        */
+}
+
+/*
+ * Release the format-specific context attached to this (cloned) handle.
+ * The pointer in AH->formatData is freed but not reset.
+ */
+static void
+_DeClone(ArchiveHandle *AH)
+{
+       free(AH->formatData);
+}
+
+/*
+ * Executed in the parent process. Builds the command string for one TOC
+ * entry that the _WorkerJobDump / _WorkerJobRestore functions of the dump
+ * format understand: "DUMP <dumpId>" for ACT_DUMP and "RESTORE <dumpId>" for
+ * ACT_RESTORE.
+ *
+ * The returned pointer refers to a static buffer, so the contents are
+ * overwritten by the next call.
+ */
+static char *
+_MasterStartParallelItem(ArchiveHandle *AH, TocEntry *te, T_Action act)
+{
+       /*
+        * A static buffer is okay here, even on Windows, because this function
+        * is only ever called from one process (the master).
+        */
+       static char cmd[64];
+
+       switch (act)
+       {
+               case ACT_DUMP:
+                       snprintf(cmd, sizeof(cmd), "DUMP %d", te->dumpId);
+                       break;
+               case ACT_RESTORE:
+                       snprintf(cmd, sizeof(cmd), "RESTORE %d", te->dumpId);
+                       break;
+               default:
+                       /* any other action leaves the buffer untouched */
+                       break;
+       }
+
+       return cmd;
+}
+
+/*
+ * Executed in a worker of a parallel backup for the directory archive
+ * format; dumps the data of one TOC entry.
+ *
+ * The reply is the malloc'ed string "OK DUMP <dumpId>". Only the DumpId is
+ * reported at the moment, so in theory an int (or DumpId) return type would
+ * do. A string is kept to leave room for further enhancements and because
+ * the value has to be turned into a message anyway; see
+ * _MasterEndParallelItem() for the parsing side.
+ */
+static char *
+_WorkerJobDumpDirectory(ArchiveHandle *AH, TocEntry *te)
+{
+       /*
+        * The reply is a short fixed-size string plus an ID. It is malloc'ed
+        * rather than static because workers run as threads on Windows.
+        */
+       const int       replySize = 64;
+       char       *reply = (char *) pg_malloc(replySize);
+
+       /* This should never happen */
+       if (te->formatData == NULL)
+               exit_horribly(modulename, "Error during backup\n");
+
+       /*
+        * WriteDataChunksForTocEntry() returns void: it either succeeds or
+        * dies horribly. A failure is detected by the parent when the child
+        * dies unexpectedly.
+        */
+       WriteDataChunksForTocEntry(AH, te);
+
+       snprintf(reply, replySize, "OK DUMP %d", te->dumpId);
+
+       return reply;
+}
+
+/*
+ * This function is executed in the child of a parallel restore from the
+ * directory archive and restores the TOC entry it was handed, by calling
+ * parallel_restore().
+ *
+ * The reply is the malloc'ed string "OK RESTORE <dumpId> <status> <n_errors>".
+ * n_errors is AH->public.n_errors when status is WORKER_IGNORED_ERRORS and 0
+ * otherwise.
+ */
+static char *
+_WorkerJobRestoreDirectory(ArchiveHandle *AH, TocEntry *te)
+{
+       /*
+        * short fixed-size string + some ID so far, this needs to be malloc'ed
+        * instead of static because we work with threads on windows
+        */
+       const int       buflen = 64;
+       char       *buf = (char *) pg_malloc(buflen);
+       ParallelArgs pargs;
+       int                     status;
+
+       pargs.AH = AH;
+       pargs.te = te;
+
+       status = parallel_restore(&pargs);
+
+       snprintf(buf, buflen, "OK RESTORE %d %d %d", te->dumpId, status,
+                        status == WORKER_IGNORED_ERRORS ? AH->public.n_errors : 0);
  
         return buf;
  }
+
+/*
+ * This function is executed in the parent process. It analyzes the response of
+ * the _WorkerJobDumpDirectory/_WorkerJobRestoreDirectory functions of the
+ * respective dump format.
+ *
+ * str is parsed starting directly at the dump ID: "<dumpId>" for ACT_DUMP and
+ * "<dumpId> <status> <n_errors>" for ACT_RESTORE. Presumably the caller has
+ * already stripped the leading "OK DUMP " / "OK RESTORE " that the workers
+ * emit -- TODO confirm against the caller.
+ *
+ * Returns the status reported by the worker (always 0 for ACT_DUMP) and, for
+ * ACT_RESTORE, adds the worker's error count to AH->public.n_errors. A reply
+ * that cannot be parsed is fatal.
+ */
+static int
+_MasterEndParallelItem(ArchiveHandle *AH, TocEntry *te, const char *str, T_Action act)
+{
+       DumpId          dumpId;
+       int                     nBytes = 0,
+                               n_errors = 0;
+       int                     status = 0;
+
+       if (act == ACT_DUMP)
+       {
+               /*
+                * Check the conversion count: without it, a malformed reply would
+                * go unnoticed in builds where Assert() is compiled out.
+                */
+               if (sscanf(str, "%d%n", &dumpId, &nBytes) != 1)
+                       exit_horribly(modulename,
+                                                 "could not parse worker response \"%s\"\n", str);
+
+               Assert(dumpId == te->dumpId);
+               Assert(nBytes == strlen(str));
+       }
+       else if (act == ACT_RESTORE)
+       {
+               /*
+                * %d matches the int targets and the "%d" the workers print with.
+                * %n does not count towards sscanf's return value, hence 3.
+                */
+               if (sscanf(str, "%d %d %d%n", &dumpId, &status, &n_errors, &nBytes) != 3)
+                       exit_horribly(modulename,
+                                                 "could not parse worker response \"%s\"\n", str);
+
+               Assert(dumpId == te->dumpId);
+               Assert(nBytes == strlen(str));
+
+               AH->public.n_errors += n_errors;
+       }
+
+       return status;
+}
diff --git a/src/bin/pg_dump/pg_backup_tar.c b/src/bin/pg_dump/pg_backup_tar.c

index 03ae4f82bcead174415d30066ddb260380724d7a..6465ac3e6c4df520ccb340879b5a896b89ed3fc3 100644 (file)
--- a/src/bin/pg_dump/pg_backup_tar.c
+++ b/src/bin/pg_dump/pg_backup_tar.c
@@ -158,6 +158,12 @@ InitArchiveFmt_Tar(ArchiveHandle *AH)
         AH->ClonePtr = NULL;
         AH->DeClonePtr = NULL;
  
+       AH->MasterStartParallelItemPtr = NULL;
+       AH->MasterEndParallelItemPtr = NULL;
+
+       AH->WorkerJobDumpPtr = NULL;
+       AH->WorkerJobRestorePtr = NULL;
+
         /*
          * Set up some special context used in compressing data.
          */
@@ -828,7 +834,7 @@ _CloseArchive(ArchiveHandle *AH)
                 /*
                  * Now send the data (tables & blobs)
                  */
-               WriteDataChunks(AH);
+               WriteDataChunks(AH, NULL);
  
                 /*
                  * Now this format wants to append a script which does a full restore
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c

index 093be9e16d0671c232620817cde7bd972cf1921c..b50e54062283da863785b439e9da3e5bd890b114 100644 (file)
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -135,6 +135,7 @@ static int  disable_dollar_quoting = 0;
  static int     dump_inserts = 0;
  static int     column_inserts = 0;
  static int     no_security_labels = 0;
+static int     no_synchronized_snapshots = 0;
  static int     no_unlogged_table_data = 0;
  static int     serializable_deferrable = 0;
  
@@ -243,8 +244,6 @@ static Oid  findLastBuiltinOid_V70(Archive *fout);
  static void selectSourceSchema(Archive *fout, const char *schemaName);
  static char *getFormattedTypeName(Archive *fout, Oid oid, OidOptions opts);
  static char *myFormatType(const char *typname, int32 typmod);
-static const char *fmtQualifiedId(Archive *fout,
-                          const char *schema, const char *id);
  static void getBlobs(Archive *fout);
  static void dumpBlob(Archive *fout, BlobInfo *binfo);
  static int     dumpBlobs(Archive *fout, void *arg);
@@ -262,8 +261,10 @@ static void binary_upgrade_extension_member(PQExpBuffer upgrade_buffer,
                                                                 DumpableObject *dobj,
                                                                 const char *objlabel);
  static const char *getAttrName(int attrnum, TableInfo *tblInfo);
-static const char *fmtCopyColumnList(const TableInfo *ti);
+static const char *fmtCopyColumnList(const TableInfo *ti, PQExpBuffer buffer);
+static char *get_synchronized_snapshot(Archive *fout);
  static PGresult *ExecuteSqlQueryForSingleRow(Archive *fout, char *query);
+static void setupDumpWorker(Archive *AHX, RestoreOptions *ropt);
  
  
  int
@@ -284,6 +285,7 @@ main(int argc, char **argv)
         int                     numObjs;
         DumpableObject *boundaryObjs;
         int                     i;
+       int                     numWorkers = 1;
         enum trivalue prompt_password = TRI_DEFAULT;
         int                     compressLevel = -1;
         int                     plainText = 0;
@@ -314,6 +316,7 @@ main(int argc, char **argv)
                 {"format", required_argument, NULL, 'F'},
                 {"host", required_argument, NULL, 'h'},
                 {"ignore-version", no_argument, NULL, 'i'},
+               {"jobs", 1, NULL, 'j'},
                 {"no-reconnect", no_argument, NULL, 'R'},
                 {"oids", no_argument, NULL, 'o'},
                 {"no-owner", no_argument, NULL, 'O'},
@@ -353,6 +356,7 @@ main(int argc, char **argv)
                 {"serializable-deferrable", no_argument, &serializable_deferrable, 1},
                 {"use-set-session-authorization", no_argument, &use_setsessauth, 1},
                 {"no-security-labels", no_argument, &no_security_labels, 1},
+               {"no-synchronized-snapshots", no_argument, &no_synchronized_snapshots, 1},
                 {"no-unlogged-table-data", no_argument, &no_unlogged_table_data, 1},
  
                 {NULL, 0, NULL, 0}
@@ -360,6 +364,12 @@ main(int argc, char **argv)
  
         set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_dump"));
  
+       /*
+        * Initialize what we need for parallel execution, especially for thread
+        * support on Windows.
+        */
+       init_parallel_dump_utils();
+
         g_verbose = false;
  
         strcpy(g_comment_start, "-- ");
@@ -390,7 +400,7 @@ main(int argc, char **argv)
                 }
         }
  
-       while ((c = getopt_long(argc, argv, "abcCd:E:f:F:h:iK:n:N:oOp:RsS:t:T:U:vwWxZ:",
+       while ((c = getopt_long(argc, argv, "abcCd:E:f:F:h:ij:K:n:N:oOp:RsS:t:T:U:vwWxZ:",
                                                         long_options, &optindex)) != -1)
         {
                 switch (c)
@@ -435,6 +445,10 @@ main(int argc, char **argv)
                                 /* ignored, deprecated option */
                                 break;
  
+                       case 'j':                       /* number of dump jobs */
+                               numWorkers = atoi(optarg);
+                               break;
+
                         case 'n':                       /* include schema(s) */
                                 simple_string_list_append(&schema_include_patterns, optarg);
                                 include_everything = false;
@@ -577,8 +591,25 @@ main(int argc, char **argv)
                         compressLevel = 0;
         }
  
+       /*
+        * On Windows we can only have at most MAXIMUM_WAIT_OBJECTS (= 64 usually)
+        * parallel jobs because that's the maximum limit for the
+        * WaitForMultipleObjects() call.
+        */
+       if (numWorkers <= 0
+#ifdef WIN32
+               || numWorkers > MAXIMUM_WAIT_OBJECTS
+#endif
+               )
+               exit_horribly(NULL, "%s: invalid number of parallel jobs\n", progname);
+
+       /* Parallel backup only in the directory archive format so far */
+       if (archiveFormat != archDirectory && numWorkers > 1)
+               exit_horribly(NULL, "parallel backup only supported by the directory format\n");
+
         /* Open the output file */
-       fout = CreateArchive(filename, archiveFormat, compressLevel, archiveMode);
+       fout = CreateArchive(filename, archiveFormat, compressLevel, archiveMode,
+                                                setupDumpWorker);
  
         /* Register the cleanup hook */
         on_exit_close_archive(fout);
@@ -600,6 +631,8 @@ main(int argc, char **argv)
         fout->minRemoteVersion = 70000;
         fout->maxRemoteVersion = (my_version / 100) * 100 + 99;
  
+       fout->numWorkers = numWorkers;
+
         /*
          * Open the database using the Archiver, so it knows about it. Errors mean
          * death.
@@ -620,7 +653,8 @@ main(int argc, char **argv)
          */
         if (fout->remoteVersion >= 90000)
         {
-               PGresult *res = ExecuteSqlQueryForSingleRow(fout, "SELECT pg_catalog.pg_is_in_recovery()");
+               PGresult   *res = ExecuteSqlQueryForSingleRow(fout, "SELECT pg_catalog.pg_is_in_recovery()");
+
                 if (strcmp(PQgetvalue(res, 0, 0), "t") == 0)
                 {
                         /*
@@ -632,32 +666,6 @@ main(int argc, char **argv)
                 PQclear(res);
         }
  
-       /*
-        * Start transaction-snapshot mode transaction to dump consistent data.
-        */
-       ExecuteSqlStatement(fout, "BEGIN");
-       if (fout->remoteVersion >= 90100)
-       {
-               if (serializable_deferrable)
-                       ExecuteSqlStatement(fout,
-                                                               "SET TRANSACTION ISOLATION LEVEL "
-                                                               "SERIALIZABLE, READ ONLY, DEFERRABLE");
-               else
-                       ExecuteSqlStatement(fout,
-                                                               "SET TRANSACTION ISOLATION LEVEL "
-                                                               "REPEATABLE READ, READ ONLY");
-       }
-       else if (fout->remoteVersion >= 70400)
-       {
-               /* note: comma was not accepted in SET TRANSACTION before 8.0 */
-               ExecuteSqlStatement(fout,
-                                                       "SET TRANSACTION ISOLATION LEVEL "
-                                                       "SERIALIZABLE READ ONLY");
-       }
-       else
-               ExecuteSqlStatement(fout,
-                                                       "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE");
-
         /* Select the appropriate subquery to convert user IDs to names */
         if (fout->remoteVersion >= 80100)
                 username_subquery = "SELECT rolname FROM pg_catalog.pg_roles WHERE oid =";
@@ -666,6 +674,14 @@ main(int argc, char **argv)
         else
                 username_subquery = "SELECT usename FROM pg_user WHERE usesysid =";
  
+       /* check the version for the synchronized snapshots feature */
+       if (numWorkers > 1 && fout->remoteVersion < 90200
+               && !no_synchronized_snapshots)
+               exit_horribly(NULL,
+                         "No synchronized snapshots available in this server version.\n"
+                          "Run with --no-synchronized-snapshots instead if you do not\n"
+                                         "need synchronized snapshots.\n");
+
         /* Find the last built-in OID, if needed */
         if (fout->remoteVersion < 70300)
         {
@@ -763,6 +779,10 @@ main(int argc, char **argv)
         else
                 sortDumpableObjectsByTypeOid(dobjs, numObjs);
  
+       /* If we do a parallel dump, we want the largest tables to go first */
+       if (archiveFormat == archDirectory && numWorkers > 1)
+               sortDataAndIndexObjectsBySize(dobjs, numObjs);
+
         sortDumpableObjects(dobjs, numObjs,
                                                 boundaryObjs[0].dumpId, boundaryObjs[1].dumpId);
  
@@ -810,9 +830,9 @@ main(int argc, char **argv)
         SetArchiveRestoreOptions(fout, ropt);
  
         /*
-        * The archive's TOC entries are now marked as to which ones will
-        * actually be output, so we can set up their dependency lists properly.
-        * This isn't necessary for plain-text output, though.
+        * The archive's TOC entries are now marked as to which ones will actually
+        * be output, so we can set up their dependency lists properly. This isn't
+        * necessary for plain-text output, though.
          */
         if (!plainText)
                 BuildArchiveDependencies(fout);
@@ -844,6 +864,7 @@ help(const char *progname)
         printf(_("  -f, --file=FILENAME          output file or directory name\n"));
         printf(_("  -F, --format=c|d|t|p         output file format (custom, directory, tar,\n"
                          "                               plain text (default))\n"));
+       printf(_("  -j, --jobs=NUM               use this many parallel jobs to dump\n"));
         printf(_("  -v, --verbose                verbose mode\n"));
         printf(_("  -V, --version                output version information, then exit\n"));
         printf(_("  -Z, --compress=0-9           compression level for compressed formats\n"));
@@ -873,6 +894,7 @@ help(const char *progname)
         printf(_("  --exclude-table-data=TABLE   do NOT dump data for the named table(s)\n"));
         printf(_("  --inserts                    dump data as INSERT commands, rather than COPY\n"));
         printf(_("  --no-security-labels         do not dump security label assignments\n"));
+       printf(_("  --no-synchronized-snapshots parallel processes should not use synchronized snapshots\n"));
         printf(_("  --no-tablespaces             do not dump tablespace assignments\n"));
         printf(_("  --no-unlogged-table-data     do not dump unlogged table data\n"));
         printf(_("  --quote-all-identifiers      quote all identifiers, even if not key words\n"));
@@ -902,7 +924,12 @@ setup_connection(Archive *AH, const char *dumpencoding, char *use_role)
         PGconn     *conn = GetConnection(AH);
         const char *std_strings;
  
-       /* Set the client encoding if requested */
+       /*
+        * Set the client encoding if requested. If dumpencoding == NULL then
+        * either it hasn't been requested or we're a cloned connection and then
+        * this has already been set in CloneArchive according to the original
+        * connection encoding.
+        */
         if (dumpencoding)
         {
                 if (PQsetClientEncoding(conn, dumpencoding) < 0)
@@ -919,6 +946,10 @@ setup_connection(Archive *AH, const char *dumpencoding, char *use_role)
         std_strings = PQparameterStatus(conn, "standard_conforming_strings");
         AH->std_strings = (std_strings && strcmp(std_strings, "on") == 0);
  
+       /* No role passed in: reuse the one saved on the handle, if any */
+       if (!use_role && AH->use_role)
+               use_role = AH->use_role;
+
         /* Set the role if requested */
         if (use_role && AH->remoteVersion >= 80100)
         {
@@ -927,6 +958,10 @@ setup_connection(Archive *AH, const char *dumpencoding, char *use_role)
                 appendPQExpBuffer(query, "SET ROLE %s", fmtId(use_role));
                 ExecuteSqlStatement(AH, query->data);
                 destroyPQExpBuffer(query);
+
+               /* save this for later use on parallel connections */
+               if (!AH->use_role)
+                       AH->use_role = strdup(use_role);
         }
  
         /* Set the datestyle to ISO to ensure the dump's portability */
@@ -965,6 +1000,68 @@ setup_connection(Archive *AH, const char *dumpencoding, char *use_role)
          */
         if (quote_all_identifiers && AH->remoteVersion >= 90100)
                 ExecuteSqlStatement(AH, "SET quote_all_identifiers = true");
+
+       /*
+        * Start transaction-snapshot mode transaction to dump consistent data.
+        */
+       ExecuteSqlStatement(AH, "BEGIN");
+       if (AH->remoteVersion >= 90100)
+       {
+               if (serializable_deferrable)
+                       ExecuteSqlStatement(AH,
+                                                               "SET TRANSACTION ISOLATION LEVEL "
+                                                               "SERIALIZABLE, READ ONLY, DEFERRABLE");
+               else
+                       ExecuteSqlStatement(AH,
+                                                               "SET TRANSACTION ISOLATION LEVEL "
+                                                               "REPEATABLE READ, READ ONLY");
+       }
+       else if (AH->remoteVersion >= 70400)
+       {
+               /* note: comma was not accepted in SET TRANSACTION before 8.0 */
+               ExecuteSqlStatement(AH,
+                                                       "SET TRANSACTION ISOLATION LEVEL "
+                                                       "SERIALIZABLE READ ONLY");
+       }
+       else
+               ExecuteSqlStatement(AH,
+                                                       "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE");
+
+
+
+       /*
+        * For a parallel dump with synchronized snapshots, a connection that
+        * finds a snapshot ID already saved on the handle imports that snapshot,
+        * while a connection that finds none exports its own snapshot and saves
+        * the ID. The import has to be sent to the server: building the query
+        * without executing it would leave this connection on its own snapshot.
+        */
+       if (AH->numWorkers > 1 && AH->remoteVersion >= 90200 && !no_synchronized_snapshots)
+       {
+               if (AH->sync_snapshot_id)
+               {
+                       PQExpBuffer query = createPQExpBuffer();
+
+                       appendPQExpBuffer(query, "SET TRANSACTION SNAPSHOT ");
+                       appendStringLiteralConn(query, AH->sync_snapshot_id, conn);
+                       ExecuteSqlStatement(AH, query->data);
+                       destroyPQExpBuffer(query);
+               }
+               else
+                       AH->sync_snapshot_id = get_synchronized_snapshot(AH);
+       }
+}
+
+/*
+ * Connection-setup callback handed to CreateArchive().
+ *
+ * Runs setup_connection() on the given archive with no explicit encoding and
+ * no explicit role; ropt is not used.
+ */
+static void
+setupDumpWorker(Archive *AHX, RestoreOptions *ropt)
+{
+       setup_connection(AHX, NULL, NULL);
+}
+
+/*
+ * Export the snapshot of the transaction open on fout's connection and
+ * return its identifier.
+ *
+ * Runs "SELECT pg_export_snapshot()" and returns a strdup'ed copy of the
+ * single result value; the copy belongs to the caller. Running out of memory
+ * for the copy is fatal, so NULL is never returned.
+ */
+static char *
+get_synchronized_snapshot(Archive *fout)
+{
+       char       *query = "SELECT pg_export_snapshot()";
+       char       *result;
+       PGresult   *res;
+
+       res = ExecuteSqlQueryForSingleRow(fout, query);
+       result = strdup(PQgetvalue(res, 0, 0));
+       PQclear(res);
+
+       /*
+        * The caller tests the stored ID for NULL to decide between importing
+        * and exporting a snapshot, so a failed strdup must not be passed back.
+        */
+       if (result == NULL)
+               exit_horribly(NULL, "out of memory\n");
+
+       return result;
  }
  
  static ArchiveFormat
@@ -1080,7 +1177,7 @@ expand_table_name_patterns(Archive *fout,
                                                   "SELECT c.oid"
                                                   "\nFROM pg_catalog.pg_class c"
                 "\n     LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace"
-                                                 "\nWHERE c.relkind in ('%c', '%c', '%c', '%c', '%c')\n",
+                                        "\nWHERE c.relkind in ('%c', '%c', '%c', '%c', '%c')\n",
                                                   RELKIND_RELATION, RELKIND_SEQUENCE, RELKIND_VIEW,
                                                   RELKIND_MATVIEW, RELKIND_FOREIGN_TABLE);
                 processSQLNamePattern(GetConnection(fout), query, cell->val, true,
@@ -1282,6 +1379,12 @@ dumpTableData_copy(Archive *fout, void *dcontext)
         const bool      hasoids = tbinfo->hasoids;
         const bool      oids = tdinfo->oids;
         PQExpBuffer q = createPQExpBuffer();
+
+       /*
+        * Note: can't use getThreadLocalPQExpBuffer() here, we're calling fmtId
+        * which uses it already.
+        */
+       PQExpBuffer clistBuf = createPQExpBuffer();
         PGconn     *conn = GetConnection(fout);
         PGresult   *res;
         int                     ret;
@@ -1306,14 +1409,14 @@ dumpTableData_copy(Archive *fout, void *dcontext)
          * cases involving ADD COLUMN and inheritance.)
          */
         if (fout->remoteVersion >= 70300)
-               column_list = fmtCopyColumnList(tbinfo);
+               column_list = fmtCopyColumnList(tbinfo, clistBuf);
         else
                 column_list = "";               /* can't select columns in COPY */
  
         if (oids && hasoids)
         {
                 appendPQExpBuffer(q, "COPY %s %s WITH OIDS TO stdout;",
-                                                 fmtQualifiedId(fout,
+                                                 fmtQualifiedId(fout->remoteVersion,
                                                                                  tbinfo->dobj.namespace->dobj.name,
                                                                                  classname),
                                                   column_list);
@@ -1331,7 +1434,7 @@ dumpTableData_copy(Archive *fout, void *dcontext)
                 else
                         appendPQExpBufferStr(q, "* ");
                 appendPQExpBuffer(q, "FROM %s %s) TO stdout;",
-                                                 fmtQualifiedId(fout,
+                                                 fmtQualifiedId(fout->remoteVersion,
                                                                                  tbinfo->dobj.namespace->dobj.name,
                                                                                  classname),
                                                   tdinfo->filtercond);
@@ -1339,13 +1442,14 @@ dumpTableData_copy(Archive *fout, void *dcontext)
         else
         {
                 appendPQExpBuffer(q, "COPY %s %s TO stdout;",
-                                                 fmtQualifiedId(fout,
+                                                 fmtQualifiedId(fout->remoteVersion,
                                                                                  tbinfo->dobj.namespace->dobj.name,
                                                                                  classname),
                                                   column_list);
         }
         res = ExecuteSqlQuery(fout, q->data, PGRES_COPY_OUT);
         PQclear(res);
+       destroyPQExpBuffer(clistBuf);
  
         for (;;)
         {
@@ -1464,7 +1568,7 @@ dumpTableData_insert(Archive *fout, void *dcontext)
         {
                 appendPQExpBuffer(q, "DECLARE _pg_dump_cursor CURSOR FOR "
                                                   "SELECT * FROM ONLY %s",
-                                                 fmtQualifiedId(fout,
+                                                 fmtQualifiedId(fout->remoteVersion,
                                                                                  tbinfo->dobj.namespace->dobj.name,
                                                                                  classname));
         }
@@ -1472,7 +1576,7 @@ dumpTableData_insert(Archive *fout, void *dcontext)
         {
                 appendPQExpBuffer(q, "DECLARE _pg_dump_cursor CURSOR FOR "
                                                   "SELECT * FROM %s",
-                                                 fmtQualifiedId(fout,
+                                                 fmtQualifiedId(fout->remoteVersion,
                                                                                  tbinfo->dobj.namespace->dobj.name,
                                                                                  classname));
         }
@@ -1604,6 +1708,7 @@ dumpTableData(Archive *fout, TableDataInfo *tdinfo)
  {
         TableInfo  *tbinfo = tdinfo->tdtable;
         PQExpBuffer copyBuf = createPQExpBuffer();
+       PQExpBuffer clistBuf = createPQExpBuffer();
         DataDumperPtr dumpFn;
         char       *copyStmt;
  
@@ -1615,7 +1720,7 @@ dumpTableData(Archive *fout, TableDataInfo *tdinfo)
                 appendPQExpBuffer(copyBuf, "COPY %s ",
                                                   fmtId(tbinfo->dobj.name));
                 appendPQExpBuffer(copyBuf, "%s %sFROM stdin;\n",
-                                                 fmtCopyColumnList(tbinfo),
+                                                 fmtCopyColumnList(tbinfo, clistBuf),
                                           (tdinfo->oids && tbinfo->hasoids) ? "WITH OIDS " : "");
                 copyStmt = copyBuf->data;
         }
@@ -1640,6 +1745,7 @@ dumpTableData(Archive *fout, TableDataInfo *tdinfo)
                                  dumpFn, tdinfo);
  
         destroyPQExpBuffer(copyBuf);
+       destroyPQExpBuffer(clistBuf);
  }
  
  /*
@@ -1665,22 +1771,22 @@ refreshMatViewData(Archive *fout, TableDataInfo *tdinfo)
                                           fmtId(tbinfo->dobj.name));
  
         ArchiveEntry(fout,
-                                tdinfo->dobj.catId,                                    /* catalog ID */
-                                tdinfo->dobj.dumpId,                                   /* dump ID */
-                                tbinfo->dobj.name,                                             /* Name */
-                                tbinfo->dobj.namespace->dobj.name,     /* Namespace */
-                                NULL,                                                                  /* Tablespace */
-                                tbinfo->rolname,                                               /* Owner */
-                                false,                                                                 /* with oids */
-                                "MATERIALIZED VIEW DATA",                              /* Desc */
-                                SECTION_POST_DATA,                                             /* Section */
-                                q->data,                                                               /* Create */
-                                "",                                                                    /* Del */
-                                NULL,                                                                  /* Copy */
-                                tdinfo->dobj.dependencies,                             /* Deps */
-                                tdinfo->dobj.nDeps,                                    /* # Deps */
-                                NULL,                                                                  /* Dumper */
-                                NULL);                                                                 /* Dumper Arg */
+                                tdinfo->dobj.catId,    /* catalog ID */
+                                tdinfo->dobj.dumpId,   /* dump ID */
+                                tbinfo->dobj.name,             /* Name */
+                                tbinfo->dobj.namespace->dobj.name,             /* Namespace */
+                                NULL,                  /* Tablespace */
+                                tbinfo->rolname,               /* Owner */
+                                false,                 /* with oids */
+                                "MATERIALIZED VIEW DATA",              /* Desc */
+                                SECTION_POST_DATA,             /* Section */
+                                q->data,               /* Create */
+                                "",                    /* Del */
+                                NULL,                  /* Copy */
+                                tdinfo->dobj.dependencies,             /* Deps */
+                                tdinfo->dobj.nDeps,    /* # Deps */
+                                NULL,                  /* Dumper */
+                                NULL);                 /* Dumper Arg */
  
         destroyPQExpBuffer(q);
  }
@@ -1790,12 +1896,12 @@ buildMatViewRefreshDependencies(Archive *fout)
  
         appendPQExpBuffer(query, "with recursive w as "
                                           "( "
-                                         "select d1.objid, d2.refobjid, c2.relkind as refrelkind "
+                                       "select d1.objid, d2.refobjid, c2.relkind as refrelkind "
                                           "from pg_depend d1 "
                                           "join pg_class c1 on c1.oid = d1.objid "
                                           "and c1.relkind = 'm' "
                                           "join pg_rewrite r1 on r1.ev_class = d1.objid "
-                                         "join pg_depend d2 on d2.classid = 'pg_rewrite'::regclass "
+                                 "join pg_depend d2 on d2.classid = 'pg_rewrite'::regclass "
                                           "and d2.objid = r1.oid "
                                           "and d2.refobjid <> d1.objid "
                                           "join pg_class c2 on c2.oid = d2.refobjid "
@@ -1805,13 +1911,13 @@ buildMatViewRefreshDependencies(Archive *fout)
                                           "select w.objid, d3.refobjid, c3.relkind "
                                           "from w "
                                           "join pg_rewrite r3 on r3.ev_class = w.refobjid "
-                                         "join pg_depend d3 on d3.classid = 'pg_rewrite'::regclass "
+                                 "join pg_depend d3 on d3.classid = 'pg_rewrite'::regclass "
                                           "and d3.objid = r3.oid "
                                           "and d3.refobjid <> w.refobjid "
                                           "join pg_class c3 on c3.oid = d3.refobjid "
                                           "and c3.relkind in ('m','v') "
                                           ") "
-                                         "select 'pg_class'::regclass::oid as classid, objid, refobjid "
+                         "select 'pg_class'::regclass::oid as classid, objid, refobjid "
                                           "from w "
                                           "where refrelkind = 'm'");
  
@@ -1827,10 +1933,10 @@ buildMatViewRefreshDependencies(Archive *fout)
         {
                 CatalogId       objId;
                 CatalogId       refobjId;
-               DumpableObject  *dobj;
-               DumpableObject  *refdobj;
-               TableInfo          *tbinfo;
-               TableInfo          *reftbinfo;
+               DumpableObject *dobj;
+               DumpableObject *refdobj;
+               TableInfo  *tbinfo;
+               TableInfo  *reftbinfo;
  
                 objId.tableoid = atooid(PQgetvalue(res, i, i_classid));
                 objId.oid = atooid(PQgetvalue(res, i, i_objid));
@@ -3760,7 +3866,7 @@ getAggregates(Archive *fout, int *numAggs)
                 appendPQExpBuffer(query, "SELECT tableoid, oid, proname AS aggname, "
                                                   "pronamespace AS aggnamespace, "
                                                   "pronargs, proargtypes, "
-                                                 "pg_catalog.pg_get_function_identity_arguments(oid) AS proiargs,"
+                       "pg_catalog.pg_get_function_identity_arguments(oid) AS proiargs,"
                                                   "(%s proowner) AS rolname, "
                                                   "proacl AS aggacl "
                                                   "FROM pg_proc p "
@@ -3953,7 +4059,7 @@ getFuncs(Archive *fout, int *numFuncs)
                                                   "SELECT tableoid, oid, proname, prolang, "
                                                   "pronargs, proargtypes, prorettype, proacl, "
                                                   "pronamespace, "
-                                                 "pg_catalog.pg_get_function_identity_arguments(oid) AS proiargs,"
+                       "pg_catalog.pg_get_function_identity_arguments(oid) AS proiargs,"
                                                   "(%s proowner) AS rolname "
                                                   "FROM pg_proc p "
                                                   "WHERE NOT proisagg AND ("
@@ -4122,6 +4228,7 @@ getTables(Archive *fout, int *numTables)
         int                     i_reloptions;
         int                     i_toastreloptions;
         int                     i_reloftype;
+       int                     i_relpages;
  
         /* Make sure we are in proper schema */
         selectSourceSchema(fout, "pg_catalog");
@@ -4160,7 +4267,8 @@ getTables(Archive *fout, int *numTables)
                                                   "c.relhasindex, c.relhasrules, c.relhasoids, "
                                                   "c.relfrozenxid, tc.oid AS toid, "
                                                   "tc.relfrozenxid AS tfrozenxid, "
-                                                 "c.relpersistence, pg_relation_is_scannable(c.oid) as isscannable, "
+                "c.relpersistence, pg_relation_is_scannable(c.oid) as isscannable, "
+                                                 "c.relpages, "
                                                   "CASE WHEN c.reloftype <> 0 THEN c.reloftype::pg_catalog.regtype ELSE NULL END AS reloftype, "
                                                   "d.refobjid AS owning_tab, "
                                                   "d.refobjsubid AS owning_col, "
@@ -4174,7 +4282,7 @@ getTables(Archive *fout, int *numTables)
                                                   "d.objsubid = 0 AND "
                                                   "d.refclassid = c.tableoid AND d.deptype = 'a') "
                                            "LEFT JOIN pg_class tc ON (c.reltoastrelid = tc.oid) "
-                                                 "WHERE c.relkind in ('%c', '%c', '%c', '%c', '%c', '%c') "
+                                  "WHERE c.relkind in ('%c', '%c', '%c', '%c', '%c', '%c') "
                                                   "ORDER BY c.oid",
                                                   username_subquery,
                                                   RELKIND_SEQUENCE,
@@ -4210,7 +4318,7 @@ getTables(Archive *fout, int *numTables)
                                                   "d.objsubid = 0 AND "
                                                   "d.refclassid = c.tableoid AND d.deptype = 'a') "
                                            "LEFT JOIN pg_class tc ON (c.reltoastrelid = tc.oid) "
-                                                 "WHERE c.relkind in ('%c', '%c', '%c', '%c', '%c', '%c') "
+                                  "WHERE c.relkind in ('%c', '%c', '%c', '%c', '%c', '%c') "
                                                   "ORDER BY c.oid",
                                                   username_subquery,
                                                   RELKIND_SEQUENCE,
@@ -4233,6 +4341,7 @@ getTables(Archive *fout, int *numTables)
                                                   "c.relfrozenxid, tc.oid AS toid, "
                                                   "tc.relfrozenxid AS tfrozenxid, "
                                                   "'p' AS relpersistence, 't'::bool as isscannable, "
+                                                 "c.relpages, "
                                                   "CASE WHEN c.reloftype <> 0 THEN c.reloftype::pg_catalog.regtype ELSE NULL END AS reloftype, "
                                                   "d.refobjid AS owning_tab, "
                                                   "d.refobjsubid AS owning_col, "
@@ -4268,6 +4377,7 @@ getTables(Archive *fout, int *numTables)
                                                   "c.relfrozenxid, tc.oid AS toid, "
                                                   "tc.relfrozenxid AS tfrozenxid, "
                                                   "'p' AS relpersistence, 't'::bool as isscannable, "
+                                                 "c.relpages, "
                                                   "NULL AS reloftype, "
                                                   "d.refobjid AS owning_tab, "
                                                   "d.refobjsubid AS owning_col, "
@@ -4303,6 +4413,7 @@ getTables(Archive *fout, int *numTables)
                                                   "c.relfrozenxid, tc.oid AS toid, "
                                                   "tc.relfrozenxid AS tfrozenxid, "
                                                   "'p' AS relpersistence, 't'::bool as isscannable, "
+                                                 "c.relpages, "
                                                   "NULL AS reloftype, "
                                                   "d.refobjid AS owning_tab, "
                                                   "d.refobjsubid AS owning_col, "
@@ -4339,6 +4450,7 @@ getTables(Archive *fout, int *numTables)
                                                   "0 AS toid, "
                                                   "0 AS tfrozenxid, "
                                                   "'p' AS relpersistence, 't'::bool as isscannable, "
+                                                 "relpages, "
                                                   "NULL AS reloftype, "
                                                   "d.refobjid AS owning_tab, "
                                                   "d.refobjsubid AS owning_col, "
@@ -4374,6 +4486,7 @@ getTables(Archive *fout, int *numTables)
                                                   "0 AS toid, "
                                                   "0 AS tfrozenxid, "
                                                   "'p' AS relpersistence, 't'::bool as isscannable, "
+                                                 "relpages, "
                                                   "NULL AS reloftype, "
                                                   "d.refobjid AS owning_tab, "
                                                   "d.refobjsubid AS owning_col, "
@@ -4405,6 +4518,7 @@ getTables(Archive *fout, int *numTables)
                                                   "0 AS toid, "
                                                   "0 AS tfrozenxid, "
                                                   "'p' AS relpersistence, 't'::bool as isscannable, "
+                                                 "relpages, "
                                                   "NULL AS reloftype, "
                                                   "NULL::oid AS owning_tab, "
                                                   "NULL::int4 AS owning_col, "
@@ -4431,6 +4545,7 @@ getTables(Archive *fout, int *numTables)
                                                   "0 AS toid, "
                                                   "0 AS tfrozenxid, "
                                                   "'p' AS relpersistence, 't'::bool as isscannable, "
+                                                 "relpages, "
                                                   "NULL AS reloftype, "
                                                   "NULL::oid AS owning_tab, "
                                                   "NULL::int4 AS owning_col, "
@@ -4467,6 +4582,7 @@ getTables(Archive *fout, int *numTables)
                                                   "0 AS toid, "
                                                   "0 AS tfrozenxid, "
                                                   "'p' AS relpersistence, 't'::bool as isscannable, "
+                                                 "0 AS relpages, "
                                                   "NULL AS reloftype, "
                                                   "NULL::oid AS owning_tab, "
                                                   "NULL::int4 AS owning_col, "
@@ -4515,6 +4631,7 @@ getTables(Archive *fout, int *numTables)
         i_toastfrozenxid = PQfnumber(res, "tfrozenxid");
         i_relpersistence = PQfnumber(res, "relpersistence");
         i_isscannable = PQfnumber(res, "isscannable");
+       i_relpages = PQfnumber(res, "relpages");
         i_owning_tab = PQfnumber(res, "owning_tab");
         i_owning_col = PQfnumber(res, "owning_col");
         i_reltablespace = PQfnumber(res, "reltablespace");
@@ -4557,6 +4674,7 @@ getTables(Archive *fout, int *numTables)
                 tblinfo[i].hastriggers = (strcmp(PQgetvalue(res, i, i_relhastriggers), "t") == 0);
                 tblinfo[i].hasoids = (strcmp(PQgetvalue(res, i, i_relhasoids), "t") == 0);
                 tblinfo[i].isscannable = (strcmp(PQgetvalue(res, i, i_isscannable), "t") == 0);
+               tblinfo[i].relpages = atoi(PQgetvalue(res, i, i_relpages));
                 tblinfo[i].frozenxid = atooid(PQgetvalue(res, i, i_relfrozenxid));
                 tblinfo[i].toast_oid = atooid(PQgetvalue(res, i, i_toastoid));
                 tblinfo[i].toast_frozenxid = atooid(PQgetvalue(res, i, i_toastfrozenxid));
@@ -4606,7 +4724,7 @@ getTables(Archive *fout, int *numTables)
                         resetPQExpBuffer(query);
                         appendPQExpBuffer(query,
                                                           "LOCK TABLE %s IN ACCESS SHARE MODE",
-                                                         fmtQualifiedId(fout,
+                                                         fmtQualifiedId(fout->remoteVersion,
                                                                                 tblinfo[i].dobj.namespace->dobj.name,
                                                                                          tblinfo[i].dobj.name));
                         ExecuteSqlStatement(fout, query->data);
@@ -4745,7 +4863,8 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables)
                                 i_conoid,
                                 i_condef,
                                 i_tablespace,
-                               i_options;
+                               i_options,
+                               i_relpages;
         int                     ntups;
  
         for (i = 0; i < numTables; i++)
@@ -4790,6 +4909,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables)
                                          "pg_catalog.pg_get_indexdef(i.indexrelid) AS indexdef, "
                                                           "t.relnatts AS indnkeys, "
                                                           "i.indkey, i.indisclustered, "
+                                                         "t.relpages, "
                                                           "c.contype, c.conname, "
                                                           "c.condeferrable, c.condeferred, "
                                                           "c.tableoid AS contableoid, "
@@ -4815,6 +4935,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables)
                                          "pg_catalog.pg_get_indexdef(i.indexrelid) AS indexdef, "
                                                           "t.relnatts AS indnkeys, "
                                                           "i.indkey, i.indisclustered, "
+                                                         "t.relpages, "
                                                           "c.contype, c.conname, "
                                                           "c.condeferrable, c.condeferred, "
                                                           "c.tableoid AS contableoid, "
@@ -4843,6 +4964,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables)
                                          "pg_catalog.pg_get_indexdef(i.indexrelid) AS indexdef, "
                                                           "t.relnatts AS indnkeys, "
                                                           "i.indkey, i.indisclustered, "
+                                                         "t.relpages, "
                                                           "c.contype, c.conname, "
                                                           "c.condeferrable, c.condeferred, "
                                                           "c.tableoid AS contableoid, "
@@ -4871,6 +4993,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables)
                                          "pg_catalog.pg_get_indexdef(i.indexrelid) AS indexdef, "
                                                           "t.relnatts AS indnkeys, "
                                                           "i.indkey, i.indisclustered, "
+                                                         "t.relpages, "
                                                           "c.contype, c.conname, "
                                                           "c.condeferrable, c.condeferred, "
                                                           "c.tableoid AS contableoid, "
@@ -4899,6 +5022,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables)
                                                           "pg_get_indexdef(i.indexrelid) AS indexdef, "
                                                           "t.relnatts AS indnkeys, "
                                                           "i.indkey, false AS indisclustered, "
+                                                         "t.relpages, "
                                                           "CASE WHEN i.indisprimary THEN 'p'::char "
                                                           "ELSE '0'::char END AS contype, "
                                                           "t.relname AS conname, "
@@ -4925,6 +5049,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables)
                                                           "pg_get_indexdef(i.indexrelid) AS indexdef, "
                                                           "t.relnatts AS indnkeys, "
                                                           "i.indkey, false AS indisclustered, "
+                                                         "t.relpages, "
                                                           "CASE WHEN i.indisprimary THEN 'p'::char "
                                                           "ELSE '0'::char END AS contype, "
                                                           "t.relname AS conname, "
@@ -4953,6 +5078,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables)
                 i_indnkeys = PQfnumber(res, "indnkeys");
                 i_indkey = PQfnumber(res, "indkey");
                 i_indisclustered = PQfnumber(res, "indisclustered");
+               i_relpages = PQfnumber(res, "relpages");
                 i_contype = PQfnumber(res, "contype");
                 i_conname = PQfnumber(res, "conname");
                 i_condeferrable = PQfnumber(res, "condeferrable");
@@ -4995,6 +5121,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables)
                         parseOidArray(PQgetvalue(res, j, i_indkey),
                                                   indxinfo[j].indkeys, INDEX_MAX_KEYS);
                         indxinfo[j].indisclustered = (PQgetvalue(res, j, i_indisclustered)[0] == 't');
+                       indxinfo[j].relpages = atoi(PQgetvalue(res, j, i_relpages));
                         contype = *(PQgetvalue(res, j, i_contype));
  
                         if (contype == 'p' || contype == 'u' || contype == 'x')
@@ -5345,7 +5472,7 @@ getRules(Archive *fout, int *numRules)
                          * table.
                          */
                         if ((ruleinfo[i].ruletable->relkind == RELKIND_VIEW ||
-                            ruleinfo[i].ruletable->relkind == RELKIND_MATVIEW) &&
+                                ruleinfo[i].ruletable->relkind == RELKIND_MATVIEW) &&
                                 ruleinfo[i].ev_type == '1' && ruleinfo[i].is_instead)
                         {
                                 addObjectDependency(&ruleinfo[i].ruletable->dobj,
@@ -7733,11 +7860,11 @@ dumpExtension(Archive *fout, ExtensionInfo *extinfo)
                 appendPQExpBuffer(q, "-- For binary upgrade, create an empty extension and insert objects into it\n");
  
                 /*
-                *      We unconditionally create the extension, so we must drop it if it
-                *      exists.  This could happen if the user deleted 'plpgsql' and then
-                *      readded it, causing its oid to be greater than FirstNormalObjectId.
-                *      The FirstNormalObjectId test was kept to avoid repeatedly dropping
-                *      and recreating extensions like 'plpgsql'.
+                * We unconditionally create the extension, so we must drop it if it
+                * exists.      This could happen if the user deleted 'plpgsql' and then
+                * readded it, causing its oid to be greater than FirstNormalObjectId.
+                * The FirstNormalObjectId test was kept to avoid repeatedly dropping
+                * and recreating extensions like 'plpgsql'.
                  */
                 appendPQExpBuffer(q, "DROP EXTENSION IF EXISTS %s;\n", qextname);
  
@@ -12138,7 +12265,7 @@ dumpDefaultACL(Archive *fout, DefaultACLInfo *daclinfo)
                 default:
                         /* shouldn't get here */
                         exit_horribly(NULL,
-                                                 "unrecognized object type in default privileges: %d\n",
+                                         "unrecognized object type in default privileges: %d\n",
                                                   (int) daclinfo->defaclobjtype);
                         type = "";                      /* keep compiler quiet */
         }
@@ -12611,7 +12738,7 @@ createViewAsClause(Archive *fout, TableInfo *tbinfo)
         {
                 /* Beginning in 7.3, viewname is not unique; rely on OID */
                 appendPQExpBuffer(query,
-                                                 "SELECT pg_catalog.pg_get_viewdef('%u'::pg_catalog.oid) AS viewdef",
+                "SELECT pg_catalog.pg_get_viewdef('%u'::pg_catalog.oid) AS viewdef",
                                                   tbinfo->dobj.catId.oid);
         }
         else
@@ -12641,7 +12768,7 @@ createViewAsClause(Archive *fout, TableInfo *tbinfo)
                                           tbinfo->dobj.name);
  
         /* Strip off the trailing semicolon so that other things may follow. */
-       Assert(PQgetvalue(res, 0, 0)[len-1] == ';');
+       Assert(PQgetvalue(res, 0, 0)[len - 1] == ';');
         appendBinaryPQExpBuffer(result, PQgetvalue(res, 0, 0), len - 1);
  
         PQclear(res);
@@ -12712,37 +12839,37 @@ dumpTableSchema(Archive *fout, TableInfo *tbinfo)
                 switch (tbinfo->relkind)
                 {
                         case (RELKIND_FOREIGN_TABLE):
-                       {
-                               PQExpBuffer query = createPQExpBuffer();
-                               PGresult   *res;
-                               int                     i_srvname;
-                               int                     i_ftoptions;
-
-                               reltypename = "FOREIGN TABLE";
-
-                               /* retrieve name of foreign server and generic options */
-                               appendPQExpBuffer(query,
-                                                                 "SELECT fs.srvname, "
-                                                                 "pg_catalog.array_to_string(ARRAY("
-                                                                 "SELECT pg_catalog.quote_ident(option_name) || "
-                                                                 "' ' || pg_catalog.quote_literal(option_value) "
-                                                               "FROM pg_catalog.pg_options_to_table(ftoptions) "
-                                                                 "ORDER BY option_name"
-                                                                 "), E',\n    ') AS ftoptions "
-                                                                 "FROM pg_catalog.pg_foreign_table ft "
-                                                                 "JOIN pg_catalog.pg_foreign_server fs "
-                                                                 "ON (fs.oid = ft.ftserver) "
-                                                                 "WHERE ft.ftrelid = '%u'",
-                                                                 tbinfo->dobj.catId.oid);
-                               res = ExecuteSqlQueryForSingleRow(fout, query->data);
-                               i_srvname = PQfnumber(res, "srvname");
-                               i_ftoptions = PQfnumber(res, "ftoptions");
-                               srvname = pg_strdup(PQgetvalue(res, 0, i_srvname));
-                               ftoptions = pg_strdup(PQgetvalue(res, 0, i_ftoptions));
-                               PQclear(res);
-                               destroyPQExpBuffer(query);
-                               break;
-                       }
+                               {
+                                       PQExpBuffer query = createPQExpBuffer();
+                                       PGresult   *res;
+                                       int                     i_srvname;
+                                       int                     i_ftoptions;
+
+                                       reltypename = "FOREIGN TABLE";
+
+                                       /* retrieve name of foreign server and generic options */
+                                       appendPQExpBuffer(query,
+                                                                         "SELECT fs.srvname, "
+                                                                         "pg_catalog.array_to_string(ARRAY("
+                                                        "SELECT pg_catalog.quote_ident(option_name) || "
+                                                        "' ' || pg_catalog.quote_literal(option_value) "
+                                                       "FROM pg_catalog.pg_options_to_table(ftoptions) "
+                                                                         "ORDER BY option_name"
+                                                                         "), E',\n    ') AS ftoptions "
+                                                                         "FROM pg_catalog.pg_foreign_table ft "
+                                                                         "JOIN pg_catalog.pg_foreign_server fs "
+                                                                         "ON (fs.oid = ft.ftserver) "
+                                                                         "WHERE ft.ftrelid = '%u'",
+                                                                         tbinfo->dobj.catId.oid);
+                                       res = ExecuteSqlQueryForSingleRow(fout, query->data);
+                                       i_srvname = PQfnumber(res, "srvname");
+                                       i_ftoptions = PQfnumber(res, "ftoptions");
+                                       srvname = pg_strdup(PQgetvalue(res, 0, i_srvname));
+                                       ftoptions = pg_strdup(PQgetvalue(res, 0, i_ftoptions));
+                                       PQclear(res);
+                                       destroyPQExpBuffer(query);
+                                       break;
+                               }
                         case (RELKIND_MATVIEW):
                                 reltypename = "MATERIALIZED VIEW";
                                 srvname = NULL;
@@ -12788,156 +12915,158 @@ dumpTableSchema(Archive *fout, TableInfo *tbinfo)
  
                 if (tbinfo->relkind != RELKIND_MATVIEW)
                 {
-               /* Dump the attributes */
-               actual_atts = 0;
-               for (j = 0; j < tbinfo->numatts; j++)
-               {
-                       /*
-                        * Normally, dump if it's locally defined in this table, and not
-                        * dropped.  But for binary upgrade, we'll dump all the columns,
-                        * and then fix up the dropped and nonlocal cases below.
-                        */
-                       if (shouldPrintColumn(tbinfo, j))
+                       /* Dump the attributes */
+                       actual_atts = 0;
+                       for (j = 0; j < tbinfo->numatts; j++)
                         {
                                 /*
-                                * Default value --- suppress if to be printed separately.
+                                * Normally, dump if it's locally defined in this table, and
+                                * not dropped.  But for binary upgrade, we'll dump all the
+                                * columns, and then fix up the dropped and nonlocal cases
+                                * below.
                                  */
-                               bool            has_default = (tbinfo->attrdefs[j] != NULL &&
-                                                                                  !tbinfo->attrdefs[j]->separate);
-
-                               /*
-                                * Not Null constraint --- suppress if inherited, except in
-                                * binary-upgrade case where that won't work.
-                                */
-                               bool            has_notnull = (tbinfo->notnull[j] &&
-                                                                                  (!tbinfo->inhNotNull[j] ||
-                                                                                       binary_upgrade));
-
-                               /* Skip column if fully defined by reloftype */
-                               if (tbinfo->reloftype &&
-                                       !has_default && !has_notnull && !binary_upgrade)
-                                       continue;
-
-                               /* Format properly if not first attr */
-                               if (actual_atts == 0)
-                                       appendPQExpBuffer(q, " (");
-                               else
-                                       appendPQExpBuffer(q, ",");
-                               appendPQExpBuffer(q, "\n    ");
-                               actual_atts++;
-
-                               /* Attribute name */
-                               appendPQExpBuffer(q, "%s",
-                                                                 fmtId(tbinfo->attnames[j]));
-
-                               if (tbinfo->attisdropped[j])
+                               if (shouldPrintColumn(tbinfo, j))
                                 {
                                         /*
-                                        * ALTER TABLE DROP COLUMN clears pg_attribute.atttypid,
-                                        * so we will not have gotten a valid type name; insert
-                                        * INTEGER as a stopgap.  We'll clean things up later.
+                                        * Default value --- suppress if to be printed separately.
                                          */
-                                       appendPQExpBuffer(q, " INTEGER /* dummy */");
-                                       /* Skip all the rest, too */
-                                       continue;
-                               }
+                                       bool            has_default = (tbinfo->attrdefs[j] != NULL &&
+                                                                                        !tbinfo->attrdefs[j]->separate);
  
-                               /* Attribute type */
-                               if (tbinfo->reloftype && !binary_upgrade)
-                               {
-                                       appendPQExpBuffer(q, " WITH OPTIONS");
-                               }
-                               else if (fout->remoteVersion >= 70100)
-                               {
-                                       appendPQExpBuffer(q, " %s",
-                                                                         tbinfo->atttypnames[j]);
-                               }
-                               else
-                               {
-                                       /* If no format_type, fake it */
-                                       appendPQExpBuffer(q, " %s",
-                                                                         myFormatType(tbinfo->atttypnames[j],
-                                                                                                  tbinfo->atttypmod[j]));
-                               }
+                                       /*
+                                        * Not Null constraint --- suppress if inherited, except
+                                        * in binary-upgrade case where that won't work.
+                                        */
+                                       bool            has_notnull = (tbinfo->notnull[j] &&
+                                                                                          (!tbinfo->inhNotNull[j] ||
+                                                                                               binary_upgrade));
+
+                                       /* Skip column if fully defined by reloftype */
+                                       if (tbinfo->reloftype &&
+                                               !has_default && !has_notnull && !binary_upgrade)
+                                               continue;
+
+                                       /* Format properly if not first attr */
+                                       if (actual_atts == 0)
+                                               appendPQExpBuffer(q, " (");
+                                       else
+                                               appendPQExpBuffer(q, ",");
+                                       appendPQExpBuffer(q, "\n    ");
+                                       actual_atts++;
  
-                               /* Add collation if not default for the type */
-                               if (OidIsValid(tbinfo->attcollation[j]))
-                               {
-                                       CollInfo   *coll;
+                                       /* Attribute name */
+                                       appendPQExpBuffer(q, "%s",
+                                                                         fmtId(tbinfo->attnames[j]));
+
+                                       if (tbinfo->attisdropped[j])
+                                       {
+                                               /*
+                                                * ALTER TABLE DROP COLUMN clears
+                                                * pg_attribute.atttypid, so we will not have gotten a
+                                                * valid type name; insert INTEGER as a stopgap. We'll
+                                                * clean things up later.
+                                                */
+                                               appendPQExpBuffer(q, " INTEGER /* dummy */");
+                                               /* Skip all the rest, too */
+                                               continue;
+                                       }
+
+                                       /* Attribute type */
+                                       if (tbinfo->reloftype && !binary_upgrade)
+                                       {
+                                               appendPQExpBuffer(q, " WITH OPTIONS");
+                                       }
+                                       else if (fout->remoteVersion >= 70100)
+                                       {
+                                               appendPQExpBuffer(q, " %s",
+                                                                                 tbinfo->atttypnames[j]);
+                                       }
+                                       else
+                                       {
+                                               /* If no format_type, fake it */
+                                               appendPQExpBuffer(q, " %s",
+                                                                                 myFormatType(tbinfo->atttypnames[j],
+                                                                                                          tbinfo->atttypmod[j]));
+                                       }
  
-                                       coll = findCollationByOid(tbinfo->attcollation[j]);
-                                       if (coll)
+                                       /* Add collation if not default for the type */
+                                       if (OidIsValid(tbinfo->attcollation[j]))
                                         {
-                                               /* always schema-qualify, don't try to be smart */
-                                               appendPQExpBuffer(q, " COLLATE %s.",
+                                               CollInfo   *coll;
+
+                                               coll = findCollationByOid(tbinfo->attcollation[j]);
+                                               if (coll)
+                                               {
+                                                       /* always schema-qualify, don't try to be smart */
+                                                       appendPQExpBuffer(q, " COLLATE %s.",
                                                                          fmtId(coll->dobj.namespace->dobj.name));
-                                               appendPQExpBuffer(q, "%s",
-                                                                                 fmtId(coll->dobj.name));
+                                                       appendPQExpBuffer(q, "%s",
+                                                                                         fmtId(coll->dobj.name));
+                                               }
                                         }
-                               }
  
-                               if (has_default)
-                                       appendPQExpBuffer(q, " DEFAULT %s",
-                                                                         tbinfo->attrdefs[j]->adef_expr);
+                                       if (has_default)
+                                               appendPQExpBuffer(q, " DEFAULT %s",
+                                                                                 tbinfo->attrdefs[j]->adef_expr);
  
-                               if (has_notnull)
-                                       appendPQExpBuffer(q, " NOT NULL");
+                                       if (has_notnull)
+                                               appendPQExpBuffer(q, " NOT NULL");
+                               }
                         }
-               }
  
-               /*
-                * Add non-inherited CHECK constraints, if any.
-                */
-               for (j = 0; j < tbinfo->ncheck; j++)
-               {
-                       ConstraintInfo *constr = &(tbinfo->checkexprs[j]);
+                       /*
+                        * Add non-inherited CHECK constraints, if any.
+                        */
+                       for (j = 0; j < tbinfo->ncheck; j++)
+                       {
+                               ConstraintInfo *constr = &(tbinfo->checkexprs[j]);
  
-                       if (constr->separate || !constr->conislocal)
-                               continue;
+                               if (constr->separate || !constr->conislocal)
+                                       continue;
  
-                       if (actual_atts == 0)
-                               appendPQExpBuffer(q, " (\n    ");
-                       else
-                               appendPQExpBuffer(q, ",\n    ");
+                               if (actual_atts == 0)
+                                       appendPQExpBuffer(q, " (\n    ");
+                               else
+                                       appendPQExpBuffer(q, ",\n    ");
  
-                       appendPQExpBuffer(q, "CONSTRAINT %s ",
-                                                         fmtId(constr->dobj.name));
-                       appendPQExpBuffer(q, "%s", constr->condef);
+                               appendPQExpBuffer(q, "CONSTRAINT %s ",
+                                                                 fmtId(constr->dobj.name));
+                               appendPQExpBuffer(q, "%s", constr->condef);
  
-                       actual_atts++;
-               }
+                               actual_atts++;
+                       }
  
-               if (actual_atts)
-                       appendPQExpBuffer(q, "\n)");
-               else if (!(tbinfo->reloftype && !binary_upgrade))
-               {
-                       /*
-                        * We must have a parenthesized attribute list, even though empty,
-                        * when not using the OF TYPE syntax.
-                        */
-                       appendPQExpBuffer(q, " (\n)");
-               }
+                       if (actual_atts)
+                               appendPQExpBuffer(q, "\n)");
+                       else if (!(tbinfo->reloftype && !binary_upgrade))
+                       {
+                               /*
+                                * We must have a parenthesized attribute list, even though
+                                * empty, when not using the OF TYPE syntax.
+                                */
+                               appendPQExpBuffer(q, " (\n)");
+                       }
  
-               if (numParents > 0 && !binary_upgrade)
-               {
-                       appendPQExpBuffer(q, "\nINHERITS (");
-                       for (k = 0; k < numParents; k++)
+                       if (numParents > 0 && !binary_upgrade)
                         {
-                               TableInfo  *parentRel = parents[k];
+                               appendPQExpBuffer(q, "\nINHERITS (");
+                               for (k = 0; k < numParents; k++)
+                               {
+                                       TableInfo  *parentRel = parents[k];
  
-                               if (k > 0)
-                                       appendPQExpBuffer(q, ", ");
-                               if (parentRel->dobj.namespace != tbinfo->dobj.namespace)
-                                       appendPQExpBuffer(q, "%s.",
+                                       if (k > 0)
+                                               appendPQExpBuffer(q, ", ");
+                                       if (parentRel->dobj.namespace != tbinfo->dobj.namespace)
+                                               appendPQExpBuffer(q, "%s.",
                                                                 fmtId(parentRel->dobj.namespace->dobj.name));
-                               appendPQExpBuffer(q, "%s",
-                                                                 fmtId(parentRel->dobj.name));
+                                       appendPQExpBuffer(q, "%s",
+                                                                         fmtId(parentRel->dobj.name));
+                               }
+                               appendPQExpBuffer(q, ")");
                         }
-                       appendPQExpBuffer(q, ")");
-               }
  
-               if (tbinfo->relkind == RELKIND_FOREIGN_TABLE)
-                       appendPQExpBuffer(q, "\nSERVER %s", fmtId(srvname));
+                       if (tbinfo->relkind == RELKIND_FOREIGN_TABLE)
+                               appendPQExpBuffer(q, "\nSERVER %s", fmtId(srvname));
                 }
  
                 if ((tbinfo->reloptions && strlen(tbinfo->reloptions) > 0) ||
@@ -13853,8 +13982,8 @@ dumpSequence(Archive *fout, TableInfo *tbinfo)
  
         /*
          * If the sequence is owned by a table column, emit the ALTER for it as a
-        * separate TOC entry immediately following the sequence's own entry.
-        * It's OK to do this rather than using full sorting logic, because the
+        * separate TOC entry immediately following the sequence's own entry. It's
+        * OK to do this rather than using full sorting logic, because the
          * dependency that tells us it's owned will have forced the table to be
          * created first.  We can't just include the ALTER in the TOC entry
          * because it will fail if we haven't reassigned the sequence owner to
@@ -14763,7 +14892,7 @@ addBoundaryDependencies(DumpableObject **dobjs, int numObjs,
   * chains linking through objects that don't appear explicitly in the dump.
   * For example, a view will depend on its _RETURN rule while the _RETURN rule
   * will depend on other objects --- but the rule will not appear as a separate
- * object in the dump.  We need to adjust the view's dependencies to include
+ * object in the dump. We need to adjust the view's dependencies to include
   * whatever the rule depends on that is included in the dump.
   *
   * Just to make things more complicated, there are also "special" dependencies
@@ -14851,7 +14980,7 @@ findDumpableDependencies(ArchiveHandle *AH, DumpableObject *dobj,
                         {
                                 *allocDeps *= 2;
                                 *dependencies = (DumpId *) pg_realloc(*dependencies,
-                                                                                         *allocDeps * sizeof(DumpId));
+                                                                                               *allocDeps * sizeof(DumpId));
                         }
                         (*dependencies)[*nDeps] = depid;
                         (*nDeps)++;
@@ -14859,9 +14988,9 @@ findDumpableDependencies(ArchiveHandle *AH, DumpableObject *dobj,
                 else
                 {
                         /*
-                        * Object will not be dumped, so recursively consider its deps.
-                        * We rely on the assumption that sortDumpableObjects already
-                        * broke any dependency loops, else we might recurse infinitely.
+                        * Object will not be dumped, so recursively consider its deps. We
+                        * rely on the assumption that sortDumpableObjects already broke
+                        * any dependency loops, else we might recurse infinitely.
                          */
                         DumpableObject *otherdobj = findObjectByDumpId(depid);
  
@@ -14884,22 +15013,21 @@ findDumpableDependencies(ArchiveHandle *AH, DumpableObject *dobj,
   *
   * Whenever the selected schema is not pg_catalog, be careful to qualify
   * references to system catalogs and types in our emitted commands!
+ *
+ * This function is called only from selectSourceSchemaOnAH and
+ * selectSourceSchema.
   */
  static void
  selectSourceSchema(Archive *fout, const char *schemaName)
  {
-       static char *curSchemaName = NULL;
         PQExpBuffer query;
  
+       /* This is checked by the callers already */
+       Assert(schemaName != NULL && *schemaName != '\0');
+
         /* Not relevant if fetching from pre-7.3 DB */
         if (fout->remoteVersion < 70300)
                 return;
-       /* Ignore null schema names */
-       if (schemaName == NULL || *schemaName == '\0')
-               return;
-       /* Optimize away repeated selection of same schema */
-       if (curSchemaName && strcmp(curSchemaName, schemaName) == 0)
-               return;
  
         query = createPQExpBuffer();
         appendPQExpBuffer(query, "SET search_path = %s",
@@ -14910,9 +15038,6 @@ selectSourceSchema(Archive *fout, const char *schemaName)
         ExecuteSqlStatement(fout, query->data);
  
         destroyPQExpBuffer(query);
-       if (curSchemaName)
-               free(curSchemaName);
-       curSchemaName = pg_strdup(schemaName);
  }
  
  /*
@@ -15049,34 +15174,6 @@ myFormatType(const char *typname, int32 typmod)
         return result;
  }
  
-/*
- * fmtQualifiedId - convert a qualified name to the proper format for
- * the source database.
- *
- * Like fmtId, use the result before calling again.
- */
-static const char *
-fmtQualifiedId(Archive *fout, const char *schema, const char *id)
-{
-       static PQExpBuffer id_return = NULL;
-
-       if (id_return)                          /* first time through? */
-               resetPQExpBuffer(id_return);
-       else
-               id_return = createPQExpBuffer();
-
-       /* Suppress schema name if fetching from pre-7.3 DB */
-       if (fout->remoteVersion >= 70300 && schema && *schema)
-       {
-               appendPQExpBuffer(id_return, "%s.",
-                                                 fmtId(schema));
-       }
-       appendPQExpBuffer(id_return, "%s",
-                                         fmtId(id));
-
-       return id_return->data;
-}
-
  /*
   * Return a column list clause for the given relation.
   *
@@ -15084,37 +15181,31 @@ fmtQualifiedId(Archive *fout, const char *schema, const char *id)
   * "", not an invalid "()" column list.
   */
  static const char *
-fmtCopyColumnList(const TableInfo *ti)
+fmtCopyColumnList(const TableInfo *ti, PQExpBuffer buffer)
  {
-       static PQExpBuffer q = NULL;
         int                     numatts = ti->numatts;
         char      **attnames = ti->attnames;
         bool       *attisdropped = ti->attisdropped;
         bool            needComma;
         int                     i;
  
-       if (q)                                          /* first time through? */
-               resetPQExpBuffer(q);
-       else
-               q = createPQExpBuffer();
-
-       appendPQExpBuffer(q, "(");
+       appendPQExpBuffer(buffer, "(");
         needComma = false;
         for (i = 0; i < numatts; i++)
         {
                 if (attisdropped[i])
                         continue;
                 if (needComma)
-                       appendPQExpBuffer(q, ", ");
-               appendPQExpBuffer(q, "%s", fmtId(attnames[i]));
+                       appendPQExpBuffer(buffer, ", ");
+               appendPQExpBuffer(buffer, "%s", fmtId(attnames[i]));
                 needComma = true;
         }
  
         if (!needComma)
                 return "";                              /* no undropped columns */
  
-       appendPQExpBuffer(q, ")");
-       return q->data;
+       appendPQExpBuffer(buffer, ")");
+       return buffer->data;
  }
  
  /*
diff --git a/src/bin/pg_dump/pg_dump.h b/src/bin/pg_dump/pg_dump.h

index 01ec27b63230e7cd7050d3c22a98f634785ea6bb..7970a359bd80ceec7bca4aa62cb625c84a351bf6 100644 (file)
--- a/src/bin/pg_dump/pg_dump.h
+++ b/src/bin/pg_dump/pg_dump.h
@@ -252,6 +252,7 @@ typedef struct _tableInfo
         /* these two are set only if table is a sequence owned by a column: */
         Oid                     owning_tab;             /* OID of table owning sequence */
         int                     owning_col;             /* attr # of column owning sequence */
+       int                     relpages;
  
         bool            interesting;    /* true if need to collect more data */
  
@@ -315,6 +316,7 @@ typedef struct _indxInfo
         bool            indisclustered;
         /* if there is an associated constraint object, its dumpId: */
         DumpId          indexconstraint;
+       int                     relpages;               /* relpages of the underlying table */
  } IndxInfo;
  
  typedef struct _ruleInfo
@@ -532,6 +534,7 @@ extern void sortDumpableObjects(DumpableObject **objs, int numObjs,
                                         DumpId preBoundaryId, DumpId postBoundaryId);
  extern void sortDumpableObjectsByTypeName(DumpableObject **objs, int numObjs);
  extern void sortDumpableObjectsByTypeOid(DumpableObject **objs, int numObjs);
+extern void sortDataAndIndexObjectsBySize(DumpableObject **objs, int numObjs);
  
  /*
   * version specific routines
diff --git a/src/bin/pg_dump/pg_dump_sort.c b/src/bin/pg_dump/pg_dump_sort.c

index 2c3d850f3ddbc06f982f76cb19220b42fe3f69dc..8a6d36329df7c8a8007a485d73a957de16b96084 100644 (file)
--- a/src/bin/pg_dump/pg_dump_sort.c
+++ b/src/bin/pg_dump/pg_dump_sort.c
@@ -143,6 +143,96 @@ static void repairDependencyLoop(DumpableObject **loop,
  static void describeDumpableObject(DumpableObject *obj,
                                            char *buf, int bufsize);
  
+static int     DOSizeCompare(const void *p1, const void *p2);
+
+/*
+ * findFirstEqualType
+ *
+ * Scan objs[0 .. numObjs-1] from the front and return the index of the first
+ * entry whose objType equals "type".  Returns -1 if no entry matches (which
+ * includes the case of an empty array).
+ */
+static int
+findFirstEqualType(DumpableObjectType type, DumpableObject **objs, int numObjs)
+{
+       int                     pos = 0;
+
+       /* advance past every entry that is not of the wanted type */
+       while (pos < numObjs && objs[pos]->objType != type)
+               pos++;
+
+       return (pos < numObjs) ? pos : -1;
+}
+
+/*
+ * findFirstDifferentType
+ *
+ * Starting at index "start", return the index of the first entry of
+ * objs[0 .. numObjs-1] whose objType is NOT "type".  If every entry from
+ * "start" to the end of the array is of that type, return numObjs.
+ *
+ * The result is therefore always an exclusive end bound for the run of
+ * same-typed entries beginning at "start", so that (result - start) is the
+ * length of the run.  The callers in this file pass exactly that difference
+ * to qsort() as the element count; returning numObjs - 1 for a run that
+ * extends to the end of the array would silently leave the run's last
+ * element out of the sort.
+ */
+static int
+findFirstDifferentType(DumpableObjectType type, DumpableObject **objs, int numObjs, int start)
+{
+       int                     i;
+
+       for (i = start; i < numObjs; i++)
+               if (objs[i]->objType != type)
+                       return i;
+
+       /* run reaches the end of the array: end bound is one past the last entry */
+       return numObjs;
+}
+
+/*
+ * sortDataAndIndexObjectsBySize
+ *
+ * When we do a parallel dump, we want to start with the largest items first.
+ *
+ * Say we have the objects in this order:
+ * ....DDDDD....III....
+ *
+ * with D = Table data, I = Index, . = other object
+ *
+ * For each of the two types (table data first, then indexes) this function
+ * looks up the first contiguous block of that type and reorders just that
+ * block in place using DOSizeCompare, i.e. biggest first.  Entries outside
+ * the located block are not moved; only the first block of each type is
+ * considered.  Arrays with fewer than two entries are returned untouched.
+ */
+void
+sortDataAndIndexObjectsBySize(DumpableObject **objs, int numObjs)
+{
+       static const DumpableObjectType blockTypes[] = {DO_TABLE_DATA, DO_INDEX};
+       const int       nBlockTypes = (int) (sizeof(blockTypes) / sizeof(blockTypes[0]));
+       int                     t;
+
+       /* nothing to reorder */
+       if (numObjs <= 1)
+               return;
+
+       for (t = 0; t < nBlockTypes; t++)
+       {
+               int                     blockStart;
+               int                     blockEnd;
+
+               blockStart = findFirstEqualType(blockTypes[t], objs, numObjs);
+               if (blockStart < 0)
+                       continue;               /* no object of this type present */
+
+               /* the block ends wherever findFirstDifferentType says it does */
+               blockEnd = findFirstDifferentType(blockTypes[t], objs, numObjs,
+                                                                                 blockStart);
+
+               qsort(objs + blockStart, blockEnd - blockStart,
+                         sizeof(DumpableObject *), DOSizeCompare);
+       }
+}
+
+/*
+ * DOSizeCompare
+ *
+ * qsort comparator over an array of DumpableObject pointers, ordering the
+ * objects by descending size.
+ *
+ * The size of a DO_TABLE_DATA object is taken from the relpages field of its
+ * table (tdtable->relpages); the size of a DO_INDEX object is the relpages
+ * field of its IndxInfo.  Objects of any other type are given size 0.
+ *
+ * Returns -1 if the first object is larger, 1 if the second is larger, and 0
+ * if both have the same size.
+ */
+static int
+DOSizeCompare(const void *p1, const void *p2)
+{
+       DumpableObject *left = *(DumpableObject **) p1;
+       DumpableObject *right = *(DumpableObject **) p2;
+       int                     leftPages;
+       int                     rightPages;
+
+       switch (left->objType)
+       {
+               case DO_TABLE_DATA:
+                       leftPages = ((TableDataInfo *) left)->tdtable->relpages;
+                       break;
+               case DO_INDEX:
+                       leftPages = ((IndxInfo *) left)->relpages;
+                       break;
+               default:
+                       leftPages = 0;
+                       break;
+       }
+
+       switch (right->objType)
+       {
+               case DO_TABLE_DATA:
+                       rightPages = ((TableDataInfo *) right)->tdtable->relpages;
+                       break;
+               case DO_INDEX:
+                       rightPages = ((IndxInfo *) right)->relpages;
+                       break;
+               default:
+                       rightPages = 0;
+                       break;
+       }
+
+       /* we want to see the biggest item go first */
+       if (leftPages != rightPages)
+               return (leftPages > rightPages) ? -1 : 1;
+
+       return 0;
+}
  
  /*
   * Sort the given objects into a type/name-based ordering
@@ -735,7 +825,7 @@ repairViewRuleMultiLoop(DumpableObject *viewobj,
         /* remove view's dependency on rule */
         removeObjectDependency(viewobj, ruleobj->dumpId);
         /* pretend view is a plain table and dump it that way */
-       viewinfo->relkind = 'r';                /* RELKIND_RELATION */
+       viewinfo->relkind = 'r';        /* RELKIND_RELATION */
         /* mark rule as needing its own dump */
         ruleinfo->separate = true;
         /* move any reloptions from view to rule */
diff --git a/src/bin/pg_dump/pg_dumpall.c b/src/bin/pg_dump/pg_dumpall.c

index 5488021bcae0f9a52d04a62ffed3e91c5f115991..b26aa99f03f1028b6679e36fc96db29587a357c9 100644 (file)
--- a/src/bin/pg_dump/pg_dumpall.c
+++ b/src/bin/pg_dump/pg_dumpall.c
@@ -657,8 +657,8 @@ dumpRoles(PGconn *conn)
                                                   "rolcreaterole, rolcreatedb, "
                                                   "rolcanlogin, rolconnlimit, rolpassword, "
                                                   "rolvaliduntil, rolreplication, "
-                         "pg_catalog.shobj_description(oid, 'pg_authid') as rolcomment, "
-                                                 "rolname = current_user AS is_current_user "
+                        "pg_catalog.shobj_description(oid, 'pg_authid') as rolcomment, "
+                                                 "rolname = current_user AS is_current_user "
                                                   "FROM pg_authid "
                                                   "ORDER BY 2");
         else if (server_version >= 80200)
@@ -667,8 +667,8 @@ dumpRoles(PGconn *conn)
                                                   "rolcreaterole, rolcreatedb, "
                                                   "rolcanlogin, rolconnlimit, rolpassword, "
                                                   "rolvaliduntil, false as rolreplication, "
-                         "pg_catalog.shobj_description(oid, 'pg_authid') as rolcomment, "
-                                                 "rolname = current_user AS is_current_user "
+                        "pg_catalog.shobj_description(oid, 'pg_authid') as rolcomment, "
+                                                 "rolname = current_user AS is_current_user "
                                                   "FROM pg_authid "
                                                   "ORDER BY 2");
         else if (server_version >= 80100)
@@ -678,7 +678,7 @@ dumpRoles(PGconn *conn)
                                                   "rolcanlogin, rolconnlimit, rolpassword, "
                                                   "rolvaliduntil, false as rolreplication, "
                                                   "null as rolcomment, "
-                                                 "rolname = current_user AS is_current_user "
+                                                 "rolname = current_user AS is_current_user "
                                                   "FROM pg_authid "
                                                   "ORDER BY 2");
         else
@@ -694,7 +694,7 @@ dumpRoles(PGconn *conn)
                                                   "valuntil as rolvaliduntil, "
                                                   "false as rolreplication, "
                                                   "null as rolcomment, "
-                                                 "rolname = current_user AS is_current_user "
+                                                 "rolname = current_user AS is_current_user "
                                                   "FROM pg_shadow "
                                                   "UNION ALL "
                                                   "SELECT 0, groname as rolname, "
@@ -755,7 +755,7 @@ dumpRoles(PGconn *conn)
                  * will acquire the right properties even if it already exists (ie, it
                  * won't hurt for the CREATE to fail).  This is particularly important
                  * for the role we are connected as, since even with --clean we will
-                * have failed to drop it.  binary_upgrade cannot generate any errors,
+                * have failed to drop it.      binary_upgrade cannot generate any errors,
                  * so we assume the current role is already created.
                  */
                 if (!binary_upgrade ||
@@ -1857,8 +1857,8 @@ connectDatabase(const char *dbname, const char *connection_string,
         }
  
         /*
-        * Ok, connected successfully. Remember the options used, in the form of
-        * a connection string.
+        * Ok, connected successfully. Remember the options used, in the form of a
+        * connection string.
          */
         connstr = constructConnStr(keywords, values);
  
@@ -2039,7 +2039,7 @@ static void
  doConnStrQuoting(PQExpBuffer buf, const char *str)
  {
         const char *s;
-       bool needquotes;
+       bool            needquotes;
  
         /*
          * If the string consists entirely of plain ASCII characters, no need to
diff --git a/src/bin/pg_dump/pg_restore.c b/src/bin/pg_dump/pg_restore.c

index 5dbe98f7145766b1a5af7e3905e49172c045bea5..0cc17fd416e56138c709751f715e3f5463dbf3b9 100644 (file)
--- a/src/bin/pg_dump/pg_restore.c
+++ b/src/bin/pg_dump/pg_restore.c
@@ -71,6 +71,7 @@ main(int argc, char **argv)
         RestoreOptions *opts;
         int                     c;
         int                     exit_code;
+       int                     numWorkers = 1;
         Archive    *AH;
         char       *inputFileSpec;
         static int      disable_triggers = 0;
@@ -182,7 +183,7 @@ main(int argc, char **argv)
                                 break;
  
                         case 'j':                       /* number of restore jobs */
-                               opts->number_of_jobs = atoi(optarg);
+                               numWorkers = atoi(optarg);
                                 break;
  
                         case 'l':                       /* Dump the TOC summary */
@@ -313,7 +314,7 @@ main(int argc, char **argv)
         }
  
         /* Can't do single-txn mode with multiple connections */
-       if (opts->single_txn && opts->number_of_jobs > 1)
+       if (opts->single_txn && numWorkers > 1)
         {
                 fprintf(stderr, _("%s: cannot specify both --single-transaction and multiple jobs\n"),
                                 progname);
@@ -372,6 +373,18 @@ main(int argc, char **argv)
         if (opts->tocFile)
                 SortTocFromFile(AH, opts);
  
+       /* See comments in pg_dump.c */
+#ifdef WIN32
+       if (numWorkers > MAXIMUM_WAIT_OBJECTS)
+       {
+               fprintf(stderr, _("%s: maximum number of parallel jobs is %d\n"),
+                               progname, MAXIMUM_WAIT_OBJECTS);
+               exit(1);
+       }
+#endif
+
+       AH->numWorkers = numWorkers;
+
         if (opts->tocSummary)
                 PrintTOCSummary(AH, opts);
         else
diff --git a/src/tools/msvc/Mkvcbuild.pm b/src/tools/msvc/Mkvcbuild.pm

index 5bf0e5949b7192e20fadcb522a99bcb10a58c8bb..a4bd2b62107e7597f2cbfb210d4755fb973b268e 100644 (file)
--- a/src/tools/msvc/Mkvcbuild.pm
+++ b/src/tools/msvc/Mkvcbuild.pm
@@ -395,6 +395,7 @@ sub mkvcbuild
         $psql->AddIncludeDir('src\bin\pg_dump');
         $psql->AddIncludeDir('src\backend');
         $psql->AddFile('src\bin\psql\psqlscan.l');
+       $psql->AddLibrary('ws2_32.lib');
  
         my $pgdump = AddSimpleFrontend('pg_dump', 1);
         $pgdump->AddIncludeDir('src\backend');
@@ -403,6 +404,7 @@ sub mkvcbuild
         $pgdump->AddFile('src\bin\pg_dump\pg_dump_sort.c');
         $pgdump->AddFile('src\bin\pg_dump\keywords.c');
         $pgdump->AddFile('src\backend\parser\kwlookup.c');
+       $pgdump->AddLibrary('ws2_32.lib');
  
         my $pgdumpall = AddSimpleFrontend('pg_dump', 1);
  
@@ -419,6 +421,7 @@ sub mkvcbuild
         $pgdumpall->AddFile('src\bin\pg_dump\dumputils.c');
         $pgdumpall->AddFile('src\bin\pg_dump\keywords.c');
         $pgdumpall->AddFile('src\backend\parser\kwlookup.c');
+       $pgdumpall->AddLibrary('ws2_32.lib');
  
         my $pgrestore = AddSimpleFrontend('pg_dump', 1);
         $pgrestore->{name} = 'pg_restore';
@@ -426,6 +429,7 @@ sub mkvcbuild
         $pgrestore->AddFile('src\bin\pg_dump\pg_restore.c');
         $pgrestore->AddFile('src\bin\pg_dump\keywords.c');
         $pgrestore->AddFile('src\backend\parser\kwlookup.c');
+       $pgrestore->AddLibrary('ws2_32.lib');
  
         my $zic = $solution->AddProject('zic', 'exe', 'utils');
         $zic->AddFiles('src\timezone', 'zic.c', 'ialloc.c', 'scheck.c',
@@ -572,6 +576,7 @@ sub mkvcbuild
                 $proj->AddIncludeDir('src\bin\psql');
                 $proj->AddReference($libpq, $libpgport, $libpgcommon);
                 $proj->AddResourceFile('src\bin\scripts', 'PostgreSQL Utility');
+               $proj->AddLibrary('ws2_32.lib');
         }
  
         # Regression DLL and EXE
author	Andrew Dunstan <andrew@dunslane.net>
	Sun, 24 Mar 2013 15:27:20 +0000 (11:27 -0400)
committer	Andrew Dunstan <andrew@dunslane.net>
	Sun, 24 Mar 2013 15:27:20 +0000 (11:27 -0400)
doc/src/sgml/backup.sgml		patch \| blob \| history
doc/src/sgml/perform.sgml		patch \| blob \| history
doc/src/sgml/ref/pg_dump.sgml		patch \| blob \| history
src/bin/pg_dump/Makefile		patch \| blob \| history
src/bin/pg_dump/compress_io.c		patch \| blob \| history
src/bin/pg_dump/dumputils.c		patch \| blob \| history
src/bin/pg_dump/dumputils.h		patch \| blob \| history
src/bin/pg_dump/parallel.c	[new file with mode: 0644]	patch \| blob
src/bin/pg_dump/parallel.h	[new file with mode: 0644]	patch \| blob
src/bin/pg_dump/pg_backup.h		patch \| blob \| history
src/bin/pg_dump/pg_backup_archiver.c		patch \| blob \| history
src/bin/pg_dump/pg_backup_archiver.h		patch \| blob \| history
src/bin/pg_dump/pg_backup_custom.c		patch \| blob \| history
src/bin/pg_dump/pg_backup_db.c		patch \| blob \| history
src/bin/pg_dump/pg_backup_directory.c		patch \| blob \| history
src/bin/pg_dump/pg_backup_tar.c		patch \| blob \| history
src/bin/pg_dump/pg_dump.c		patch \| blob \| history
src/bin/pg_dump/pg_dump.h		patch \| blob \| history
src/bin/pg_dump/pg_dump_sort.c		patch \| blob \| history
src/bin/pg_dump/pg_dumpall.c		patch \| blob \| history
src/bin/pg_dump/pg_restore.c		patch \| blob \| history
src/tools/msvc/Mkvcbuild.pm		patch \| blob \| history