granicus.if.org Git - postgresql/commitdiff
Improve parallel scheduling logic in pg_dump/pg_restore.
author: Tom Lane <tgl@sss.pgh.pa.us>
Fri, 14 Sep 2018 21:31:51 +0000 (17:31 -0400)
committer: Tom Lane <tgl@sss.pgh.pa.us>
Fri, 14 Sep 2018 21:31:51 +0000 (17:31 -0400)
Previously, the way this worked was that a parallel pg_dump would
re-order the TABLE_DATA items in the dump's TOC into decreasing size
order, and separately re-order (some of) the INDEX items into decreasing
size order.  Then pg_dump would dump the items in that order.  Later,
parallel pg_restore just followed the TOC order.  This method had lots
of deficiencies:

* TOC ordering randomly differed between parallel and non-parallel
dumps, and was hard to predict in the former case, causing problems
for building stable pg_dump test cases.

* Parallel restore only followed a well-chosen order if the dump had
been done in parallel; in particular, this never happened for restore
from custom-format dumps.

* The best order for restore isn't necessarily the same as for dump,
and it's not really static either because of locking considerations.

* TABLE_DATA and INDEX items aren't the only things that might take a lot
of work during restore.  Scheduling was particularly stupid for the BLOBS
item, which might require lots of work during dump as well as restore,
but was left to the end in either case.

This patch removes the logic that changed the TOC order, fixing the
test instability problem.  Instead, we sort the parallelizable items
just before processing them during a parallel dump.  Independently
of that, parallel restore prioritizes the ready-to-execute tasks
based on the size of the underlying table.  In the case of dependent
tasks such as index, constraint, or foreign key creation, the largest
relevant table is used as the metric for estimating the task length.
(This is pretty crude, but it should be enough to avoid the case we
want to avoid, which is ending the run with just a few large tasks
such that we can't make use of all N workers.)

Patch by me, responding to a complaint from Peter Eisentraut,
who also reviewed the patch.

Discussion: https://postgr.es/m/5137fe12-d0a2-4971-61b6-eb4e7e8875f8@2ndquadrant.com

src/bin/pg_dump/pg_backup.h
src/bin/pg_dump/pg_backup_archiver.c
src/bin/pg_dump/pg_backup_archiver.h
src/bin/pg_dump/pg_backup_custom.c
src/bin/pg_dump/pg_backup_directory.c
src/bin/pg_dump/pg_dump.c
src/bin/pg_dump/pg_dump.h
src/bin/pg_dump/pg_dump_sort.c

index 42cf441aaf67344f167d458fcd0a45fe75e77f81..ba798213be99c369dfaa7e3f197920ea8f4e9480 100644 (file)
@@ -252,18 +252,6 @@ extern void ConnectDatabase(Archive *AH,
 extern void DisconnectDatabase(Archive *AHX);
 extern PGconn *GetConnection(Archive *AHX);
 
-/* Called to add a TOC entry */
-extern void ArchiveEntry(Archive *AHX,
-                        CatalogId catalogId, DumpId dumpId,
-                        const char *tag,
-                        const char *namespace, const char *tablespace,
-                        const char *owner, bool withOids,
-                        const char *desc, teSection section,
-                        const char *defn,
-                        const char *dropStmt, const char *copyStmt,
-                        const DumpId *deps, int nDeps,
-                        DataDumperPtr dumpFn, void *dumpArg);
-
 /* Called to write *data* to the archive */
 extern void WriteData(Archive *AH, const void *data, size_t dLen);
 
index 36e3383b851a5d164566551923e37573f8e625b6..3f7a658bcec0d7e96ec3d941ec57eeda317f7e2e 100644 (file)
@@ -49,6 +49,24 @@ typedef struct _outputContext
        int                     gzOut;
 } OutputContext;
 
+/*
+ * State for tracking TocEntrys that are ready to process during a parallel
+ * restore.  (This used to be a list, and we still call it that, though now
+ * it's really an array so that we can apply qsort to it.)
+ *
+ * tes[] is sized large enough that we can't overrun it.
+ * The valid entries are indexed first_te .. last_te inclusive.
+ * We periodically sort the array to bring larger-by-dataLength entries to
+ * the front; "sorted" is true if the valid entries are known sorted.
+ */
+typedef struct _parallelReadyList
+{
+       TocEntry  **tes;                        /* Ready-to-dump TocEntrys */
+       int                     first_te;               /* index of first valid entry in tes[] */
+       int                     last_te;                /* index of last valid entry in tes[] */
+       bool            sorted;                 /* are valid entries currently sorted? */
+} ParallelReadyList;
+
 /* translator: this is a module name */
 static const char *modulename = gettext_noop("archiver");
 
@@ -95,13 +113,20 @@ static void restore_toc_entries_parallel(ArchiveHandle *AH,
                                                         TocEntry *pending_list);
 static void restore_toc_entries_postfork(ArchiveHandle *AH,
                                                         TocEntry *pending_list);
-static void par_list_header_init(TocEntry *l);
-static void par_list_append(TocEntry *l, TocEntry *te);
-static void par_list_remove(TocEntry *te);
-static void move_to_ready_list(TocEntry *pending_list, TocEntry *ready_list,
+static void pending_list_header_init(TocEntry *l);
+static void pending_list_append(TocEntry *l, TocEntry *te);
+static void pending_list_remove(TocEntry *te);
+static void ready_list_init(ParallelReadyList *ready_list, int tocCount);
+static void ready_list_free(ParallelReadyList *ready_list);
+static void ready_list_insert(ParallelReadyList *ready_list, TocEntry *te);
+static void ready_list_remove(ParallelReadyList *ready_list, int i);
+static void ready_list_sort(ParallelReadyList *ready_list);
+static int     TocEntrySizeCompare(const void *p1, const void *p2);
+static void move_to_ready_list(TocEntry *pending_list,
+                                  ParallelReadyList *ready_list,
                                   RestorePass pass);
-static TocEntry *get_next_work_item(ArchiveHandle *AH,
-                                  TocEntry *ready_list,
+static TocEntry *pop_next_work_item(ArchiveHandle *AH,
+                                  ParallelReadyList *ready_list,
                                   ParallelState *pstate);
 static void mark_dump_job_done(ArchiveHandle *AH,
                                   TocEntry *te,
@@ -116,7 +141,7 @@ static bool has_lock_conflicts(TocEntry *te1, TocEntry *te2);
 static void repoint_table_dependencies(ArchiveHandle *AH);
 static void identify_locking_dependencies(ArchiveHandle *AH, TocEntry *te);
 static void reduce_dependencies(ArchiveHandle *AH, TocEntry *te,
-                                       TocEntry *ready_list);
+                                       ParallelReadyList *ready_list);
 static void mark_create_done(ArchiveHandle *AH, TocEntry *te);
 static void inhibit_data_for_failed_table(ArchiveHandle *AH, TocEntry *te);
 
@@ -639,7 +664,11 @@ RestoreArchive(Archive *AHX)
                ParallelState *pstate;
                TocEntry        pending_list;
 
-               par_list_header_init(&pending_list);
+               /* The archive format module may need some setup for this */
+               if (AH->PrepParallelRestorePtr)
+                       AH->PrepParallelRestorePtr(AH);
+
+               pending_list_header_init(&pending_list);
 
                /* This runs PRE_DATA items and then disconnects from the database */
                restore_toc_entries_prefork(AH, &pending_list);
@@ -1039,10 +1068,14 @@ WriteData(Archive *AHX, const void *data, size_t dLen)
 /*
  * Create a new TOC entry. The TOC was designed as a TOC, but is now the
  * repository for all metadata. But the name has stuck.
+ *
+ * The new entry is added to the Archive's TOC list.  Most callers can ignore
+ * the result value because nothing else need be done, but a few want to
+ * manipulate the TOC entry further.
  */
 
 /* Public */
-void
+TocEntry *
 ArchiveEntry(Archive *AHX,
                         CatalogId catalogId, DumpId dumpId,
                         const char *tag,
@@ -1100,9 +1133,12 @@ ArchiveEntry(Archive *AHX,
        newToc->hadDumper = dumpFn ? true : false;
 
        newToc->formatData = NULL;
+       newToc->dataLength = 0;
 
        if (AH->ArchiveEntryPtr != NULL)
                AH->ArchiveEntryPtr(AH, newToc);
+
+       return newToc;
 }
 
 /* Public */
@@ -2413,32 +2449,59 @@ WriteDataChunks(ArchiveHandle *AH, ParallelState *pstate)
 {
        TocEntry   *te;
 
-       for (te = AH->toc->next; te != AH->toc; te = te->next)
+       if (pstate && pstate->numWorkers > 1)
        {
-               if (!te->dataDumper)
-                       continue;
-
-               if ((te->reqs & REQ_DATA) == 0)
-                       continue;
+               /*
+                * In parallel mode, this code runs in the master process.  We
+                * construct an array of candidate TEs, then sort it into decreasing
+                * size order, then dispatch each TE to a data-transfer worker.  By
+                * dumping larger tables first, we avoid getting into a situation
+                * where we're down to one job and it's big, losing parallelism.
+                */
+               TocEntry  **tes;
+               int                     ntes;
 
-               if (pstate && pstate->numWorkers > 1)
+               tes = (TocEntry **) pg_malloc(AH->tocCount * sizeof(TocEntry *));
+               ntes = 0;
+               for (te = AH->toc->next; te != AH->toc; te = te->next)
                {
-                       /*
-                        * If we are in a parallel backup, then we are always the master
-                        * process.  Dispatch each data-transfer job to a worker.
-                        */
-                       DispatchJobForTocEntry(AH, pstate, te, ACT_DUMP,
-                                                                  mark_dump_job_done, NULL);
+                       /* Consider only TEs with dataDumper functions ... */
+                       if (!te->dataDumper)
+                               continue;
+                       /* ... and ignore ones not enabled for dump */
+                       if ((te->reqs & REQ_DATA) == 0)
+                               continue;
+
+                       tes[ntes++] = te;
                }
-               else
-                       WriteDataChunksForTocEntry(AH, te);
-       }
 
-       /*
-        * If parallel, wait for workers to finish.
-        */
-       if (pstate && pstate->numWorkers > 1)
+               if (ntes > 1)
+                       qsort((void *) tes, ntes, sizeof(TocEntry *),
+                                 TocEntrySizeCompare);
+
+               for (int i = 0; i < ntes; i++)
+                       DispatchJobForTocEntry(AH, pstate, tes[i], ACT_DUMP,
+                                                                  mark_dump_job_done, NULL);
+
+               pg_free(tes);
+
+               /* Now wait for workers to finish. */
                WaitForWorkers(AH, pstate, WFW_ALL_IDLE);
+       }
+       else
+       {
+               /* Non-parallel mode: just dump all candidate TEs sequentially. */
+               for (te = AH->toc->next; te != AH->toc; te = te->next)
+               {
+                       /* Must have same filter conditions as above */
+                       if (!te->dataDumper)
+                               continue;
+                       if ((te->reqs & REQ_DATA) == 0)
+                               continue;
+
+                       WriteDataChunksForTocEntry(AH, te);
+               }
+       }
 }
 
 
@@ -2690,6 +2753,7 @@ ReadToc(ArchiveHandle *AH)
                        te->dependencies = NULL;
                        te->nDeps = 0;
                }
+               te->dataLength = 0;
 
                if (AH->ReadExtraTocPtr)
                        AH->ReadExtraTocPtr(AH, te);
@@ -3996,7 +4060,7 @@ restore_toc_entries_prefork(ArchiveHandle *AH, TocEntry *pending_list)
                else
                {
                        /* Nope, so add it to pending_list */
-                       par_list_append(pending_list, next_work_item);
+                       pending_list_append(pending_list, next_work_item);
                }
        }
 
@@ -4035,11 +4099,14 @@ static void
 restore_toc_entries_parallel(ArchiveHandle *AH, ParallelState *pstate,
                                                         TocEntry *pending_list)
 {
-       TocEntry        ready_list;
+       ParallelReadyList ready_list;
        TocEntry   *next_work_item;
 
        ahlog(AH, 2, "entering restore_toc_entries_parallel\n");
 
+       /* Set up ready_list with enough room for all known TocEntrys */
+       ready_list_init(&ready_list, AH->tocCount);
+
        /*
         * The pending_list contains all items that we need to restore.  Move all
         * items that are available to process immediately into the ready_list.
@@ -4048,7 +4115,6 @@ restore_toc_entries_parallel(ArchiveHandle *AH, ParallelState *pstate,
         * contains items that have no remaining dependencies and are OK to
         * process in the current restore pass.
         */
-       par_list_header_init(&ready_list);
        AH->restorePass = RESTORE_PASS_MAIN;
        move_to_ready_list(pending_list, &ready_list, AH->restorePass);
 
@@ -4064,7 +4130,7 @@ restore_toc_entries_parallel(ArchiveHandle *AH, ParallelState *pstate,
        for (;;)
        {
                /* Look for an item ready to be dispatched to a worker */
-               next_work_item = get_next_work_item(AH, &ready_list, pstate);
+               next_work_item = pop_next_work_item(AH, &ready_list, pstate);
                if (next_work_item != NULL)
                {
                        /* If not to be restored, don't waste time launching a worker */
@@ -4073,8 +4139,7 @@ restore_toc_entries_parallel(ArchiveHandle *AH, ParallelState *pstate,
                                ahlog(AH, 1, "skipping item %d %s %s\n",
                                          next_work_item->dumpId,
                                          next_work_item->desc, next_work_item->tag);
-                               /* Drop it from ready_list, and update its dependencies */
-                               par_list_remove(next_work_item);
+                               /* Update its dependencies as though we'd completed it */
                                reduce_dependencies(AH, next_work_item, &ready_list);
                                /* Loop around to see if anything else can be dispatched */
                                continue;
@@ -4084,9 +4149,7 @@ restore_toc_entries_parallel(ArchiveHandle *AH, ParallelState *pstate,
                                  next_work_item->dumpId,
                                  next_work_item->desc, next_work_item->tag);
 
-                       /* Remove it from ready_list, and dispatch to some worker */
-                       par_list_remove(next_work_item);
-
+                       /* Dispatch to some worker */
                        DispatchJobForTocEntry(AH, pstate, next_work_item, ACT_RESTORE,
                                                                   mark_restore_job_done, &ready_list);
                }
@@ -4132,7 +4195,9 @@ restore_toc_entries_parallel(ArchiveHandle *AH, ParallelState *pstate,
        }
 
        /* There should now be nothing in ready_list. */
-       Assert(ready_list.par_next == &ready_list);
+       Assert(ready_list.first_te > ready_list.last_te);
+
+       ready_list_free(&ready_list);
 
        ahlog(AH, 1, "finished main parallel loop\n");
 }
@@ -4170,7 +4235,7 @@ restore_toc_entries_postfork(ArchiveHandle *AH, TocEntry *pending_list)
         * connection.  We don't sweat about RestorePass ordering; it's likely we
         * already violated that.
         */
-       for (te = pending_list->par_next; te != pending_list; te = te->par_next)
+       for (te = pending_list->pending_next; te != pending_list; te = te->pending_next)
        {
                ahlog(AH, 1, "processing missed item %d %s %s\n",
                          te->dumpId, te->desc, te->tag);
@@ -4201,36 +4266,130 @@ has_lock_conflicts(TocEntry *te1, TocEntry *te2)
 
 
 /*
- * Initialize the header of a parallel-processing list.
+ * Initialize the header of the pending-items list.
  *
- * These are circular lists with a dummy TocEntry as header, just like the
+ * This is a circular list with a dummy TocEntry as header, just like the
  * main TOC list; but we use separate list links so that an entry can be in
- * the main TOC list as well as in a parallel-processing list.
+ * the main TOC list as well as in the pending list.
+ */
+static void
+pending_list_header_init(TocEntry *l)
+{
+       l->pending_prev = l->pending_next = l;
+}
+
+/* Append te to the end of the pending-list headed by l */
+static void
+pending_list_append(TocEntry *l, TocEntry *te)
+{
+       te->pending_prev = l->pending_prev;
+       l->pending_prev->pending_next = te;
+       l->pending_prev = te;
+       te->pending_next = l;
+}
+
+/* Remove te from the pending-list */
+static void
+pending_list_remove(TocEntry *te)
+{
+       te->pending_prev->pending_next = te->pending_next;
+       te->pending_next->pending_prev = te->pending_prev;
+       te->pending_prev = NULL;
+       te->pending_next = NULL;
+}
+
+
+/*
+ * Initialize the ready_list with enough room for up to tocCount entries.
  */
 static void
-par_list_header_init(TocEntry *l)
+ready_list_init(ParallelReadyList *ready_list, int tocCount)
 {
-       l->par_prev = l->par_next = l;
+       ready_list->tes = (TocEntry **)
+               pg_malloc(tocCount * sizeof(TocEntry *));
+       ready_list->first_te = 0;
+       ready_list->last_te = -1;
+       ready_list->sorted = false;
 }
 
-/* Append te to the end of the parallel-processing list headed by l */
+/*
+ * Free storage for a ready_list.
+ */
+static void
+ready_list_free(ParallelReadyList *ready_list)
+{
+       pg_free(ready_list->tes);
+}
+
+/* Add te to the ready_list */
 static void
-par_list_append(TocEntry *l, TocEntry *te)
+ready_list_insert(ParallelReadyList *ready_list, TocEntry *te)
 {
-       te->par_prev = l->par_prev;
-       l->par_prev->par_next = te;
-       l->par_prev = te;
-       te->par_next = l;
+       ready_list->tes[++ready_list->last_te] = te;
+       /* List is (probably) not sorted anymore. */
+       ready_list->sorted = false;
+}
+
+/* Remove the i'th entry in the ready_list */
+static void
+ready_list_remove(ParallelReadyList *ready_list, int i)
+{
+       int                     f = ready_list->first_te;
+
+       Assert(i >= f && i <= ready_list->last_te);
+
+       /*
+        * In the typical case where the item to be removed is the first ready
+        * entry, we need only increment first_te to remove it.  Otherwise, move
+        * the entries before it to compact the list.  (This preserves sortedness,
+        * if any.)  We could alternatively move the entries after i, but there
+        * are typically many more of those.
+        */
+       if (i > f)
+       {
+               TocEntry  **first_te_ptr = &ready_list->tes[f];
+
+               memmove(first_te_ptr + 1, first_te_ptr, (i - f) * sizeof(TocEntry *));
+       }
+       ready_list->first_te++;
 }
 
-/* Remove te from whatever parallel-processing list it's in */
+/* Sort the ready_list into the desired order */
 static void
-par_list_remove(TocEntry *te)
+ready_list_sort(ParallelReadyList *ready_list)
 {
-       te->par_prev->par_next = te->par_next;
-       te->par_next->par_prev = te->par_prev;
-       te->par_prev = NULL;
-       te->par_next = NULL;
+       if (!ready_list->sorted)
+       {
+               int                     n = ready_list->last_te - ready_list->first_te + 1;
+
+               if (n > 1)
+                       qsort(ready_list->tes + ready_list->first_te, n,
+                                 sizeof(TocEntry *),
+                                 TocEntrySizeCompare);
+               ready_list->sorted = true;
+       }
+}
+
+/* qsort comparator for sorting TocEntries by dataLength */
+static int
+TocEntrySizeCompare(const void *p1, const void *p2)
+{
+       const TocEntry *te1 = *(const TocEntry *const *) p1;
+       const TocEntry *te2 = *(const TocEntry *const *) p2;
+
+       /* Sort by decreasing dataLength */
+       if (te1->dataLength > te2->dataLength)
+               return -1;
+       if (te1->dataLength < te2->dataLength)
+               return 1;
+
+       /* For equal dataLengths, sort by dumpId, just to be stable */
+       if (te1->dumpId < te2->dumpId)
+               return -1;
+       if (te1->dumpId > te2->dumpId)
+               return 1;
+
+       return 0;
 }
 
 
@@ -4242,52 +4401,50 @@ par_list_remove(TocEntry *te)
  * which applies the same logic one-at-a-time.)
  */
 static void
-move_to_ready_list(TocEntry *pending_list, TocEntry *ready_list,
+move_to_ready_list(TocEntry *pending_list,
+                                  ParallelReadyList *ready_list,
                                   RestorePass pass)
 {
        TocEntry   *te;
        TocEntry   *next_te;
 
-       for (te = pending_list->par_next; te != pending_list; te = next_te)
+       for (te = pending_list->pending_next; te != pending_list; te = next_te)
        {
-               /* must save list link before possibly moving te to other list */
-               next_te = te->par_next;
+               /* must save list link before possibly removing te from list */
+               next_te = te->pending_next;
 
                if (te->depCount == 0 &&
                        _tocEntryRestorePass(te) == pass)
                {
                        /* Remove it from pending_list ... */
-                       par_list_remove(te);
+                       pending_list_remove(te);
                        /* ... and add to ready_list */
-                       par_list_append(ready_list, te);
+                       ready_list_insert(ready_list, te);
                }
        }
 }
 
 /*
- * Find the next work item (if any) that is capable of being run now.
+ * Find the next work item (if any) that is capable of being run now,
+ * and remove it from the ready_list.
+ *
+ * Returns the item, or NULL if nothing is runnable.
  *
  * To qualify, the item must have no remaining dependencies
  * and no requirements for locks that are incompatible with
  * items currently running.  Items in the ready_list are known to have
  * no remaining dependencies, but we have to check for lock conflicts.
  *
- * Note that the returned item has *not* been removed from ready_list.
- * The caller must do that after successfully dispatching the item.
- *
  * pref_non_data is for an alternative selection algorithm that gives
  * preference to non-data items if there is already a data load running.
  * It is currently disabled.
  */
 static TocEntry *
-get_next_work_item(ArchiveHandle *AH, TocEntry *ready_list,
+pop_next_work_item(ArchiveHandle *AH, ParallelReadyList *ready_list,
                                   ParallelState *pstate)
 {
        bool            pref_non_data = false;  /* or get from AH->ropt */
-       TocEntry   *data_te = NULL;
-       TocEntry   *te;
-       int                     i,
-                               k;
+       int                     data_te_index = -1;
 
        /*
         * Bogus heuristics for pref_non_data
@@ -4296,7 +4453,7 @@ get_next_work_item(ArchiveHandle *AH, TocEntry *ready_list,
        {
                int                     count = 0;
 
-               for (k = 0; k < pstate->numWorkers; k++)
+               for (int k = 0; k < pstate->numWorkers; k++)
                {
                        TocEntry   *running_te = pstate->te[k];
 
@@ -4308,11 +4465,17 @@ get_next_work_item(ArchiveHandle *AH, TocEntry *ready_list,
                        pref_non_data = false;
        }
 
+       /*
+        * Sort the ready_list so that we'll tackle larger jobs first.
+        */
+       ready_list_sort(ready_list);
+
        /*
         * Search the ready_list until we find a suitable item.
         */
-       for (te = ready_list->par_next; te != ready_list; te = te->par_next)
+       for (int i = ready_list->first_te; i <= ready_list->last_te; i++)
        {
+               TocEntry   *te = ready_list->tes[i];
                bool            conflicts = false;
 
                /*
@@ -4320,9 +4483,9 @@ get_next_work_item(ArchiveHandle *AH, TocEntry *ready_list,
                 * that a currently running item also needs lock on, or vice versa. If
                 * so, we don't want to schedule them together.
                 */
-               for (i = 0; i < pstate->numWorkers; i++)
+               for (int k = 0; k < pstate->numWorkers; k++)
                {
-                       TocEntry   *running_te = pstate->te[i];
+                       TocEntry   *running_te = pstate->te[k];
 
                        if (running_te == NULL)
                                continue;
@@ -4339,17 +4502,23 @@ get_next_work_item(ArchiveHandle *AH, TocEntry *ready_list,
 
                if (pref_non_data && te->section == SECTION_DATA)
                {
-                       if (data_te == NULL)
-                               data_te = te;
+                       if (data_te_index < 0)
+                               data_te_index = i;
                        continue;
                }
 
                /* passed all tests, so this item can run */
+               ready_list_remove(ready_list, i);
                return te;
        }
 
-       if (data_te != NULL)
+       if (data_te_index >= 0)
+       {
+               TocEntry   *data_te = ready_list->tes[data_te_index];
+
+               ready_list_remove(ready_list, data_te_index);
                return data_te;
+       }
 
        ahlog(AH, 2, "no item ready\n");
        return NULL;
@@ -4393,7 +4562,7 @@ mark_restore_job_done(ArchiveHandle *AH,
                                          int status,
                                          void *callback_data)
 {
-       TocEntry   *ready_list = (TocEntry *) callback_data;
+       ParallelReadyList *ready_list = (ParallelReadyList *) callback_data;
 
        ahlog(AH, 1, "finished item %d %s %s\n",
                  te->dumpId, te->desc, te->tag);
@@ -4443,8 +4612,8 @@ fix_dependencies(ArchiveHandle *AH)
                te->depCount = te->nDeps;
                te->revDeps = NULL;
                te->nRevDeps = 0;
-               te->par_prev = NULL;
-               te->par_next = NULL;
+               te->pending_prev = NULL;
+               te->pending_next = NULL;
        }
 
        /*
@@ -4551,6 +4720,12 @@ fix_dependencies(ArchiveHandle *AH)
 /*
  * Change dependencies on table items to depend on table data items instead,
  * but only in POST_DATA items.
+ *
+ * Also, for any item having such dependency(s), set its dataLength to the
+ * largest dataLength of the table data items it depends on.  This ensures
+ * that parallel restore will prioritize larger jobs (index builds, FK
+ * constraint checks, etc) over smaller ones, avoiding situations where we
+ * end a restore with only one active job working on a large table.
  */
 static void
 repoint_table_dependencies(ArchiveHandle *AH)
@@ -4569,9 +4744,13 @@ repoint_table_dependencies(ArchiveHandle *AH)
                        if (olddep <= AH->maxDumpId &&
                                AH->tableDataId[olddep] != 0)
                        {
-                               te->dependencies[i] = AH->tableDataId[olddep];
+                               DumpId          tabledataid = AH->tableDataId[olddep];
+                               TocEntry   *tabledatate = AH->tocsByDumpId[tabledataid];
+
+                               te->dependencies[i] = tabledataid;
+                               te->dataLength = Max(te->dataLength, tabledatate->dataLength);
                                ahlog(AH, 2, "transferring dependency %d -> %d to %d\n",
-                                         te->dumpId, olddep, AH->tableDataId[olddep]);
+                                         te->dumpId, olddep, tabledataid);
                        }
                }
        }
@@ -4647,7 +4826,8 @@ identify_locking_dependencies(ArchiveHandle *AH, TocEntry *te)
  * becomes ready should be moved to the ready_list, if that's provided.
  */
 static void
-reduce_dependencies(ArchiveHandle *AH, TocEntry *te, TocEntry *ready_list)
+reduce_dependencies(ArchiveHandle *AH, TocEntry *te,
+                                       ParallelReadyList *ready_list)
 {
        int                     i;
 
@@ -4670,13 +4850,13 @@ reduce_dependencies(ArchiveHandle *AH, TocEntry *te, TocEntry *ready_list)
                 */
                if (otherte->depCount == 0 &&
                        _tocEntryRestorePass(otherte) == AH->restorePass &&
-                       otherte->par_prev != NULL &&
+                       otherte->pending_prev != NULL &&
                        ready_list != NULL)
                {
                        /* Remove it from pending list ... */
-                       par_list_remove(otherte);
+                       pending_list_remove(otherte);
                        /* ... and add to ready_list */
-                       par_list_append(ready_list, otherte);
+                       ready_list_insert(ready_list, otherte);
                }
        }
 }
index 8dd19159989ccf5f4fbae700660ce88239e6ef5f..26dd0442e8bb840d1c3aa7919a0cee4443fc8695 100644 (file)
@@ -162,12 +162,12 @@ typedef int (*WriteBytePtrType) (ArchiveHandle *AH, const int i);
 typedef int (*ReadBytePtrType) (ArchiveHandle *AH);
 typedef void (*WriteBufPtrType) (ArchiveHandle *AH, const void *c, size_t len);
 typedef void (*ReadBufPtrType) (ArchiveHandle *AH, void *buf, size_t len);
-typedef void (*SaveArchivePtrType) (ArchiveHandle *AH);
 typedef void (*WriteExtraTocPtrType) (ArchiveHandle *AH, TocEntry *te);
 typedef void (*ReadExtraTocPtrType) (ArchiveHandle *AH, TocEntry *te);
 typedef void (*PrintExtraTocPtrType) (ArchiveHandle *AH, TocEntry *te);
 typedef void (*PrintTocDataPtrType) (ArchiveHandle *AH, TocEntry *te);
 
+typedef void (*PrepParallelRestorePtrType) (ArchiveHandle *AH);
 typedef void (*ClonePtrType) (ArchiveHandle *AH);
 typedef void (*DeClonePtrType) (ArchiveHandle *AH);
 
@@ -297,6 +297,7 @@ struct _archiveHandle
        WorkerJobDumpPtrType WorkerJobDumpPtr;
        WorkerJobRestorePtrType WorkerJobRestorePtr;
 
+       PrepParallelRestorePtrType PrepParallelRestorePtr;
        ClonePtrType ClonePtr;          /* Clone format-specific fields */
        DeClonePtrType DeClonePtr;      /* Clean up cloned fields */
 
@@ -387,12 +388,13 @@ struct _tocEntry
        void       *formatData;         /* TOC Entry data specific to file format */
 
        /* working state while dumping/restoring */
+       pgoff_t         dataLength;             /* item's data size; 0 if none or unknown */
        teReqs          reqs;                   /* do we need schema and/or data of object */
        bool            created;                /* set for DATA member if TABLE was created */
 
        /* working state (needed only for parallel restore) */
-       struct _tocEntry *par_prev; /* list links for pending/ready items; */
-       struct _tocEntry *par_next; /* these are NULL if not in either list */
+       struct _tocEntry *pending_prev; /* list links for pending-items list; */
+       struct _tocEntry *pending_next; /* NULL if not in that list */
        int                     depCount;               /* number of dependencies not yet restored */
        DumpId     *revDeps;            /* dumpIds of objects depending on this one */
        int                     nRevDeps;               /* number of such dependencies */
@@ -405,6 +407,18 @@ extern void on_exit_close_archive(Archive *AHX);
 
 extern void warn_or_exit_horribly(ArchiveHandle *AH, const char *modulename, const char *fmt,...) pg_attribute_printf(3, 4);
 
+/* Called to add a TOC entry */
+extern TocEntry *ArchiveEntry(Archive *AHX,
+                        CatalogId catalogId, DumpId dumpId,
+                        const char *tag,
+                        const char *namespace, const char *tablespace,
+                        const char *owner, bool withOids,
+                        const char *desc, teSection section,
+                        const char *defn,
+                        const char *dropStmt, const char *copyStmt,
+                        const DumpId *deps, int nDeps,
+                        DataDumperPtr dumpFn, void *dumpArg);
+
 extern void WriteTOC(ArchiveHandle *AH);
 extern void ReadTOC(ArchiveHandle *AH);
 extern void WriteHead(ArchiveHandle *AH);
index ad18a6c684b91dc26f8f110ef44bf73ec53d95b8..96f44e88b119e7385ecdd541529bf6a6904535b7 100644 (file)
@@ -59,6 +59,8 @@ static void _StartBlob(ArchiveHandle *AH, TocEntry *te, Oid oid);
 static void _EndBlob(ArchiveHandle *AH, TocEntry *te, Oid oid);
 static void _EndBlobs(ArchiveHandle *AH, TocEntry *te);
 static void _LoadBlobs(ArchiveHandle *AH, bool drop);
+
+static void _PrepParallelRestore(ArchiveHandle *AH);
 static void _Clone(ArchiveHandle *AH);
 static void _DeClone(ArchiveHandle *AH);
 
@@ -129,6 +131,8 @@ InitArchiveFmt_Custom(ArchiveHandle *AH)
        AH->StartBlobPtr = _StartBlob;
        AH->EndBlobPtr = _EndBlob;
        AH->EndBlobsPtr = _EndBlobs;
+
+       AH->PrepParallelRestorePtr = _PrepParallelRestore;
        AH->ClonePtr = _Clone;
        AH->DeClonePtr = _DeClone;
 
@@ -775,6 +779,66 @@ _ReopenArchive(ArchiveHandle *AH)
                                          strerror(errno));
 }
 
+/*
+ * Prepare for parallel restore.
+ *
+ * The main thing that needs to happen here is to fill in TABLE DATA and BLOBS
+ * TOC entries' dataLength fields with appropriate values to guide the
+ * ordering of restore jobs.  The source of said data is format-dependent,
+ * as is the exact meaning of the values; here, it is file-offset deltas.
+ *
+ * A format module might also choose to do other setup here.
+ */
+static void
+_PrepParallelRestore(ArchiveHandle *AH)
+{
+       lclContext *ctx = (lclContext *) AH->formatData;
+       TocEntry   *prev_te = NULL;
+       lclTocEntry *prev_tctx = NULL;
+       TocEntry   *te;
+
+       /*
+        * Knowing that the data items were dumped out in TOC order, we can
+        * reconstruct the length of each item as the delta to the start offset of
+        * the next data item.
+        */
+       for (te = AH->toc->next; te != AH->toc; te = te->next)
+       {
+               lclTocEntry *tctx = (lclTocEntry *) te->formatData;
+
+               /*
+                * Ignore entries without a known data offset; if we were unable to
+                * seek to rewrite the TOC when creating the archive, this'll be all
+                * of them, and we'll end up with no size estimates.
+                */
+               if (tctx->dataState != K_OFFSET_POS_SET)
+                       continue;
+
+               /* Previous item's length is the offset delta, if offsets increase */
+               if (prev_te)
+               {
+                       if (tctx->dataPos > prev_tctx->dataPos)
+                               prev_te->dataLength = tctx->dataPos - prev_tctx->dataPos;
+               }
+
+               prev_te = te;
+               prev_tctx = tctx;
+       }
+
+       /* If OK to seek, end-of-file position gives the last item's length */
+       if (prev_te && ctx->hasSeek)
+       {
+               pgoff_t         endpos;
+
+               if (fseeko(AH->FH, 0, SEEK_END) != 0)
+                       exit_horribly(modulename, "error during file seek: %s\n",
+                                                 strerror(errno));
+               endpos = ftello(AH->FH);
+               if (endpos > prev_tctx->dataPos)
+                       prev_te->dataLength = endpos - prev_tctx->dataPos;
+       }
+}
+
 /*
  * Clone format-specific fields during parallel restoration.
  */
index 4aabb40f5958ad0702c117ce47f3bfeda1bafd1b..cda90b9a2ad7c3bae52f22c3190d3185bda03d95 100644 (file)
@@ -87,6 +87,7 @@ static void _EndBlob(ArchiveHandle *AH, TocEntry *te, Oid oid);
 static void _EndBlobs(ArchiveHandle *AH, TocEntry *te);
 static void _LoadBlobs(ArchiveHandle *AH);
 
+static void _PrepParallelRestore(ArchiveHandle *AH);
 static void _Clone(ArchiveHandle *AH);
 static void _DeClone(ArchiveHandle *AH);
 
@@ -132,6 +133,7 @@ InitArchiveFmt_Directory(ArchiveHandle *AH)
        AH->EndBlobPtr = _EndBlob;
        AH->EndBlobsPtr = _EndBlobs;
 
+       AH->PrepParallelRestorePtr = _PrepParallelRestore;
        AH->ClonePtr = _Clone;
        AH->DeClonePtr = _DeClone;
 
@@ -240,13 +242,13 @@ _ArchiveEntry(ArchiveHandle *AH, TocEntry *te)
        char            fn[MAXPGPATH];
 
        tctx = (lclTocEntry *) pg_malloc0(sizeof(lclTocEntry));
-       if (te->dataDumper)
+       if (strcmp(te->desc, "BLOBS") == 0)
+               tctx->filename = pg_strdup("blobs.toc");
+       else if (te->dataDumper)
        {
                snprintf(fn, MAXPGPATH, "%d.dat", te->dumpId);
                tctx->filename = pg_strdup(fn);
        }
-       else if (strcmp(te->desc, "BLOBS") == 0)
-               tctx->filename = pg_strdup("blobs.toc");
        else
                tctx->filename = NULL;
 
@@ -726,6 +728,68 @@ setFilePath(ArchiveHandle *AH, char *buf, const char *relativeFilename)
        strcat(buf, relativeFilename);
 }
 
+/*
+ * Prepare for parallel restore.
+ *
+ * The main thing that needs to happen here is to fill in TABLE DATA and BLOBS
+ * TOC entries' dataLength fields with appropriate values to guide the
+ * ordering of restore jobs.  The source of said data is format-dependent,
+ * as is the exact meaning of the values; here, it is data file sizes.
+ *
+ * A format module might also choose to do other setup here.
+ */
+static void
+_PrepParallelRestore(ArchiveHandle *AH)
+{
+       TocEntry   *te;
+
+       for (te = AH->toc->next; te != AH->toc; te = te->next)
+       {
+               lclTocEntry *tctx = (lclTocEntry *) te->formatData;
+               char            fname[MAXPGPATH];
+               struct stat st;
+
+               /*
+                * Only entries with a data file have tctx->filename set (see
+                * _ArchiveEntry); all others have NULL there and can be skipped.
+                */
+               if (tctx->filename == NULL)
+                       continue;
+
+               /* We may ignore items whose data is not due to be restored */
+               if ((te->reqs & REQ_DATA) == 0)
+                       continue;
+
+               /*
+                * Stat the file and, if successful, put its size in dataLength.  When
+                * using compression, the physical file size might not be a very good
+                * guide to the amount of work involved in restoring the file, but we
+                * only need an approximate indicator of that.
+                */
+               setFilePath(AH, fname, tctx->filename);
+
+               if (stat(fname, &st) == 0)
+                       te->dataLength = st.st_size;
+               else
+               {
+                       /* Not found; retry with .gz in case it is compressed */
+                       strlcat(fname, ".gz", sizeof(fname));
+                       if (stat(fname, &st) == 0)
+                               te->dataLength = st.st_size;
+               }
+
+               /*
+                * If this is the BLOBS entry, what we stat'd was blobs.toc, which
+                * most likely is a lot smaller than the actual blob data.  We don't
+                * have a cheap way to estimate how much smaller, but fortunately it
+                * doesn't matter too much as long as we get the blobs processed
+                * reasonably early.  Arbitrarily scale up by a factor of 1K.
+                */
+               if (strcmp(te->desc, "BLOBS") == 0)
+                       te->dataLength *= 1024;
+       }
+}
+
 /*
  * Clone format-specific fields during parallel restoration.
  */
index f0ea83e6a97d2ac75baf59ba44774f900f5f06bf..0687a819147cc71971b0d9216e501b7f66432244 100644 (file)
@@ -54,6 +54,7 @@
 #include "catalog/pg_trigger_d.h"
 #include "catalog/pg_type_d.h"
 #include "libpq/libpq-fs.h"
+#include "storage/block.h"
 
 #include "dumputils.h"
 #include "parallel.h"
@@ -845,10 +846,6 @@ main(int argc, char **argv)
         */
        sortDumpableObjectsByTypeName(dobjs, numObjs);
 
-       /* If we do a parallel dump, we want the largest tables to go first */
-       if (archiveFormat == archDirectory && numWorkers > 1)
-               sortDataAndIndexObjectsBySize(dobjs, numObjs);
-
        sortDumpableObjects(dobjs, numObjs,
                                                boundaryObjs[0].dumpId, boundaryObjs[1].dumpId);
 
@@ -2156,13 +2153,28 @@ dumpTableData(Archive *fout, TableDataInfo *tdinfo)
         * See comments for BuildArchiveDependencies.
         */
        if (tdinfo->dobj.dump & DUMP_COMPONENT_DATA)
-               ArchiveEntry(fout, tdinfo->dobj.catId, tdinfo->dobj.dumpId,
-                                        tbinfo->dobj.name, tbinfo->dobj.namespace->dobj.name,
-                                        NULL, tbinfo->rolname,
-                                        false, "TABLE DATA", SECTION_DATA,
-                                        "", "", copyStmt,
-                                        &(tbinfo->dobj.dumpId), 1,
-                                        dumpFn, tdinfo);
+       {
+               TocEntry   *te;
+
+               te = ArchiveEntry(fout, tdinfo->dobj.catId, tdinfo->dobj.dumpId,
+                                                 tbinfo->dobj.name, tbinfo->dobj.namespace->dobj.name,
+                                                 NULL, tbinfo->rolname,
+                                                 false, "TABLE DATA", SECTION_DATA,
+                                                 "", "", copyStmt,
+                                                 &(tbinfo->dobj.dumpId), 1,
+                                                 dumpFn, tdinfo);
+
+               /*
+                * Set the TocEntry's dataLength in case we are doing a parallel dump
+                * and want to order dump jobs by table size.  We choose to measure
+                * dataLength in table pages during dump, so no scaling is needed.
+                * However, relpages is declared as "integer" in pg_class, and hence
+                * also in TableInfo, but it's really BlockNumber a/k/a unsigned int.
+                * Cast so that we get the right interpretation of table sizes
+                * exceeding INT_MAX pages.
+                */
+               te->dataLength = (BlockNumber) tbinfo->relpages;
+       }
 
        destroyPQExpBuffer(copyBuf);
        destroyPQExpBuffer(clistBuf);
@@ -6759,8 +6771,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables)
                                i_conoid,
                                i_condef,
                                i_tablespace,
-                               i_indreloptions,
-                               i_relpages;
+                               i_indreloptions;
        int                     ntups;
 
        for (i = 0; i < numTables; i++)
@@ -6807,7 +6818,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables)
                                                          "i.indnkeyatts AS indnkeyatts, "
                                                          "i.indnatts AS indnatts, "
                                                          "i.indkey, i.indisclustered, "
-                                                         "i.indisreplident, t.relpages, "
+                                                         "i.indisreplident, "
                                                          "c.contype, c.conname, "
                                                          "c.condeferrable, c.condeferred, "
                                                          "c.tableoid AS contableoid, "
@@ -6844,7 +6855,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables)
                                                          "i.indnatts AS indnkeyatts, "
                                                          "i.indnatts AS indnatts, "
                                                          "i.indkey, i.indisclustered, "
-                                                         "i.indisreplident, t.relpages, "
+                                                         "i.indisreplident, "
                                                          "c.contype, c.conname, "
                                                          "c.condeferrable, c.condeferred, "
                                                          "c.tableoid AS contableoid, "
@@ -6877,7 +6888,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables)
                                                          "i.indnatts AS indnkeyatts, "
                                                          "i.indnatts AS indnatts, "
                                                          "i.indkey, i.indisclustered, "
-                                                         "false AS indisreplident, t.relpages, "
+                                                         "false AS indisreplident, "
                                                          "c.contype, c.conname, "
                                                          "c.condeferrable, c.condeferred, "
                                                          "c.tableoid AS contableoid, "
@@ -6906,7 +6917,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables)
                                                          "i.indnatts AS indnkeyatts, "
                                                          "i.indnatts AS indnatts, "
                                                          "i.indkey, i.indisclustered, "
-                                                         "false AS indisreplident, t.relpages, "
+                                                         "false AS indisreplident, "
                                                          "c.contype, c.conname, "
                                                          "c.condeferrable, c.condeferred, "
                                                          "c.tableoid AS contableoid, "
@@ -6938,7 +6949,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables)
                                                          "t.relnatts AS indnkeyatts, "
                                                          "t.relnatts AS indnatts, "
                                                          "i.indkey, i.indisclustered, "
-                                                         "false AS indisreplident, t.relpages, "
+                                                         "false AS indisreplident, "
                                                          "c.contype, c.conname, "
                                                          "c.condeferrable, c.condeferred, "
                                                          "c.tableoid AS contableoid, "
@@ -6974,7 +6985,6 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables)
                i_indkey = PQfnumber(res, "indkey");
                i_indisclustered = PQfnumber(res, "indisclustered");
                i_indisreplident = PQfnumber(res, "indisreplident");
-               i_relpages = PQfnumber(res, "relpages");
                i_contype = PQfnumber(res, "contype");
                i_conname = PQfnumber(res, "conname");
                i_condeferrable = PQfnumber(res, "condeferrable");
@@ -7013,7 +7023,6 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables)
                        indxinfo[j].indisclustered = (PQgetvalue(res, j, i_indisclustered)[0] == 't');
                        indxinfo[j].indisreplident = (PQgetvalue(res, j, i_indisreplident)[0] == 't');
                        indxinfo[j].parentidx = atooid(PQgetvalue(res, j, i_parentidx));
-                       indxinfo[j].relpages = atoi(PQgetvalue(res, j, i_relpages));
                        contype = *(PQgetvalue(res, j, i_contype));
 
                        if (contype == 'p' || contype == 'u' || contype == 'x')
@@ -8206,6 +8215,7 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables)
                                                          "'' AS attfdwoptions,\n");
 
                if (fout->remoteVersion >= 90100)
+               {
                        /*
                         * Since we only want to dump COLLATE clauses for attributes whose
                         * collation is different from their type's default, we use a CASE
@@ -8214,6 +8224,7 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables)
                        appendPQExpBuffer(q,
                                                          "CASE WHEN a.attcollation <> t.typcollation "
                                                          "THEN a.attcollation ELSE 0 END AS attcollation,\n");
+               }
                else
                        appendPQExpBuffer(q,
                                                          "0 AS attcollation,\n");
@@ -8225,8 +8236,8 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables)
                        appendPQExpBuffer(q,
                                                          "'' AS attoptions\n");
 
+               /* need left join here to not fail on dropped columns ... */
                appendPQExpBuffer(q,
-                                                 /* need left join here to not fail on dropped columns ... */
                                                  "FROM pg_catalog.pg_attribute a LEFT JOIN pg_catalog.pg_type t "
                                                  "ON a.atttypid = t.oid\n"
                                                  "WHERE a.attrelid = '%u'::pg_catalog.oid "
@@ -9772,12 +9783,31 @@ dumpDumpableObject(Archive *fout, DumpableObject *dobj)
                        break;
                case DO_BLOB_DATA:
                        if (dobj->dump & DUMP_COMPONENT_DATA)
-                               ArchiveEntry(fout, dobj->catId, dobj->dumpId,
-                                                        dobj->name, NULL, NULL, "",
-                                                        false, "BLOBS", SECTION_DATA,
-                                                        "", "", NULL,
-                                                        NULL, 0,
-                                                        dumpBlobs, NULL);
+                       {
+                               TocEntry   *te;
+
+                               te = ArchiveEntry(fout, dobj->catId, dobj->dumpId,
+                                                                 dobj->name, NULL, NULL, "",
+                                                                 false, "BLOBS", SECTION_DATA,
+                                                                 "", "", NULL,
+                                                                 NULL, 0,
+                                                                 dumpBlobs, NULL);
+
+                               /*
+                                * Set the TocEntry's dataLength in case we are doing a
+                                * parallel dump and want to order dump jobs by table size.
+                                * (We need some size estimate for every TocEntry with a
+                                * DataDumper function.)  We don't currently have any cheap
+                                * way to estimate the size of blobs, but it doesn't matter;
+                                * let's just set the size to a large value so parallel dumps
+                                * will launch this job first.  If there's lots of blobs, we
+                                * win, and if there aren't, we don't lose much.  (If you want
+                                * to improve on this, really what you should be thinking
+                                * about is allowing blob dumping to be parallelized, not just
+                                * getting a smarter estimate for the single TOC entry.)
+                                */
+                               te->dataLength = MaxBlockNumber;
+                       }
                        break;
                case DO_POLICY:
                        dumpPolicy(fout, (PolicyInfo *) dobj);
index 1448005f303114a4c75542f6ba0542e65c9735cd..685ad78669e6932518afde5ed1dc3323f82dd988 100644 (file)
@@ -370,7 +370,6 @@ typedef struct _indxInfo
        Oid                     parentidx;              /* if partitioned, parent index OID */
        /* if there is an associated constraint object, its dumpId: */
        DumpId          indexconstraint;
-       int                     relpages;               /* relpages of the underlying table */
 } IndxInfo;
 
 typedef struct _indexAttachInfo
@@ -677,7 +676,6 @@ extern void parseOidArray(const char *str, Oid *array, int arraysize);
 extern void sortDumpableObjects(DumpableObject **objs, int numObjs,
                                        DumpId preBoundaryId, DumpId postBoundaryId);
 extern void sortDumpableObjectsByTypeName(DumpableObject **objs, int numObjs);
-extern void sortDataAndIndexObjectsBySize(DumpableObject **objs, int numObjs);
 
 /*
  * version specific routines
index 6227a8fd26843452f36c62733bbf6026b95581d3..a1d3ced3184a1c14f33deaccc3d71725b6ddc21e 100644 (file)
@@ -35,10 +35,6 @@ static const char *modulename = gettext_noop("sorter");
  * pg_dump.c; that is, PRE_DATA objects must sort before DO_PRE_DATA_BOUNDARY,
  * POST_DATA objects must sort after DO_POST_DATA_BOUNDARY, and DATA objects
  * must sort between them.
- *
- * Note: sortDataAndIndexObjectsBySize wants to have all DO_TABLE_DATA and
- * DO_INDEX objects in contiguous chunks, so do not reuse the values for those
- * for other object types.
  */
 static const int dbObjectTypePriority[] =
 {
@@ -111,96 +107,6 @@ static void repairDependencyLoop(DumpableObject **loop,
 static void describeDumpableObject(DumpableObject *obj,
                                           char *buf, int bufsize);
 
-static int     DOSizeCompare(const void *p1, const void *p2);
-
-static int
-findFirstEqualType(DumpableObjectType type, DumpableObject **objs, int numObjs)
-{
-       int                     i;
-
-       for (i = 0; i < numObjs; i++)
-               if (objs[i]->objType == type)
-                       return i;
-       return -1;
-}
-
-static int
-findFirstDifferentType(DumpableObjectType type, DumpableObject **objs, int numObjs, int start)
-{
-       int                     i;
-
-       for (i = start; i < numObjs; i++)
-               if (objs[i]->objType != type)
-                       return i;
-       return numObjs - 1;
-}
-
-/*
- * When we do a parallel dump, we want to start with the largest items first.
- *
- * Say we have the objects in this order:
- * ....DDDDD....III....
- *
- * with D = Table data, I = Index, . = other object
- *
- * This sorting function now takes each of the D or I blocks and sorts them
- * according to their size.
- */
-void
-sortDataAndIndexObjectsBySize(DumpableObject **objs, int numObjs)
-{
-       int                     startIdx,
-                               endIdx;
-       void       *startPtr;
-
-       if (numObjs <= 1)
-               return;
-
-       startIdx = findFirstEqualType(DO_TABLE_DATA, objs, numObjs);
-       if (startIdx >= 0)
-       {
-               endIdx = findFirstDifferentType(DO_TABLE_DATA, objs, numObjs, startIdx);
-               startPtr = objs + startIdx;
-               qsort(startPtr, endIdx - startIdx, sizeof(DumpableObject *),
-                         DOSizeCompare);
-       }
-
-       startIdx = findFirstEqualType(DO_INDEX, objs, numObjs);
-       if (startIdx >= 0)
-       {
-               endIdx = findFirstDifferentType(DO_INDEX, objs, numObjs, startIdx);
-               startPtr = objs + startIdx;
-               qsort(startPtr, endIdx - startIdx, sizeof(DumpableObject *),
-                         DOSizeCompare);
-       }
-}
-
-static int
-DOSizeCompare(const void *p1, const void *p2)
-{
-       DumpableObject *obj1 = *(DumpableObject **) p1;
-       DumpableObject *obj2 = *(DumpableObject **) p2;
-       int                     obj1_size = 0;
-       int                     obj2_size = 0;
-
-       if (obj1->objType == DO_TABLE_DATA)
-               obj1_size = ((TableDataInfo *) obj1)->tdtable->relpages;
-       if (obj1->objType == DO_INDEX)
-               obj1_size = ((IndxInfo *) obj1)->relpages;
-
-       if (obj2->objType == DO_TABLE_DATA)
-               obj2_size = ((TableDataInfo *) obj2)->tdtable->relpages;
-       if (obj2->objType == DO_INDEX)
-               obj2_size = ((IndxInfo *) obj2)->relpages;
-
-       /* we want to see the biggest item go first */
-       if (obj1_size > obj2_size)
-               return -1;
-       if (obj2_size > obj1_size)
-               return 1;
-
-       return 0;
-}
 
 /*
  * Sort the given objects into a type/name-based ordering