From c8e2e0e7129276440d1806dfe4f930c7177ccaac Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Fri, 25 Jul 2014 19:48:42 -0400
Subject: [PATCH] Fix a performance problem in pg_dump's dump order selection
 logic.

findDependencyLoops() was not bright about cases where there are multiple
dependency paths between the same two dumpable objects.  In most scenarios
this did not hurt us too badly; but since the introduction of section
boundary pseudo-objects in commit a1ef01fe163b304760088e3e30eb22036910a495,
it was possible for this code to take unreasonable amounts of time (tens
of seconds on a database with a couple thousand objects), as reported in
bug #11033 from Joe Van Dyk.  Joe's particular problem scenario involved
"pg_dump -a" mode with long chains of foreign key constraints, but I think
that similar problems could arise with other situations as long as there
were enough objects.  To fix, add a flag array that lets us notice when we
arrive at the same object again while searching from a given start object.
This simple change seems to be enough to eliminate the performance problem.

Back-patch to 9.1, like the patch that introduced section boundary objects.
---
 src/bin/pg_dump/pg_dump_sort.c | 54 ++++++++++++++++++++++++++++------
 1 file changed, 45 insertions(+), 9 deletions(-)

diff --git a/src/bin/pg_dump/pg_dump_sort.c b/src/bin/pg_dump/pg_dump_sort.c
index 1b505a0fca..f0caa6b659 100644
--- a/src/bin/pg_dump/pg_dump_sort.c
+++ b/src/bin/pg_dump/pg_dump_sort.c
@@ -137,6 +137,7 @@ static void findDependencyLoops(DumpableObject **objs, int nObjs, int totObjs);
 static int findLoop(DumpableObject *obj,
 		 DumpId startPoint,
 		 bool *processed,
+		 DumpId *searchFailed,
 		 DumpableObject **workspace,
 		 int depth);
 static void repairDependencyLoop(DumpableObject **loop,
@@ -633,21 +634,35 @@ static void
 findDependencyLoops(DumpableObject **objs, int nObjs, int totObjs)
 {
 	/*
-	 * We use two data structures here.  One is a bool array processed[],
-	 * which is indexed by dump ID and marks the objects already processed
-	 * during this invocation of findDependencyLoops().  The other is a
-	 * workspace[] array of DumpableObject pointers, in which we try to build
-	 * lists of objects constituting loops.  We make workspace[] large enough
-	 * to hold all the objects, which is huge overkill in most cases but could
-	 * theoretically be necessary if there is a single dependency chain
-	 * linking all the objects.
+	 * We use three data structures here:
+	 *
+	 * processed[] is a bool array indexed by dump ID, marking the objects
+	 * already processed during this invocation of findDependencyLoops().
+	 *
+	 * searchFailed[] is another array indexed by dump ID.  searchFailed[j] is
+	 * set to dump ID k if we have proven that there is no dependency path
+	 * leading from object j back to start point k.  This allows us to skip
+	 * useless searching when there are multiple dependency paths from k to j,
+	 * which is a common situation.  We could use a simple bool array for
+	 * this, but then we'd need to re-zero it for each start point, resulting
+	 * in O(N^2) zeroing work.  Using the start point's dump ID as the "true"
+	 * value lets us skip clearing the array before we consider the next start
+	 * point.
+	 *
+	 * workspace[] is an array of DumpableObject pointers, in which we try to
+	 * build lists of objects constituting loops.  We make workspace[] large
+	 * enough to hold all the objects in TopoSort's output, which is huge
+	 * overkill in most cases but could theoretically be necessary if there is
+	 * a single dependency chain linking all the objects.
 	 */
 	bool	   *processed;
+	DumpId	   *searchFailed;
 	DumpableObject **workspace;
 	bool		fixedloop;
 	int			i;
 
 	processed = (bool *) pg_malloc0((getMaxDumpId() + 1) * sizeof(bool));
+	searchFailed = (DumpId *) pg_malloc0((getMaxDumpId() + 1) * sizeof(DumpId));
 	workspace = (DumpableObject **) pg_malloc(totObjs * sizeof(DumpableObject *));
 	fixedloop = false;
 
@@ -657,7 +672,12 @@ findDependencyLoops(DumpableObject **objs, int nObjs, int totObjs)
 		int			looplen;
 		int			j;
 
-		looplen = findLoop(obj, obj->dumpId, processed, workspace, 0);
+		looplen = findLoop(obj,
+						   obj->dumpId,
+						   processed,
+						   searchFailed,
+						   workspace,
+						   0);
 
 		if (looplen > 0)
 		{
@@ -685,6 +705,7 @@ findDependencyLoops(DumpableObject **objs, int nObjs, int totObjs)
 		exit_horribly(modulename, "could not identify dependency loop\n");
 
 	free(workspace);
+	free(searchFailed);
 	free(processed);
 }
 
@@ -695,6 +716,7 @@ findDependencyLoops(DumpableObject **objs, int nObjs, int totObjs)
  *	obj: object we are examining now
  *	startPoint: dumpId of starting object for the hoped-for circular loop
  *	processed[]: flag array marking already-processed objects
+ *	searchFailed[]: flag array marking already-unsuccessfully-visited objects
  *	workspace[]: work array in which we are building list of loop members
  *	depth: number of valid entries in workspace[] at call
  *
@@ -708,6 +730,7 @@ static int
 findLoop(DumpableObject *obj,
 		 DumpId startPoint,
 		 bool *processed,
+		 DumpId *searchFailed,
 		 DumpableObject **workspace,
 		 int depth)
 {
@@ -720,6 +743,13 @@ findLoop(DumpableObject *obj,
 	if (processed[obj->dumpId])
 		return 0;
 
+	/*
+	 * If we've already proven there is no path from this object back to the
+	 * startPoint, forget it.
+	 */
+	if (searchFailed[obj->dumpId] == startPoint)
+		return 0;
+
 	/*
 	 * Reject if obj is already present in workspace.  This test prevents us
 	 * from going into infinite recursion if we are given a startPoint object
@@ -759,12 +789,18 @@ findLoop(DumpableObject *obj,
 		newDepth = findLoop(nextobj,
 							startPoint,
 							processed,
+							searchFailed,
 							workspace,
 							depth);
 		if (newDepth > 0)
 			return newDepth;
 	}
 
+	/*
+	 * Remember there is no path from here back to startPoint
+	 */
+	searchFailed[obj->dumpId] = startPoint;
+
 	return 0;
 }
 
-- 
2.40.0