From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Fri, 13 Apr 2012 19:32:34 +0000 (-0400)
Subject: Weaken the planner's tests for relevant joinclauses.
X-Git-Tag: REL9_2_BETA1~139
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e3ffd05b02468b1a53de31a322cedf195576a625;p=postgresql

Weaken the planner's tests for relevant joinclauses.

We should be willing to cross-join two small relations if that allows us
to use an inner indexscan on a large relation (that is, the potential
indexqual for the large table requires both smaller relations).  This
worked in simple cases but fell apart as soon as there was a join clause
to a fourth relation, because the existence of any two-relation join clause
caused the planner to not consider clauseless joins between other base
relations.  The added regression test shows an example case adapted from
a recent complaint from Benoit Delbosc.

Adjust have_relevant_joinclause, have_relevant_eclass_joinclause, and
has_relevant_eclass_joinclause to consider that a join clause mentioning
three or more relations is sufficient grounds for joining any subset of
those relations, even if we have to do so via a cartesian join.  Since such
clauses are relatively uncommon, this shouldn't affect planning speed on
typical queries; in fact it should help a bit, because the latter two
functions in particular get significantly simpler.

Although this is arguably a bug fix, I'm not going to risk back-patching
it, since it might have currently-unforeseen consequences.
---

diff --git a/src/backend/optimizer/path/equivclass.c b/src/backend/optimizer/path/equivclass.c
index c115c2148d..5cdfec542c 100644
--- a/src/backend/optimizer/path/equivclass.c
+++ b/src/backend/optimizer/path/equivclass.c
@@ -2035,7 +2035,7 @@ get_parent_relid(PlannerInfo *root, RelOptInfo *rel)
 /*
  * have_relevant_eclass_joinclause
  *		Detect whether there is an EquivalenceClass that could produce
- *		a joinclause between the two given relations.
+ *		a joinclause involving the two given relations.
  *
  * This is essentially a very cut-down version of
  * generate_join_implied_equalities().	Note it's OK to occasionally say "yes"
@@ -2051,9 +2051,6 @@ have_relevant_eclass_joinclause(PlannerInfo *root,
 	foreach(lc1, root->eq_classes)
 	{
 		EquivalenceClass *ec = (EquivalenceClass *) lfirst(lc1);
-		bool		has_rel1;
-		bool		has_rel2;
-		ListCell   *lc2;
 
 		/*
 		 * Won't generate joinclauses if single-member (this test covers the
@@ -2063,9 +2060,18 @@ have_relevant_eclass_joinclause(PlannerInfo *root,
 			continue;
 
 		/*
+		 * We do not need to examine the individual members of the EC, because
+		 * all that we care about is whether each rel overlaps the relids of
+		 * at least one member, and a test on ec_relids is sufficient to prove
+		 * that.  (As with have_relevant_joinclause(), it is not necessary
+		 * that the EC be able to form a joinclause relating exactly the two
+		 * given rels, only that it be able to form a joinclause mentioning
+		 * both, and this will surely be true if both of them overlap
+		 * ec_relids.)
+		 *
 		 * Note we don't test ec_broken; if we did, we'd need a separate code
-		 * path to look through ec_sources.  Checking the members anyway is OK
-		 * as a possibly-overoptimistic heuristic.
+		 * path to look through ec_sources.  Checking the membership anyway is
+		 * OK as a possibly-overoptimistic heuristic.
 		 *
 		 * We don't test ec_has_const either, even though a const eclass won't
 		 * generate real join clauses.	This is because if we had "WHERE a.x =
@@ -2073,35 +2079,8 @@ have_relevant_eclass_joinclause(PlannerInfo *root,
 		 * since the join result is likely to be small even though it'll end
 		 * up being an unqualified nestloop.
 		 */
-
-		/* Needn't scan if it couldn't contain members from each rel */
-		if (!bms_overlap(rel1->relids, ec->ec_relids) ||
-			!bms_overlap(rel2->relids, ec->ec_relids))
-			continue;
-
-		/* Scan the EC to see if it has member(s) in each rel */
-		has_rel1 = has_rel2 = false;
-		foreach(lc2, ec->ec_members)
-		{
-			EquivalenceMember *cur_em = (EquivalenceMember *) lfirst(lc2);
-
-			if (cur_em->em_is_const || cur_em->em_is_child)
-				continue;		/* ignore consts and children here */
-			if (bms_is_subset(cur_em->em_relids, rel1->relids))
-			{
-				has_rel1 = true;
-				if (has_rel2)
-					break;
-			}
-			if (bms_is_subset(cur_em->em_relids, rel2->relids))
-			{
-				has_rel2 = true;
-				if (has_rel1)
-					break;
-			}
-		}
-
-		if (has_rel1 && has_rel2)
+		if (bms_overlap(rel1->relids, ec->ec_relids) &&
+			bms_overlap(rel2->relids, ec->ec_relids))
 			return true;
 	}
 
@@ -2112,7 +2091,7 @@ have_relevant_eclass_joinclause(PlannerInfo *root,
 /*
  * has_relevant_eclass_joinclause
  *		Detect whether there is an EquivalenceClass that could produce
- *		a joinclause between the given relation and anything else.
+ *		a joinclause involving the given relation and anything else.
  *
  * This is the same as have_relevant_eclass_joinclause with the other rel
  * implicitly defined as "everything else in the query".
@@ -2125,9 +2104,6 @@ has_relevant_eclass_joinclause(PlannerInfo *root, RelOptInfo *rel1)
 	foreach(lc1, root->eq_classes)
 	{
 		EquivalenceClass *ec = (EquivalenceClass *) lfirst(lc1);
-		bool		has_rel1;
-		bool		has_rel2;
-		ListCell   *lc2;
 
 		/*
 		 * Won't generate joinclauses if single-member (this test covers the
@@ -2137,45 +2113,11 @@ has_relevant_eclass_joinclause(PlannerInfo *root, RelOptInfo *rel1)
 			continue;
 
 		/*
-		 * Note we don't test ec_broken; if we did, we'd need a separate code
-		 * path to look through ec_sources.  Checking the members anyway is OK
-		 * as a possibly-overoptimistic heuristic.
-		 *
-		 * We don't test ec_has_const either, even though a const eclass won't
-		 * generate real join clauses.	This is because if we had "WHERE a.x =
-		 * b.y and a.x = 42", it is worth considering a join between a and b,
-		 * since the join result is likely to be small even though it'll end
-		 * up being an unqualified nestloop.
+		 * Per the comment in have_relevant_eclass_joinclause, it's sufficient
+		 * to find an EC that mentions both this rel and some other rel.
 		 */
-
-		/* Needn't scan if it couldn't contain members from each rel */
-		if (!bms_overlap(rel1->relids, ec->ec_relids) ||
-			bms_is_subset(ec->ec_relids, rel1->relids))
-			continue;
-
-		/* Scan the EC to see if it has member(s) in each rel */
-		has_rel1 = has_rel2 = false;
-		foreach(lc2, ec->ec_members)
-		{
-			EquivalenceMember *cur_em = (EquivalenceMember *) lfirst(lc2);
-
-			if (cur_em->em_is_const || cur_em->em_is_child)
-				continue;		/* ignore consts and children here */
-			if (bms_is_subset(cur_em->em_relids, rel1->relids))
-			{
-				has_rel1 = true;
-				if (has_rel2)
-					break;
-			}
-			if (!bms_overlap(cur_em->em_relids, rel1->relids))
-			{
-				has_rel2 = true;
-				if (has_rel1)
-					break;
-			}
-		}
-
-		if (has_rel1 && has_rel2)
+		if (bms_overlap(rel1->relids, ec->ec_relids) &&
+			!bms_is_subset(ec->ec_relids, rel1->relids))
 			return true;
 	}
 
diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c
index 4a35d8d3a4..2ad0b969d2 100644
--- a/src/backend/optimizer/path/joinrels.c
+++ b/src/backend/optimizer/path/joinrels.c
@@ -88,13 +88,9 @@ join_search_one_level(PlannerInfo *root, int level)
 			has_join_restriction(root, old_rel))
 		{
 			/*
-			 * Note that if all available join clauses for this rel require
-			 * more than one other rel, we will fail to make any joins against
-			 * it here.  In most cases that's OK; it'll be considered by
-			 * "bushy plan" join code in a higher-level pass where we have
-			 * those other rels collected into a join rel.
-			 *
-			 * See also the last-ditch case below.
+			 * There are relevant join clauses or join order restrictions,
+			 * so consider joins between this rel and (only) those rels it is
+			 * linked to by a clause or restriction.
 			 */
 			make_rels_by_clause_joins(root,
 									  old_rel,
@@ -160,8 +156,8 @@ join_search_one_level(PlannerInfo *root, int level)
 				{
 					/*
 					 * OK, we can build a rel of the right level from this
-					 * pair of rels.  Do so if there is at least one usable
-					 * join clause or a relevant join restriction.
+					 * pair of rels.  Do so if there is at least one relevant
+					 * join clause or join order restriction.
 					 */
 					if (have_relevant_joinclause(root, old_rel, new_rel) ||
 						have_join_order_restriction(root, old_rel, new_rel))
diff --git a/src/backend/optimizer/util/joininfo.c b/src/backend/optimizer/util/joininfo.c
index b582ccdc00..20d57c5140 100644
--- a/src/backend/optimizer/util/joininfo.c
+++ b/src/backend/optimizer/util/joininfo.c
@@ -21,34 +21,46 @@
 
 /*
  * have_relevant_joinclause
- *		Detect whether there is a joinclause that can be used to join
+ *		Detect whether there is a joinclause that involves
  *		the two given relations.
+ *
+ * Note: the joinclause does not have to be evaluatable with only these two
+ * relations.  This is intentional.  For example consider
+ *		SELECT * FROM a, b, c WHERE a.x = (b.y + c.z)
+ * If a is much larger than the other tables, it may be worthwhile to
+ * cross-join b and c and then use an inner indexscan on a.x.  Therefore
+ * we should consider this joinclause as reason to join b to c, even though
+ * it can't be applied at that join step.
  */
 bool
 have_relevant_joinclause(PlannerInfo *root,
 						 RelOptInfo *rel1, RelOptInfo *rel2)
 {
 	bool		result = false;
-	Relids		join_relids;
 	List	   *joininfo;
+	Relids		other_relids;
 	ListCell   *l;
 
-	join_relids = bms_union(rel1->relids, rel2->relids);
-
 	/*
 	 * We could scan either relation's joininfo list; may as well use the
 	 * shorter one.
 	 */
 	if (list_length(rel1->joininfo) <= list_length(rel2->joininfo))
+	{
 		joininfo = rel1->joininfo;
+		other_relids = rel2->relids;
+	}
 	else
+	{
 		joininfo = rel2->joininfo;
+		other_relids = rel1->relids;
+	}
 
 	foreach(l, joininfo)
 	{
 		RestrictInfo *rinfo = (RestrictInfo *) lfirst(l);
 
-		if (bms_is_subset(rinfo->required_relids, join_relids))
+		if (bms_overlap(other_relids, rinfo->required_relids))
 		{
 			result = true;
 			break;
@@ -62,8 +74,6 @@ have_relevant_joinclause(PlannerInfo *root,
 	if (!result && rel1->has_eclass_joins && rel2->has_eclass_joins)
 		result = have_relevant_eclass_joinclause(root, rel1, rel2);
 
-	bms_free(join_relids);
-
 	return result;
 }
 
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out
index c0c7283333..e5dceff765 100644
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -2666,6 +2666,57 @@ select * from int4_tbl a full join int4_tbl b on false;
  -2147483647 |            
 (10 rows)
 
+--
+-- test for ability to use a cartesian join when necessary
+--
+explain (costs off)
+select * from
+  tenk1 join int4_tbl on f1 = twothousand,
+  int4(sin(1)) q1,
+  int4(sin(0)) q2
+where q1 = thousand or q2 = thousand;
+                                 QUERY PLAN                                  
+-----------------------------------------------------------------------------
+ Hash Join
+   Hash Cond: (tenk1.twothousand = int4_tbl.f1)
+   ->  Nested Loop
+         Join Filter: ((q1.q1 = tenk1.thousand) OR (q2.q2 = tenk1.thousand))
+         ->  Nested Loop
+               ->  Function Scan on q1
+               ->  Function Scan on q2
+         ->  Bitmap Heap Scan on tenk1
+               Recheck Cond: ((q1.q1 = thousand) OR (q2.q2 = thousand))
+               ->  BitmapOr
+                     ->  Bitmap Index Scan on tenk1_thous_tenthous
+                           Index Cond: (q1.q1 = thousand)
+                     ->  Bitmap Index Scan on tenk1_thous_tenthous
+                           Index Cond: (q2.q2 = thousand)
+   ->  Hash
+         ->  Seq Scan on int4_tbl
+(16 rows)
+
+explain (costs off)
+select * from
+  tenk1 join int4_tbl on f1 = twothousand,
+  int4(sin(1)) q1,
+  int4(sin(0)) q2
+where thousand = (q1 + q2);
+                          QUERY PLAN                          
+--------------------------------------------------------------
+ Hash Join
+   Hash Cond: (tenk1.twothousand = int4_tbl.f1)
+   ->  Nested Loop
+         ->  Nested Loop
+               ->  Function Scan on q1
+               ->  Function Scan on q2
+         ->  Bitmap Heap Scan on tenk1
+               Recheck Cond: (thousand = (q1.q1 + q2.q2))
+               ->  Bitmap Index Scan on tenk1_thous_tenthous
+                     Index Cond: (thousand = (q1.q1 + q2.q2))
+   ->  Hash
+         ->  Seq Scan on int4_tbl
+(12 rows)
+
 --
 -- test join removal
 --
diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql
index 2d53cf1725..bc9b7812cf 100644
--- a/src/test/regress/sql/join.sql
+++ b/src/test/regress/sql/join.sql
@@ -689,6 +689,24 @@ order by 1,2;
 select * from int4_tbl a full join int4_tbl b on true;
 select * from int4_tbl a full join int4_tbl b on false;
 
+--
+-- test for ability to use a cartesian join when necessary
+--
+
+explain (costs off)
+select * from
+  tenk1 join int4_tbl on f1 = twothousand,
+  int4(sin(1)) q1,
+  int4(sin(0)) q2
+where q1 = thousand or q2 = thousand;
+
+explain (costs off)
+select * from
+  tenk1 join int4_tbl on f1 = twothousand,
+  int4(sin(1)) q1,
+  int4(sin(0)) q2
+where thousand = (q1 + q2);
+
 --
 -- test join removal
 --