From b514a7460d9127ddda6598307272c701cbb133b7 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 28 Feb 2015 12:43:04 -0500 Subject: [PATCH] Fix planning of star-schema-style queries. Part of the intent of the parameterized-path mechanism was to handle star-schema queries efficiently, but some overly-restrictive search limiting logic added in commit e2fa76d80ba571d4de8992de6386536867250474 prevented such cases from working as desired. Fix that and add a regression test about it. Per gripe from Marc Cousin. This is arguably a bug rather than a new feature, so back-patch to 9.2 where parameterized paths were introduced. --- src/backend/optimizer/README | 34 +++++++++++++++++----- src/backend/optimizer/path/joinpath.c | 41 ++++++++++++++++++++------- src/test/regress/expected/join.out | 19 +++++++++++++ src/test/regress/sql/join.sql | 9 ++++++ 4 files changed, 86 insertions(+), 17 deletions(-) diff --git a/src/backend/optimizer/README b/src/backend/optimizer/README index 88cdbc3294..6cae9e8b48 100644 --- a/src/backend/optimizer/README +++ b/src/backend/optimizer/README @@ -702,17 +702,37 @@ intermediate layers of joins, for example: -> Index Scan using C_Z_IDX on C Index Condition: C.Z = A.X -If all joins are plain inner joins then this is unnecessary, because -it's always possible to reorder the joins so that a parameter is used +If all joins are plain inner joins then this is usually unnecessary, +because it's possible to reorder the joins so that a parameter is used immediately below the nestloop node that provides it. But in the -presence of outer joins, join reordering may not be possible, and then -this option can be critical. Before version 9.2, Postgres used ad-hoc -methods for planning and executing such queries, and those methods could -not handle passing parameters down through multiple join levels. +presence of outer joins, such join reordering may not be possible. + +Also, the bottom-level scan might require parameters from more than one +other relation. In principle we could join the other relations first +so that all the parameters are supplied from a single nestloop level. +But if those other relations have no join clause in common (which is +common in star-schema queries for instance), the planner won't consider +joining them directly to each other. In such a case we need to be able +to create a plan like + + NestLoop + -> Seq Scan on SmallTable1 A + NestLoop + -> Seq Scan on SmallTable2 B + NestLoop + -> Index Scan using XYIndex on LargeTable C + Index Condition: C.X = A.AID and C.Y = B.BID + +so we should be willing to pass down A.AID through a join even though +there is no join order constraint forcing the plan to look like this. + +Before version 9.2, Postgres used ad-hoc methods for planning and +executing nestloop queries of this kind, and those methods could not +handle passing parameters down through multiple join levels. To plan such queries, we now use a notion of a "parameterized path", which is a path that makes use of a join clause to a relation that's not -scanned by the path. In the example just above, we would construct a +scanned by the path. In the example two above, we would construct a path representing the possibility of doing this: -> Index Scan using C_Z_IDX on C diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c index e6aa21c45f..1da953f6d3 100644 --- a/src/backend/optimizer/path/joinpath.c +++ b/src/backend/optimizer/path/joinpath.c @@ -117,13 +117,14 @@ add_paths_to_joinrel(PlannerInfo *root, /* * Decide whether it's sensible to generate parameterized paths for this * joinrel, and if so, which relations such paths should require. There - * is no need to create a parameterized result path unless there is a join - * order restriction that prevents joining one of our input rels directly - * to the parameter source rel instead of joining to the other input rel. - * This restriction reduces the number of parameterized paths we have to - * deal with at higher join levels, without compromising the quality of - * the resulting plan. We express the restriction as a Relids set that - * must overlap the parameterization of any proposed join path. + * is usually no need to create a parameterized result path unless there + * is a join order restriction that prevents joining one of our input rels + * directly to the parameter source rel instead of joining to the other + * input rel. (But see exception in try_nestloop_path.) This restriction + * reduces the number of parameterized paths we have to deal with at + * higher join levels, without compromising the quality of the resulting + * plan. We express the restriction as a Relids set that must overlap the + * parameterization of any proposed join path. */ foreach(lc, root->join_info_list) { @@ -291,9 +292,29 @@ try_nestloop_path(PlannerInfo *root, if (required_outer && !bms_overlap(required_outer, param_source_rels)) { - /* Waste no memory when we reject a path here */ - bms_free(required_outer); - return; + /* + * We override the param_source_rels heuristic to accept nestloop + * paths in which the outer rel satisfies some but not all of the + * inner path's parameterization. This is necessary to get good plans + * for star-schema scenarios, in which a parameterized path for a + * large table may require parameters from multiple small tables that + * will not get joined directly to each other. We can handle that by + * stacking nestloops that have the small tables on the outside; but + * this breaks the rule the param_source_rels heuristic is based on, + * namely that parameters should not be passed down across joins + * unless there's a join-order-constraint-based reason to do so. So + * ignore the param_source_rels restriction when this case applies. + */ + Relids outerrelids = outer_path->parent->relids; + Relids innerparams = PATH_REQ_OUTER(inner_path); + + if (!(bms_overlap(innerparams, outerrelids) && + bms_nonempty_difference(innerparams, outerrelids))) + { + /* Waste no memory when we reject a path here */ + bms_free(required_outer); + return; + } } /* diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index 25011847a0..ca3a17b283 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -2780,6 +2780,25 @@ where thousand = (q1 + q2); -> Seq Scan on int4_tbl (12 rows) +-- +-- test ability to generate a suitable plan for a star-schema query +-- +explain (costs off) +select * from + tenk1, int8_tbl a, int8_tbl b +where thousand = a.q1 and tenthous = b.q1 and a.q2 = 1 and b.q2 = 2; + QUERY PLAN +--------------------------------------------------------------------- + Nested Loop + -> Seq Scan on int8_tbl b + Filter: (q2 = 2) + -> Nested Loop + -> Seq Scan on int8_tbl a + Filter: (q2 = 1) + -> Index Scan using tenk1_thous_tenthous on tenk1 + Index Cond: ((thousand = a.q1) AND (tenthous = b.q1)) +(8 rows) + -- -- test extraction of restriction OR clauses from join OR clause -- (we used to only do this for indexable clauses) diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index 718e1d9fc3..6005476c16 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -763,6 +763,15 @@ select * from int4(sin(0)) q2 where thousand = (q1 + q2); +-- +-- test ability to generate a suitable plan for a star-schema query +-- + +explain (costs off) +select * from + tenk1, int8_tbl a, int8_tbl b +where thousand = a.q1 and tenthous = b.q1 and a.q2 = 1 and b.q2 = 2; + -- -- test extraction of restriction OR clauses from join OR clause -- (we used to only do this for indexable clauses) -- 2.40.0