From b55722692ba0ceb934bb32bcddb562e2120f43dd Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 11 Mar 2015 21:21:00 -0400 Subject: [PATCH] Improve planner's cost estimation in the presence of semijoins. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit If we have a semijoin, say SELECT * FROM x WHERE x1 IN (SELECT y1 FROM y) and we're estimating the cost of a parameterized indexscan on x, the number of repetitions of the indexscan should not be taken as the size of y; it'll really only be the number of distinct values of y1, because the only valid plan with y on the outside of a nestloop would require y to be unique-ified before joining it to x. Most of the time this doesn't make that much difference, but sometimes it can lead to drastically underestimating the cost of the indexscan and hence choosing a bad plan, as pointed out by David Kubečka. Fixing this is a bit difficult because parameterized indexscans are costed out quite early in the planning process, before we have the information that would be needed to call estimate_num_groups() and thereby estimate the number of distinct values of the join column(s). However we can move the code that extracts a semijoin RHS's unique-ification columns, so that it's done in initsplan.c rather than on-the-fly in create_unique_path(). That shouldn't make any difference speed-wise and it's really a bit cleaner too. The other bit of information we need is the size of the semijoin RHS, which is easy if it's a single relation (we make those estimates before considering indexscan costs) but problematic if it's a join relation. The solution adopted here is just to use the product of the sizes of the join component rels. That will generally be an overestimate, but since estimate_num_groups() only uses this input as a clamp, an overestimate shouldn't hurt us too badly. In any case we don't allow this new logic to produce a value larger than we would have chosen before, so that at worst an overestimate leaves us no wiser than we were before. --- src/backend/nodes/copyfuncs.c | 5 +- src/backend/nodes/equalfuncs.c | 5 +- src/backend/nodes/outfuncs.c | 5 +- src/backend/optimizer/path/costsize.c | 10 +- src/backend/optimizer/path/indxpath.c | 157 ++++++++++++++---- src/backend/optimizer/path/joinrels.c | 5 +- src/backend/optimizer/plan/initsplan.c | 181 ++++++++++++++++++++- src/backend/optimizer/util/orclauses.c | 5 +- src/backend/optimizer/util/pathnode.c | 213 ++++--------------------- src/include/nodes/relation.h | 23 +-- 10 files changed, 385 insertions(+), 224 deletions(-) diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index ebb6f3a49b..291e6a7c39 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -1942,7 +1942,10 @@ _copySpecialJoinInfo(const SpecialJoinInfo *from) COPY_SCALAR_FIELD(jointype); COPY_SCALAR_FIELD(lhs_strict); COPY_SCALAR_FIELD(delay_upper_joins); - COPY_NODE_FIELD(join_quals); + COPY_SCALAR_FIELD(semi_can_btree); + COPY_SCALAR_FIELD(semi_can_hash); + COPY_NODE_FIELD(semi_operators); + COPY_NODE_FIELD(semi_rhs_exprs); return newnode; } diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 8186e84d33..fcd58ada3d 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -798,7 +798,10 @@ _equalSpecialJoinInfo(const SpecialJoinInfo *a, const SpecialJoinInfo *b) COMPARE_SCALAR_FIELD(jointype); COMPARE_SCALAR_FIELD(lhs_strict); COMPARE_SCALAR_FIELD(delay_upper_joins); - COMPARE_NODE_FIELD(join_quals); + COMPARE_SCALAR_FIELD(semi_can_btree); + COMPARE_SCALAR_FIELD(semi_can_hash); + COMPARE_NODE_FIELD(semi_operators); + COMPARE_NODE_FIELD(semi_rhs_exprs); return true; } diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 03f8adaae3..ca596efb5b 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -1945,7 +1945,10 @@ _outSpecialJoinInfo(StringInfo str, const SpecialJoinInfo *node) WRITE_ENUM_FIELD(jointype, JoinType); WRITE_BOOL_FIELD(lhs_strict); WRITE_BOOL_FIELD(delay_upper_joins); - WRITE_NODE_FIELD(join_quals); + WRITE_BOOL_FIELD(semi_can_btree); + WRITE_BOOL_FIELD(semi_can_hash); + WRITE_NODE_FIELD(semi_operators); + WRITE_NODE_FIELD(semi_rhs_exprs); } static void diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 5a9daf0f20..1a0d358c5f 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -3294,7 +3294,10 @@ compute_semi_anti_join_factors(PlannerInfo *root, /* we don't bother trying to make the remaining fields valid */ norm_sjinfo.lhs_strict = false; norm_sjinfo.delay_upper_joins = false; - norm_sjinfo.join_quals = NIL; + norm_sjinfo.semi_can_btree = false; + norm_sjinfo.semi_can_hash = false; + norm_sjinfo.semi_operators = NIL; + norm_sjinfo.semi_rhs_exprs = NIL; nselec = clauselist_selectivity(root, joinquals, @@ -3456,7 +3459,10 @@ approx_tuple_count(PlannerInfo *root, JoinPath *path, List *quals) /* we don't bother trying to make the remaining fields valid */ sjinfo.lhs_strict = false; sjinfo.delay_upper_joins = false; - sjinfo.join_quals = NIL; + sjinfo.semi_can_btree = false; + sjinfo.semi_can_hash = false; + sjinfo.semi_operators = NIL; + sjinfo.semi_rhs_exprs = NIL; /* Get the approximate selectivity */ foreach(l, quals) diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index b86a3cd501..da4d7c5284 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -130,7 +130,12 @@ static Relids get_bitmap_tree_required_outer(Path *bitmapqual); static void find_indexpath_quals(Path *bitmapqual, List **quals, List **preds); static int find_list_position(Node *node, List **nodelist); static bool check_index_only(RelOptInfo *rel, IndexOptInfo *index); -static double get_loop_count(PlannerInfo *root, Relids outer_relids); +static double get_loop_count(PlannerInfo *root, Index cur_relid, Relids outer_relids); +static double adjust_rowcount_for_semijoins(PlannerInfo *root, + Index cur_relid, + Index outer_relid, + double rowcount); +static double approximate_joinrel_size(PlannerInfo *root, Relids relids); static void match_restriction_clauses_to_index(RelOptInfo *rel, IndexOptInfo *index, IndexClauseSet *clauseset); @@ -402,7 +407,7 @@ create_index_paths(PlannerInfo *root, RelOptInfo *rel) /* And push that path into the mix */ required_outer = get_bitmap_tree_required_outer(bitmapqual); - loop_count = get_loop_count(root, required_outer); + loop_count = get_loop_count(root, rel->relid, required_outer); bpath = create_bitmap_heap_path(root, rel, bitmapqual, required_outer, loop_count); add_path(rel, (Path *) bpath); @@ -969,7 +974,7 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, outer_relids = NULL; /* Compute loop_count for cost estimation purposes */ - loop_count = get_loop_count(root, outer_relids); + loop_count = get_loop_count(root, rel->relid, outer_relids); /* * 2. Compute pathkeys describing index's ordering, if any, then see how @@ -1553,7 +1558,7 @@ bitmap_scan_cost_est(PlannerInfo *root, RelOptInfo *rel, Path *ipath) cost_bitmap_heap_scan(&bpath.path, root, rel, bpath.path.param_info, ipath, - get_loop_count(root, required_outer)); + get_loop_count(root, rel->relid, required_outer)); return bpath.path.total_cost; } @@ -1594,7 +1599,7 @@ bitmap_and_cost_est(PlannerInfo *root, RelOptInfo *rel, List *paths) cost_bitmap_heap_scan(&bpath.path, root, rel, bpath.path.param_info, (Path *) &apath, - get_loop_count(root, required_outer)); + get_loop_count(root, rel->relid, required_outer)); return bpath.path.total_cost; } @@ -1861,48 +1866,142 @@ check_index_only(RelOptInfo *rel, IndexOptInfo *index) * answer for single-other-relation cases, and it seems like a reasonable * zero-order approximation for multiway-join cases. * + * In addition, we check to see if the other side of each join clause is on + * the inside of some semijoin that the current relation is on the outside of. + * If so, the only way that a parameterized path could be used is if the + * semijoin RHS has been unique-ified, so we should use the number of unique + * RHS rows rather than using the relation's raw rowcount. + * * Note: for this to work, allpaths.c must establish all baserel size * estimates before it begins to compute paths, or at least before it * calls create_index_paths(). */ static double -get_loop_count(PlannerInfo *root, Relids outer_relids) +get_loop_count(PlannerInfo *root, Index cur_relid, Relids outer_relids) { - double result = 1.0; + double result; + int outer_relid; /* For a non-parameterized path, just return 1.0 quickly */ - if (outer_relids != NULL) + if (outer_relids == NULL) + return 1.0; + + result = 1.0; + outer_relid = -1; + while ((outer_relid = bms_next_member(outer_relids, outer_relid)) >= 0) { - int relid; + RelOptInfo *outer_rel; + double rowcount; - relid = -1; - while ((relid = bms_next_member(outer_relids, relid)) >= 0) - { - RelOptInfo *outer_rel; + /* Paranoia: ignore bogus relid indexes */ + if (outer_relid >= root->simple_rel_array_size) + continue; + outer_rel = root->simple_rel_array[outer_relid]; + if (outer_rel == NULL) + continue; + Assert(outer_rel->relid == outer_relid); /* sanity check on array */ - /* Paranoia: ignore bogus relid indexes */ - if (relid >= root->simple_rel_array_size) - continue; - outer_rel = root->simple_rel_array[relid]; - if (outer_rel == NULL) - continue; - Assert(outer_rel->relid == relid); /* sanity check on array */ + /* Other relation could be proven empty, if so ignore */ + if (IS_DUMMY_REL(outer_rel)) + continue; - /* Other relation could be proven empty, if so ignore */ - if (IS_DUMMY_REL(outer_rel)) - continue; + /* Otherwise, rel's rows estimate should be valid by now */ + Assert(outer_rel->rows > 0); - /* Otherwise, rel's rows estimate should be valid by now */ - Assert(outer_rel->rows > 0); + /* Check to see if rel is on the inside of any semijoins */ + rowcount = adjust_rowcount_for_semijoins(root, + cur_relid, + outer_relid, + outer_rel->rows); - /* Remember smallest row count estimate among the outer rels */ - if (result == 1.0 || result > outer_rel->rows) - result = outer_rel->rows; - } + /* Remember smallest row count estimate among the outer rels */ + if (result == 1.0 || result > rowcount) + result = rowcount; } return result; } +/* + * Check to see if outer_relid is on the inside of any semijoin that cur_relid + * is on the outside of. If so, replace rowcount with the estimated number of + * unique rows from the semijoin RHS (assuming that's smaller, which it might + * not be). The estimate is crude but it's the best we can do at this stage + * of the proceedings. + */ +static double +adjust_rowcount_for_semijoins(PlannerInfo *root, + Index cur_relid, + Index outer_relid, + double rowcount) +{ + ListCell *lc; + + foreach(lc, root->join_info_list) + { + SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(lc); + + if (sjinfo->jointype == JOIN_SEMI && + bms_is_member(cur_relid, sjinfo->syn_lefthand) && + bms_is_member(outer_relid, sjinfo->syn_righthand)) + { + /* Estimate number of unique-ified rows */ + double nraw; + double nunique; + + nraw = approximate_joinrel_size(root, sjinfo->syn_righthand); + nunique = estimate_num_groups(root, + sjinfo->semi_rhs_exprs, + nraw); + if (rowcount > nunique) + rowcount = nunique; + } + } + return rowcount; +} + +/* + * Make an approximate estimate of the size of a joinrel. + * + * We don't have enough info at this point to get a good estimate, so we + * just multiply the base relation sizes together. Fortunately, this is + * the right answer anyway for the most common case with a single relation + * on the RHS of a semijoin. Also, estimate_num_groups() has only a weak + * dependency on its input_rows argument (it basically uses it as a clamp). + * So we might be able to get a fairly decent end result even with a severe + * overestimate of the RHS's raw size. + */ +static double +approximate_joinrel_size(PlannerInfo *root, Relids relids) +{ + double rowcount = 1.0; + int relid; + + relid = -1; + while ((relid = bms_next_member(relids, relid)) >= 0) + { + RelOptInfo *rel; + + /* Paranoia: ignore bogus relid indexes */ + if (relid >= root->simple_rel_array_size) + continue; + rel = root->simple_rel_array[relid]; + if (rel == NULL) + continue; + Assert(rel->relid == relid); /* sanity check on array */ + + /* Relation could be proven empty, if so ignore */ + if (IS_DUMMY_REL(rel)) + continue; + + /* Otherwise, rel's rows estimate should be valid by now */ + Assert(rel->rows > 0); + + /* Accumulate product */ + rowcount *= rel->rows; + } + return rowcount; +} + /**************************************************************************** * ---- ROUTINES TO CHECK QUERY CLAUSES ---- diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c index e7e9a1ab49..fe9fd57317 100644 --- a/src/backend/optimizer/path/joinrels.c +++ b/src/backend/optimizer/path/joinrels.c @@ -624,7 +624,10 @@ make_join_rel(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2) /* we don't bother trying to make the remaining fields valid */ sjinfo->lhs_strict = false; sjinfo->delay_upper_joins = false; - sjinfo->join_quals = NIL; + sjinfo->semi_can_btree = false; + sjinfo->semi_can_hash = false; + sjinfo->semi_operators = NIL; + sjinfo->semi_rhs_exprs = NIL; } /* diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c index 49d776d929..a7655e4a71 100644 --- a/src/backend/optimizer/plan/initsplan.c +++ b/src/backend/optimizer/plan/initsplan.c @@ -17,6 +17,7 @@ #include "catalog/pg_type.h" #include "nodes/nodeFuncs.h" #include "optimizer/clauses.h" +#include "optimizer/cost.h" #include "optimizer/joininfo.h" #include "optimizer/pathnode.h" #include "optimizer/paths.h" @@ -55,6 +56,7 @@ static SpecialJoinInfo *make_outerjoininfo(PlannerInfo *root, Relids left_rels, Relids right_rels, Relids inner_join_rels, JoinType jointype, List *clause); +static void compute_semijoin_info(SpecialJoinInfo *sjinfo, List *clause); static void distribute_qual_to_rels(PlannerInfo *root, Node *clause, bool is_deduced, bool below_outer_join, @@ -1085,7 +1087,8 @@ make_outerjoininfo(PlannerInfo *root, sjinfo->jointype = jointype; /* this always starts out false */ sjinfo->delay_upper_joins = false; - sjinfo->join_quals = clause; + + compute_semijoin_info(sjinfo, clause); /* If it's a full join, no need to be very smart */ if (jointype == JOIN_FULL) @@ -1237,6 +1240,182 @@ make_outerjoininfo(PlannerInfo *root, return sjinfo; } +/* + * compute_semijoin_info + * Fill semijoin-related fields of a new SpecialJoinInfo + * + * Note: this relies on only the jointype and syn_righthand fields of the + * SpecialJoinInfo; the rest may not be set yet. + */ +static void +compute_semijoin_info(SpecialJoinInfo *sjinfo, List *clause) +{ + List *semi_operators; + List *semi_rhs_exprs; + bool all_btree; + bool all_hash; + ListCell *lc; + + /* Initialize semijoin-related fields in case we can't unique-ify */ + sjinfo->semi_can_btree = false; + sjinfo->semi_can_hash = false; + sjinfo->semi_operators = NIL; + sjinfo->semi_rhs_exprs = NIL; + + /* Nothing more to do if it's not a semijoin */ + if (sjinfo->jointype != JOIN_SEMI) + return; + + /* + * Look to see whether the semijoin's join quals consist of AND'ed + * equality operators, with (only) RHS variables on only one side of each + * one. If so, we can figure out how to enforce uniqueness for the RHS. + * + * Note that the input clause list is the list of quals that are + * *syntactically* associated with the semijoin, which in practice means + * the synthesized comparison list for an IN or the WHERE of an EXISTS. + * Particularly in the latter case, it might contain clauses that aren't + * *semantically* associated with the join, but refer to just one side or + * the other. We can ignore such clauses here, as they will just drop + * down to be processed within one side or the other. (It is okay to + * consider only the syntactically-associated clauses here because for a + * semijoin, no higher-level quals could refer to the RHS, and so there + * can be no other quals that are semantically associated with this join. + * We do things this way because it is useful to have the set of potential + * unique-ification expressions before we can extract the list of quals + * that are actually semantically associated with the particular join.) + * + * Note that the semi_operators list consists of the joinqual operators + * themselves (but commuted if needed to put the RHS value on the right). + * These could be cross-type operators, in which case the operator + * actually needed for uniqueness is a related single-type operator. We + * assume here that that operator will be available from the btree or hash + * opclass when the time comes ... if not, create_unique_plan() will fail. + */ + semi_operators = NIL; + semi_rhs_exprs = NIL; + all_btree = true; + all_hash = enable_hashagg; /* don't consider hash if not enabled */ + foreach(lc, clause) + { + OpExpr *op = (OpExpr *) lfirst(lc); + Oid opno; + Node *left_expr; + Node *right_expr; + Relids left_varnos; + Relids right_varnos; + Relids all_varnos; + Oid opinputtype; + + /* Is it a binary opclause? */ + if (!IsA(op, OpExpr) || + list_length(op->args) != 2) + { + /* No, but does it reference both sides? */ + all_varnos = pull_varnos((Node *) op); + if (!bms_overlap(all_varnos, sjinfo->syn_righthand) || + bms_is_subset(all_varnos, sjinfo->syn_righthand)) + { + /* + * Clause refers to only one rel, so ignore it --- unless it + * contains volatile functions, in which case we'd better + * punt. + */ + if (contain_volatile_functions((Node *) op)) + return; + continue; + } + /* Non-operator clause referencing both sides, must punt */ + return; + } + + /* Extract data from binary opclause */ + opno = op->opno; + left_expr = linitial(op->args); + right_expr = lsecond(op->args); + left_varnos = pull_varnos(left_expr); + right_varnos = pull_varnos(right_expr); + all_varnos = bms_union(left_varnos, right_varnos); + opinputtype = exprType(left_expr); + + /* Does it reference both sides? */ + if (!bms_overlap(all_varnos, sjinfo->syn_righthand) || + bms_is_subset(all_varnos, sjinfo->syn_righthand)) + { + /* + * Clause refers to only one rel, so ignore it --- unless it + * contains volatile functions, in which case we'd better punt. + */ + if (contain_volatile_functions((Node *) op)) + return; + continue; + } + + /* check rel membership of arguments */ + if (!bms_is_empty(right_varnos) && + bms_is_subset(right_varnos, sjinfo->syn_righthand) && + !bms_overlap(left_varnos, sjinfo->syn_righthand)) + { + /* typical case, right_expr is RHS variable */ + } + else if (!bms_is_empty(left_varnos) && + bms_is_subset(left_varnos, sjinfo->syn_righthand) && + !bms_overlap(right_varnos, sjinfo->syn_righthand)) + { + /* flipped case, left_expr is RHS variable */ + opno = get_commutator(opno); + if (!OidIsValid(opno)) + return; + right_expr = left_expr; + } + else + { + /* mixed membership of args, punt */ + return; + } + + /* all operators must be btree equality or hash equality */ + if (all_btree) + { + /* oprcanmerge is considered a hint... */ + if (!op_mergejoinable(opno, opinputtype) || + get_mergejoin_opfamilies(opno) == NIL) + all_btree = false; + } + if (all_hash) + { + /* ... but oprcanhash had better be correct */ + if (!op_hashjoinable(opno, opinputtype)) + all_hash = false; + } + if (!(all_btree || all_hash)) + return; + + /* so far so good, keep building lists */ + semi_operators = lappend_oid(semi_operators, opno); + semi_rhs_exprs = lappend(semi_rhs_exprs, copyObject(right_expr)); + } + + /* Punt if we didn't find at least one column to unique-ify */ + if (semi_rhs_exprs == NIL) + return; + + /* + * The expressions we'd need to unique-ify mustn't be volatile. + */ + if (contain_volatile_functions((Node *) semi_rhs_exprs)) + return; + + /* + * If we get here, we can unique-ify the semijoin's RHS using at least one + * of sorting and hashing. Save the information about how to do that. + */ + sjinfo->semi_can_btree = all_btree; + sjinfo->semi_can_hash = all_hash; + sjinfo->semi_operators = semi_operators; + sjinfo->semi_rhs_exprs = semi_rhs_exprs; +} + /***************************************************************************** * diff --git a/src/backend/optimizer/util/orclauses.c b/src/backend/optimizer/util/orclauses.c index d1c4e99a9b..f0acc143f0 100644 --- a/src/backend/optimizer/util/orclauses.c +++ b/src/backend/optimizer/util/orclauses.c @@ -335,7 +335,10 @@ consider_new_or_clause(PlannerInfo *root, RelOptInfo *rel, /* we don't bother trying to make the remaining fields valid */ sjinfo.lhs_strict = false; sjinfo.delay_upper_joins = false; - sjinfo.join_quals = NIL; + sjinfo.semi_can_btree = false; + sjinfo.semi_can_hash = false; + sjinfo.semi_operators = NIL; + sjinfo.semi_rhs_exprs = NIL; /* Compute inner-join size */ orig_selec = clause_selectivity(root, (Node *) join_or_rinfo, diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 1395a21139..faca30b322 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1088,12 +1088,7 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, Path sort_path; /* dummy for result of cost_sort */ Path agg_path; /* dummy for result of cost_agg */ MemoryContext oldcontext; - List *in_operators; - List *uniq_exprs; - bool all_btree; - bool all_hash; int numCols; - ListCell *lc; /* Caller made a mistake if subpath isn't cheapest_total ... */ Assert(subpath == rel->cheapest_total_path); @@ -1106,8 +1101,8 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, if (rel->cheapest_unique_path) return (UniquePath *) rel->cheapest_unique_path; - /* If we previously failed, return NULL quickly */ - if (sjinfo->join_quals == NIL) + /* If it's not possible to unique-ify, return NULL */ + if (!(sjinfo->semi_can_btree || sjinfo->semi_can_hash)) return NULL; /* @@ -1116,150 +1111,6 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, */ oldcontext = MemoryContextSwitchTo(root->planner_cxt); - /*---------- - * Look to see whether the semijoin's join quals consist of AND'ed - * equality operators, with (only) RHS variables on only one side of - * each one. If so, we can figure out how to enforce uniqueness for - * the RHS. - * - * Note that the input join_quals list is the list of quals that are - * *syntactically* associated with the semijoin, which in practice means - * the synthesized comparison list for an IN or the WHERE of an EXISTS. - * Particularly in the latter case, it might contain clauses that aren't - * *semantically* associated with the join, but refer to just one side or - * the other. We can ignore such clauses here, as they will just drop - * down to be processed within one side or the other. (It is okay to - * consider only the syntactically-associated clauses here because for a - * semijoin, no higher-level quals could refer to the RHS, and so there - * can be no other quals that are semantically associated with this join. - * We do things this way because it is useful to be able to run this test - * before we have extracted the list of quals that are actually - * semantically associated with the particular join.) - * - * Note that the in_operators list consists of the joinqual operators - * themselves (but commuted if needed to put the RHS value on the right). - * These could be cross-type operators, in which case the operator - * actually needed for uniqueness is a related single-type operator. - * We assume here that that operator will be available from the btree - * or hash opclass when the time comes ... if not, create_unique_plan() - * will fail. - *---------- - */ - in_operators = NIL; - uniq_exprs = NIL; - all_btree = true; - all_hash = enable_hashagg; /* don't consider hash if not enabled */ - foreach(lc, sjinfo->join_quals) - { - OpExpr *op = (OpExpr *) lfirst(lc); - Oid opno; - Node *left_expr; - Node *right_expr; - Relids left_varnos; - Relids right_varnos; - Relids all_varnos; - Oid opinputtype; - - /* Is it a binary opclause? */ - if (!IsA(op, OpExpr) || - list_length(op->args) != 2) - { - /* No, but does it reference both sides? */ - all_varnos = pull_varnos((Node *) op); - if (!bms_overlap(all_varnos, sjinfo->syn_righthand) || - bms_is_subset(all_varnos, sjinfo->syn_righthand)) - { - /* - * Clause refers to only one rel, so ignore it --- unless it - * contains volatile functions, in which case we'd better - * punt. - */ - if (contain_volatile_functions((Node *) op)) - goto no_unique_path; - continue; - } - /* Non-operator clause referencing both sides, must punt */ - goto no_unique_path; - } - - /* Extract data from binary opclause */ - opno = op->opno; - left_expr = linitial(op->args); - right_expr = lsecond(op->args); - left_varnos = pull_varnos(left_expr); - right_varnos = pull_varnos(right_expr); - all_varnos = bms_union(left_varnos, right_varnos); - opinputtype = exprType(left_expr); - - /* Does it reference both sides? */ - if (!bms_overlap(all_varnos, sjinfo->syn_righthand) || - bms_is_subset(all_varnos, sjinfo->syn_righthand)) - { - /* - * Clause refers to only one rel, so ignore it --- unless it - * contains volatile functions, in which case we'd better punt. - */ - if (contain_volatile_functions((Node *) op)) - goto no_unique_path; - continue; - } - - /* check rel membership of arguments */ - if (!bms_is_empty(right_varnos) && - bms_is_subset(right_varnos, sjinfo->syn_righthand) && - !bms_overlap(left_varnos, sjinfo->syn_righthand)) - { - /* typical case, right_expr is RHS variable */ - } - else if (!bms_is_empty(left_varnos) && - bms_is_subset(left_varnos, sjinfo->syn_righthand) && - !bms_overlap(right_varnos, sjinfo->syn_righthand)) - { - /* flipped case, left_expr is RHS variable */ - opno = get_commutator(opno); - if (!OidIsValid(opno)) - goto no_unique_path; - right_expr = left_expr; - } - else - goto no_unique_path; - - /* all operators must be btree equality or hash equality */ - if (all_btree) - { - /* oprcanmerge is considered a hint... */ - if (!op_mergejoinable(opno, opinputtype) || - get_mergejoin_opfamilies(opno) == NIL) - all_btree = false; - } - if (all_hash) - { - /* ... but oprcanhash had better be correct */ - if (!op_hashjoinable(opno, opinputtype)) - all_hash = false; - } - if (!(all_btree || all_hash)) - goto no_unique_path; - - /* so far so good, keep building lists */ - in_operators = lappend_oid(in_operators, opno); - uniq_exprs = lappend(uniq_exprs, copyObject(right_expr)); - } - - /* Punt if we didn't find at least one column to unique-ify */ - if (uniq_exprs == NIL) - goto no_unique_path; - - /* - * The expressions we'd need to unique-ify mustn't be volatile. - */ - if (contain_volatile_functions((Node *) uniq_exprs)) - goto no_unique_path; - - /* - * If we get here, we can unique-ify using at least one of sorting and - * hashing. Start building the result Path object. - */ pathnode = makeNode(UniquePath); pathnode->path.pathtype = T_Unique; @@ -1273,18 +1124,19 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, pathnode->path.pathkeys = NIL; pathnode->subpath = subpath; - pathnode->in_operators = in_operators; - pathnode->uniq_exprs = uniq_exprs; + pathnode->in_operators = sjinfo->semi_operators; + pathnode->uniq_exprs = sjinfo->semi_rhs_exprs; /* * If the input is a relation and it has a unique index that proves the - * uniq_exprs are unique, then we don't need to do anything. Note that - * relation_has_unique_index_for automatically considers restriction + * semi_rhs_exprs are unique, then we don't need to do anything. Note + * that relation_has_unique_index_for automatically considers restriction * clauses for the rel, as well. */ - if (rel->rtekind == RTE_RELATION && all_btree && + if (rel->rtekind == RTE_RELATION && sjinfo->semi_can_btree && relation_has_unique_index_for(root, rel, NIL, - uniq_exprs, in_operators)) + sjinfo->semi_rhs_exprs, + sjinfo->semi_operators)) { pathnode->umethod = UNIQUE_PATH_NOOP; pathnode->path.rows = rel->rows; @@ -1304,7 +1156,7 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, * don't need to do anything. The test for uniqueness has to consider * exactly which columns we are extracting; for example "SELECT DISTINCT * x,y" doesn't guarantee that x alone is distinct. So we cannot check for - * this optimization unless uniq_exprs consists only of simple Vars + * this optimization unless semi_rhs_exprs consists only of simple Vars * referencing subquery outputs. (Possibly we could do something with * expressions in the subquery outputs, too, but for now keep it simple.) */ @@ -1316,11 +1168,13 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, { List *sub_tlist_colnos; - sub_tlist_colnos = translate_sub_tlist(uniq_exprs, rel->relid); + sub_tlist_colnos = translate_sub_tlist(sjinfo->semi_rhs_exprs, + rel->relid); if (sub_tlist_colnos && query_is_distinct_for(rte->subquery, - sub_tlist_colnos, in_operators)) + sub_tlist_colnos, + sjinfo->semi_operators)) { pathnode->umethod = UNIQUE_PATH_NOOP; pathnode->path.rows = rel->rows; @@ -1338,10 +1192,12 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, } /* Estimate number of output rows */ - pathnode->path.rows = estimate_num_groups(root, uniq_exprs, rel->rows); - numCols = list_length(uniq_exprs); + pathnode->path.rows = estimate_num_groups(root, + sjinfo->semi_rhs_exprs, + rel->rows); + numCols = list_length(sjinfo->semi_rhs_exprs); - if (all_btree) + if (sjinfo->semi_can_btree) { /* * Estimate cost for sort+unique implementation @@ -1363,7 +1219,7 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, sort_path.total_cost += cpu_operator_cost * rel->rows * numCols; } - if (all_hash) + if (sjinfo->semi_can_hash) { /* * Estimate the overhead per hashtable entry at 64 bytes (same as in @@ -1372,7 +1228,13 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, int hashentrysize = rel->width + 64; if (hashentrysize * pathnode->path.rows > work_mem * 1024L) - all_hash = false; /* don't try to hash */ + { + /* + * We should not try to hash. Hack the SpecialJoinInfo to + * remember this, in case we come through here again. + */ + sjinfo->semi_can_hash = false; + } else cost_agg(&agg_path, root, AGG_HASHED, NULL, @@ -1382,19 +1244,23 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, rel->rows); } - if (all_btree && all_hash) + if (sjinfo->semi_can_btree && sjinfo->semi_can_hash) { if (agg_path.total_cost < sort_path.total_cost) pathnode->umethod = UNIQUE_PATH_HASH; else pathnode->umethod = UNIQUE_PATH_SORT; } - else if (all_btree) + else if (sjinfo->semi_can_btree) pathnode->umethod = UNIQUE_PATH_SORT; - else if (all_hash) + else if (sjinfo->semi_can_hash) pathnode->umethod = UNIQUE_PATH_HASH; else - goto no_unique_path; + { + /* we can get here only if we abandoned hashing above */ + MemoryContextSwitchTo(oldcontext); + return NULL; + } if (pathnode->umethod == UNIQUE_PATH_HASH) { @@ -1412,15 +1278,6 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, MemoryContextSwitchTo(oldcontext); return pathnode; - -no_unique_path: /* failure exit */ - - /* Mark the SpecialJoinInfo as not unique-able */ - sjinfo->join_quals = NIL; - - MemoryContextSwitchTo(oldcontext); - - return NULL; } /* diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 6845a403e1..e6dd4e8515 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -101,8 +101,7 @@ typedef struct PlannerGlobal bool transientPlan; /* redo plan when TransactionXmin changes? */ - bool hasRowSecurity; /* row security applied? */ - + bool hasRowSecurity; /* row security applied? */ } PlannerGlobal; /* macro for fetching the Plan associated with a SubPlan node */ @@ -1374,11 +1373,13 @@ typedef struct PlaceHolderVar * commute with this join, because that would leave noplace to check the * pushed-down clause. (We don't track this for FULL JOINs, either.) * - * join_quals is an implicit-AND list of the quals syntactically associated - * with the join (they may or may not end up being applied at the join level). - * This is just a side list and does not drive actual application of quals. - * For JOIN_SEMI joins, this is cleared to NIL in create_unique_path() if - * the join is found not to be suitable for a uniqueify-the-RHS plan. + * For a semijoin, we also extract the join operators and their RHS arguments + * and set semi_operators, semi_rhs_exprs, semi_can_btree, and semi_can_hash. + * This is done in support of possibly unique-ifying the RHS, so we don't + * bother unless at least one of semi_can_btree and semi_can_hash can be set + * true. (You might expect that this information would be computed during + * join planning; but it's helpful to have it available during planning of + * parameterized table scans, so we store it in the SpecialJoinInfo structs.) * * jointype is never JOIN_RIGHT; a RIGHT JOIN is handled by switching * the inputs to make it a LEFT JOIN. So the allowed values of jointype @@ -1391,7 +1392,7 @@ typedef struct PlaceHolderVar * SpecialJoinInfos with jointype == JOIN_INNER for outer joins, since for * cost estimation purposes it is sometimes useful to know the join size under * plain innerjoin semantics. Note that lhs_strict, delay_upper_joins, and - * join_quals are not set meaningfully within such structs. + * of course the semi_xxx fields are not set meaningfully within such structs. */ typedef struct SpecialJoinInfo @@ -1404,7 +1405,11 @@ typedef struct SpecialJoinInfo JoinType jointype; /* always INNER, LEFT, FULL, SEMI, or ANTI */ bool lhs_strict; /* joinclause is strict for some LHS rel */ bool delay_upper_joins; /* can't commute with upper RHS */ - List *join_quals; /* join quals, in implicit-AND list format */ + /* Remaining fields are set only for JOIN_SEMI jointype: */ + bool semi_can_btree; /* true if semi_operators are all btree */ + bool semi_can_hash; /* true if semi_operators are all hash */ + List *semi_operators; /* OIDs of equality join operators */ + List *semi_rhs_exprs; /* righthand-side expressions of these ops */ } SpecialJoinInfo; /* -- 2.40.0