From: Tom Lane Date: Fri, 6 Feb 2009 23:43:24 +0000 (+0000) Subject: Fix cost_mergejoin's failure to adjust for rescanning of non-unique merge join X-Git-Tag: REL8_4_BETA1~299 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=c473d923515e03fe5fd43c2ca15d52363a93f488;p=postgresql Fix cost_mergejoin's failure to adjust for rescanning of non-unique merge join keys when considering a semi or anti join. This requires estimating the selectivity of the merge qual as though it were a regular inner join condition. To allow caching both that and the real outer-join-aware selectivity, split RestrictInfo.this_selec into two fields. This fixes one of the problems reported by Kevin Grittner. --- diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index de57c874e9..bc4232bdc8 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -15,7 +15,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/nodes/copyfuncs.c,v 1.422 2009/02/02 19:31:39 alvherre Exp $ + * $PostgreSQL: pgsql/src/backend/nodes/copyfuncs.c,v 1.423 2009/02/06 23:43:23 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1606,7 +1606,8 @@ _copyRestrictInfo(RestrictInfo *from) /* EquivalenceClasses are never copied, so shallow-copy the pointers */ COPY_SCALAR_FIELD(parent_ec); COPY_SCALAR_FIELD(eval_cost); - COPY_SCALAR_FIELD(this_selec); + COPY_SCALAR_FIELD(norm_selec); + COPY_SCALAR_FIELD(outer_selec); COPY_NODE_FIELD(mergeopfamilies); /* EquivalenceClasses are never copied, so shallow-copy the pointers */ COPY_SCALAR_FIELD(left_ec); diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 74df2f3061..5dc9db98bf 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.351 2009/02/02 19:31:39 alvherre Exp $ + * $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.352 2009/02/06 23:43:23 tgl Exp $ * * NOTES * Every node type that can appear in stored rules' parsetrees *must* @@ -1609,7 +1609,8 @@ _outRestrictInfo(StringInfo str, RestrictInfo *node) WRITE_BITMAPSET_FIELD(right_relids); WRITE_NODE_FIELD(orclause); /* don't write parent_ec, leads to infinite recursion in plan tree dump */ - WRITE_FLOAT_FIELD(this_selec, "%.4f"); + WRITE_FLOAT_FIELD(norm_selec, "%.4f"); + WRITE_FLOAT_FIELD(outer_selec, "%.4f"); WRITE_NODE_FIELD(mergeopfamilies); /* don't write left_ec, leads to infinite recursion in plan tree dump */ /* don't write right_ec, leads to infinite recursion in plan tree dump */ diff --git a/src/backend/optimizer/path/clausesel.c b/src/backend/optimizer/path/clausesel.c index e9a94e7339..ee02689d29 100644 --- a/src/backend/optimizer/path/clausesel.c +++ b/src/backend/optimizer/path/clausesel.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/path/clausesel.c,v 1.96 2009/01/01 17:23:43 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/path/clausesel.c,v 1.97 2009/02/06 23:43:23 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -516,21 +516,34 @@ clause_selectivity(PlannerInfo *root, /* * If the clause is marked redundant, always return 1.0. */ - if (rinfo->this_selec > 1) + if (rinfo->norm_selec > 1) return (Selectivity) 1.0; /* * If possible, cache the result of the selectivity calculation for * the clause. We can cache if varRelid is zero or the clause * contains only vars of that relid --- otherwise varRelid will affect - * the result, so mustn't cache. + * the result, so mustn't cache. Outer join quals might be examined + * with either their join's actual jointype or JOIN_INNER, so we need + * two cache variables to remember both cases. Note: we assume the + * result won't change if we are switching the input relations or + * considering a unique-ified case, so we only need one cache variable + * for all non-JOIN_INNER cases. */ if (varRelid == 0 || bms_is_subset_singleton(rinfo->clause_relids, varRelid)) { /* Cacheable --- do we already have the result? */ - if (rinfo->this_selec >= 0) - return rinfo->this_selec; + if (jointype == JOIN_INNER) + { + if (rinfo->norm_selec >= 0) + return rinfo->norm_selec; + } + else + { + if (rinfo->outer_selec >= 0) + return rinfo->outer_selec; + } cacheable = true; } @@ -753,7 +766,12 @@ clause_selectivity(PlannerInfo *root, /* Cache the result if possible */ if (cacheable) - rinfo->this_selec = s1; + { + if (jointype == JOIN_INNER) + rinfo->norm_selec = s1; + else + rinfo->outer_selec = s1; + } #ifdef SELECTIVITY_DEBUG elog(DEBUG4, "clause_selectivity: s1 %f", s1); diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 1f8f62314e..07ddf43c8d 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -54,7 +54,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.203 2009/01/01 17:23:43 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.204 2009/02/06 23:43:23 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -120,7 +120,7 @@ static MergeScanSelCache *cached_scansel(PlannerInfo *root, PathKey *pathkey); static bool cost_qual_eval_walker(Node *node, cost_qual_eval_context *context); static double approx_tuple_count(PlannerInfo *root, JoinPath *path, - List *quals, SpecialJoinInfo *sjinfo); + List *quals); static void set_rel_width(PlannerInfo *root, RelOptInfo *rel); static double relation_byte_size(double tuples, int width); static double page_size(double tuples, int width); @@ -1507,11 +1507,9 @@ cost_mergejoin(MergePath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo) /* * Get approx # tuples passing the mergequals. We use approx_tuple_count - * here for speed --- in most cases, any errors won't affect the result - * much. + * here because we need an estimate done with JOIN_INNER semantics. */ - mergejointuples = approx_tuple_count(root, &path->jpath, - mergeclauses, sjinfo); + mergejointuples = approx_tuple_count(root, &path->jpath, mergeclauses); /* * When there are equal merge keys in the outer relation, the mergejoin @@ -1539,16 +1537,10 @@ cost_mergejoin(MergePath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo) * when we should not. Can we do better without expensive selectivity * computations? * - * For SEMI and ANTI joins, only one inner tuple need be rescanned for - * each group of same-keyed outer tuples (assuming that all joinquals - * are merge quals). This makes the effect small enough to ignore, - * so we just set rescannedtuples = 0. Likewise, the whole issue is - * moot if we are working from a unique-ified outer input. + * The whole issue is moot if we are working from a unique-ified outer + * input. */ - if (sjinfo->jointype == JOIN_SEMI || - sjinfo->jointype == JOIN_ANTI) - rescannedtuples = 0; - else if (IsA(outer_path, UniquePath)) + if (IsA(outer_path, UniquePath)) rescannedtuples = 0; else { @@ -1847,11 +1839,9 @@ cost_hashjoin(HashPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo) /* * Get approx # tuples passing the hashquals. We use approx_tuple_count - * here for speed --- in most cases, any errors won't affect the result - * much. + * here because we need an estimate done with JOIN_INNER semantics. */ - hashjointuples = approx_tuple_count(root, &path->jpath, - hashclauses, sjinfo); + hashjointuples = approx_tuple_count(root, &path->jpath, hashclauses); /* cost of source data */ startup_cost += outer_path->startup_cost; @@ -2324,6 +2314,11 @@ cost_qual_eval_walker(Node *node, cost_qual_eval_context *context) * The quals can be either an implicitly-ANDed list of boolean expressions, * or a list of RestrictInfo nodes (typically the latter). * + * We intentionally compute the selectivity under JOIN_INNER rules, even + * if it's some type of outer join. This is appropriate because we are + * trying to figure out how many tuples pass the initial merge or hash + * join step. + * * This is quick-and-dirty because we bypass clauselist_selectivity, and * simply multiply the independent clause selectivities together. Now * clauselist_selectivity often can't do any better than that anyhow, but @@ -2336,31 +2331,40 @@ cost_qual_eval_walker(Node *node, cost_qual_eval_context *context) * seems OK to live with the approximation. */ static double -approx_tuple_count(PlannerInfo *root, JoinPath *path, - List *quals, SpecialJoinInfo *sjinfo) +approx_tuple_count(PlannerInfo *root, JoinPath *path, List *quals) { double tuples; double outer_tuples = path->outerjoinpath->parent->rows; double inner_tuples = path->innerjoinpath->parent->rows; + SpecialJoinInfo sjinfo; Selectivity selec = 1.0; ListCell *l; + /* + * Make up a SpecialJoinInfo for JOIN_INNER semantics. + */ + sjinfo.type = T_SpecialJoinInfo; + sjinfo.min_lefthand = path->outerjoinpath->parent->relids; + sjinfo.min_righthand = path->innerjoinpath->parent->relids; + sjinfo.syn_lefthand = path->outerjoinpath->parent->relids; + sjinfo.syn_righthand = path->innerjoinpath->parent->relids; + sjinfo.jointype = JOIN_INNER; + /* we don't bother trying to make the remaining fields valid */ + sjinfo.lhs_strict = false; + sjinfo.delay_upper_joins = false; + sjinfo.join_quals = NIL; + /* Get the approximate selectivity */ foreach(l, quals) { Node *qual = (Node *) lfirst(l); /* Note that clause_selectivity will be able to cache its result */ - selec *= clause_selectivity(root, qual, 0, sjinfo->jointype, sjinfo); + selec *= clause_selectivity(root, qual, 0, JOIN_INNER, &sjinfo); } - /* Apply it correctly using the input relation sizes */ - if (sjinfo->jointype == JOIN_SEMI) - tuples = selec * outer_tuples; - else if (sjinfo->jointype == JOIN_ANTI) - tuples = (1.0 - selec) * outer_tuples; - else - tuples = selec * outer_tuples * inner_tuples; + /* Apply it to the input relation sizes */ + tuples = selec * outer_tuples * inner_tuples; return clamp_row_est(tuples); } diff --git a/src/backend/optimizer/path/equivclass.c b/src/backend/optimizer/path/equivclass.c index cbad81603e..bc4544e5e0 100644 --- a/src/backend/optimizer/path/equivclass.c +++ b/src/backend/optimizer/path/equivclass.c @@ -10,7 +10,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/path/equivclass.c,v 1.16 2009/01/01 17:23:43 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/path/equivclass.c,v 1.17 2009/02/06 23:43:23 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1200,7 +1200,8 @@ reconsider_outer_join_clauses(PlannerInfo *root) list_delete_cell(root->left_join_clauses, cell, prev); /* we throw it back anyway (see notes above) */ /* but the thrown-back clause has no extra selectivity */ - rinfo->this_selec = 2.0; + rinfo->norm_selec = 2.0; + rinfo->outer_selec = 1.0; distribute_restrictinfo_to_rels(root, rinfo); } else @@ -1222,7 +1223,8 @@ reconsider_outer_join_clauses(PlannerInfo *root) list_delete_cell(root->right_join_clauses, cell, prev); /* we throw it back anyway (see notes above) */ /* but the thrown-back clause has no extra selectivity */ - rinfo->this_selec = 2.0; + rinfo->norm_selec = 2.0; + rinfo->outer_selec = 1.0; distribute_restrictinfo_to_rels(root, rinfo); } else @@ -1244,7 +1246,8 @@ reconsider_outer_join_clauses(PlannerInfo *root) list_delete_cell(root->full_join_clauses, cell, prev); /* we throw it back anyway (see notes above) */ /* but the thrown-back clause has no extra selectivity */ - rinfo->this_selec = 2.0; + rinfo->norm_selec = 2.0; + rinfo->outer_selec = 1.0; distribute_restrictinfo_to_rels(root, rinfo); } else diff --git a/src/backend/optimizer/path/orindxpath.c b/src/backend/optimizer/path/orindxpath.c index a82f1c8b82..638078e169 100644 --- a/src/backend/optimizer/path/orindxpath.c +++ b/src/backend/optimizer/path/orindxpath.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/path/orindxpath.c,v 1.86 2009/01/01 17:23:44 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/path/orindxpath.c,v 1.87 2009/02/06 23:43:23 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -174,10 +174,11 @@ create_or_index_quals(PlannerInfo *root, RelOptInfo *rel) { orig_selec = clause_selectivity(root, (Node *) bestrinfo, 0, JOIN_INNER, NULL); - bestrinfo->this_selec = orig_selec / or_selec; + bestrinfo->norm_selec = orig_selec / or_selec; /* clamp result to sane range */ - if (bestrinfo->this_selec > 1) - bestrinfo->this_selec = 1; + if (bestrinfo->norm_selec > 1) + bestrinfo->norm_selec = 1; + /* It isn't an outer join clause, so no need to adjust outer_selec */ } /* Tell caller to recompute rel's rows estimate */ diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index 1ef4532e5e..b9ce6d2ed0 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -22,7 +22,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/prep/prepunion.c,v 1.164 2009/01/01 17:23:44 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/prep/prepunion.c,v 1.165 2009/02/06 23:43:23 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1662,7 +1662,8 @@ adjust_appendrel_attrs_mutator(Node *node, AppendRelInfo *context) * different values when considering the child relation. */ newinfo->eval_cost.startup = -1; - newinfo->this_selec = -1; + newinfo->norm_selec = -1; + newinfo->outer_selec = -1; newinfo->left_ec = NULL; newinfo->right_ec = NULL; newinfo->left_em = NULL; diff --git a/src/backend/optimizer/util/restrictinfo.c b/src/backend/optimizer/util/restrictinfo.c index ddf7daf1ff..22e2aeb493 100644 --- a/src/backend/optimizer/util/restrictinfo.c +++ b/src/backend/optimizer/util/restrictinfo.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/util/restrictinfo.c,v 1.56 2009/01/01 17:23:45 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/util/restrictinfo.c,v 1.57 2009/02/06 23:43:23 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -338,7 +338,8 @@ make_restrictinfo_internal(Expr *clause, restrictinfo->parent_ec = NULL; restrictinfo->eval_cost.startup = -1; - restrictinfo->this_selec = -1; + restrictinfo->norm_selec = -1; + restrictinfo->outer_selec = -1; restrictinfo->mergeopfamilies = NIL; diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 259b6e1739..f00d1becc7 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.167 2009/01/01 17:24:00 momjian Exp $ + * $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.168 2009/02/06 23:43:24 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -992,8 +992,11 @@ typedef struct RestrictInfo /* cache space for cost and selectivity */ QualCost eval_cost; /* eval cost of clause; -1 if not yet set */ - Selectivity this_selec; /* selectivity; -1 if not yet set; >1 means + Selectivity norm_selec; /* selectivity for "normal" (JOIN_INNER) + * semantics; -1 if not yet set; >1 means * a redundant clause */ + Selectivity outer_selec; /* selectivity for outer join semantics; + * -1 if not yet set */ /* valid if clause is mergejoinable, else NIL */ List *mergeopfamilies; /* opfamilies containing clause operator */