1 /*-------------------------------------------------------------------------
4 * Routines to compute clause selectivities
6 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
11 * $PostgreSQL: pgsql/src/backend/optimizer/path/clausesel.c,v 1.91 2008/08/14 18:47:59 tgl Exp $
13 *-------------------------------------------------------------------------
17 #include "catalog/pg_operator.h"
18 #include "nodes/makefuncs.h"
19 #include "optimizer/clauses.h"
20 #include "optimizer/cost.h"
21 #include "optimizer/pathnode.h"
22 #include "optimizer/plancat.h"
23 #include "parser/parsetree.h"
24 #include "utils/fmgroids.h"
25 #include "utils/lsyscache.h"
26 #include "utils/selfuncs.h"
30 * Data structure for accumulating info about possible range-query
31 * clause pairs in clauselist_selectivity.
33 typedef struct RangeQueryClause
35 struct RangeQueryClause *next; /* next in linked list */
36 Node *var; /* The common variable of the clauses */
37 bool have_lobound; /* found a low-bound clause yet? */
38 bool have_hibound; /* found a high-bound clause yet? */
39 Selectivity lobound; /* Selectivity of a var > something clause */
40 Selectivity hibound; /* Selectivity of a var < something clause */
43 static void addRangeClause(RangeQueryClause **rqlist, Node *clause,
44 bool varonleft, bool isLTsel, Selectivity s2);
47 /****************************************************************************
48 * ROUTINES TO COMPUTE SELECTIVITIES
49 ****************************************************************************/
52 * clauselist_selectivity -
53 * Compute the selectivity of an implicitly-ANDed list of boolean
54 * expression clauses. The list can be empty, in which case 1.0
55 * must be returned. List elements may be either RestrictInfos
56 * or bare expression clauses --- the former is preferred since
57 * it allows caching of results.
59 * See clause_selectivity() for the meaning of the additional parameters.
61 * Our basic approach is to take the product of the selectivities of the
62 * subclauses. However, that's only right if the subclauses have independent
63 * probabilities, and in reality they are often NOT independent. So,
64 * we want to be smarter where we can.
66 * Currently, the only extra smarts we have is to recognize "range queries",
67 * such as "x > 34 AND x < 42". Clauses are recognized as possible range
68 * query components if they are restriction opclauses whose operators have
69 * scalarltsel() or scalargtsel() as their restriction selectivity estimator.
70 * We pair up clauses of this form that refer to the same variable. An
71 * unpairable clause of this kind is simply multiplied into the selectivity
72 * product in the normal way. But when we find a pair, we know that the
73 * selectivities represent the relative positions of the low and high bounds
74 * within the column's range, so instead of figuring the selectivity as
75 * hisel * losel, we can figure it as hisel + losel - 1. (To visualize this,
76 * see that hisel is the fraction of the range below the high bound, while
77 * losel is the fraction above the low bound; so hisel can be interpreted
78 * directly as a 0..1 value but we need to convert losel to 1-losel before
79 * interpreting it as a value. Then the available range is 1-losel to hisel.
80 * However, this calculation double-excludes nulls, so really we need
81 * hisel + losel + null_frac - 1.)
83 * If either selectivity is exactly DEFAULT_INEQ_SEL, we forget this equation
84 * and instead use DEFAULT_RANGE_INEQ_SEL. The same applies if the equation
85 * yields an impossible (negative) result.
87 * A free side-effect is that we can recognize redundant inequalities such
88 * as "x < 4 AND x < 5"; only the tighter constraint will be counted.
90 * Of course this is all very dependent on the behavior of
91 * scalarltsel/scalargtsel; perhaps some day we can generalize the approach.
94 clauselist_selectivity(PlannerInfo *root,
98 SpecialJoinInfo *sjinfo)
100 Selectivity s1 = 1.0;
101 RangeQueryClause *rqlist = NULL;
105 * If there's exactly one clause, then no use in trying to match up
106 * pairs, so just go directly to clause_selectivity().
108 if (list_length(clauses) == 1)
109 return clause_selectivity(root, (Node *) linitial(clauses),
110 varRelid, jointype, sjinfo);
113 * Initial scan over clauses. Anything that doesn't look like a potential
114 * rangequery clause gets multiplied into s1 and forgotten. Anything that
115 * does gets inserted into an rqlist entry.
119 Node *clause = (Node *) lfirst(l);
123 /* Always compute the selectivity using clause_selectivity */
124 s2 = clause_selectivity(root, clause, varRelid, jointype, sjinfo);
127 * Check for being passed a RestrictInfo.
129 * If it's a pseudoconstant RestrictInfo, then s2 is either 1.0 or
130 * 0.0; just use that rather than looking for range pairs.
132 if (IsA(clause, RestrictInfo))
134 rinfo = (RestrictInfo *) clause;
135 if (rinfo->pseudoconstant)
140 clause = (Node *) rinfo->clause;
146 * See if it looks like a restriction clause with a pseudoconstant on
147 * one side. (Anything more complicated than that might not behave in
148 * the simple way we are expecting.) Most of the tests here can be
149 * done more efficiently with rinfo than without.
151 if (is_opclause(clause) && list_length(((OpExpr *) clause)->args) == 2)
153 OpExpr *expr = (OpExpr *) clause;
154 bool varonleft = true;
159 ok = (bms_membership(rinfo->clause_relids) == BMS_SINGLETON) &&
160 (is_pseudo_constant_clause_relids(lsecond(expr->args),
161 rinfo->right_relids) ||
163 is_pseudo_constant_clause_relids(linitial(expr->args),
164 rinfo->left_relids)));
168 ok = (NumRelids(clause) == 1) &&
169 (is_pseudo_constant_clause(lsecond(expr->args)) ||
171 is_pseudo_constant_clause(linitial(expr->args))));
177 * If it's not a "<" or ">" operator, just merge the
178 * selectivity in generically. But if it's the right oprrest,
179 * add the clause to rqlist for later processing.
181 switch (get_oprrest(expr->opno))
184 addRangeClause(&rqlist, clause,
185 varonleft, true, s2);
188 addRangeClause(&rqlist, clause,
189 varonleft, false, s2);
192 /* Just merge the selectivity in generically */
196 continue; /* drop to loop bottom */
200 /* Not the right form, so treat it generically. */
205 * Now scan the rangequery pair list.
207 while (rqlist != NULL)
209 RangeQueryClause *rqnext;
211 if (rqlist->have_lobound && rqlist->have_hibound)
213 /* Successfully matched a pair of range clauses */
217 * Exact equality to the default value probably means the
218 * selectivity function punted. This is not airtight but should
221 if (rqlist->hibound == DEFAULT_INEQ_SEL ||
222 rqlist->lobound == DEFAULT_INEQ_SEL)
224 s2 = DEFAULT_RANGE_INEQ_SEL;
228 s2 = rqlist->hibound + rqlist->lobound - 1.0;
230 /* Adjust for double-exclusion of NULLs */
231 s2 += nulltestsel(root, IS_NULL, rqlist->var,
232 varRelid, jointype, sjinfo);
235 * A zero or slightly negative s2 should be converted into a
236 * small positive value; we probably are dealing with a very
237 * tight range and got a bogus result due to roundoff errors.
238 * However, if s2 is very negative, then we probably have
239 * default selectivity estimates on one or both sides of the
240 * range that we failed to recognize above for some reason.
247 * No data available --- use a default estimate that
248 * is small, but not real small.
250 s2 = DEFAULT_RANGE_INEQ_SEL;
255 * It's just roundoff error; use a small positive
262 /* Merge in the selectivity of the pair of clauses */
267 /* Only found one of a pair, merge it in generically */
268 if (rqlist->have_lobound)
269 s1 *= rqlist->lobound;
271 s1 *= rqlist->hibound;
273 /* release storage and advance */
274 rqnext = rqlist->next;
283 * addRangeClause --- add a new range clause for clauselist_selectivity
285 * Here is where we try to match up pairs of range-query clauses
288 addRangeClause(RangeQueryClause **rqlist, Node *clause,
289 bool varonleft, bool isLTsel, Selectivity s2)
291 RangeQueryClause *rqelem;
297 var = get_leftop((Expr *) clause);
298 is_lobound = !isLTsel; /* x < something is high bound */
302 var = get_rightop((Expr *) clause);
303 is_lobound = isLTsel; /* something < x is low bound */
306 for (rqelem = *rqlist; rqelem; rqelem = rqelem->next)
309 * We use full equal() here because the "var" might be a function of
310 * one or more attributes of the same relation...
312 if (!equal(var, rqelem->var))
314 /* Found the right group to put this clause in */
317 if (!rqelem->have_lobound)
319 rqelem->have_lobound = true;
320 rqelem->lobound = s2;
326 * We have found two similar clauses, such as
328 * Keep only the more restrictive one.
331 if (rqelem->lobound > s2)
332 rqelem->lobound = s2;
337 if (!rqelem->have_hibound)
339 rqelem->have_hibound = true;
340 rqelem->hibound = s2;
346 * We have found two similar clauses, such as
348 * Keep only the more restrictive one.
351 if (rqelem->hibound > s2)
352 rqelem->hibound = s2;
358 /* No matching var found, so make a new clause-pair data structure */
359 rqelem = (RangeQueryClause *) palloc(sizeof(RangeQueryClause));
363 rqelem->have_lobound = true;
364 rqelem->have_hibound = false;
365 rqelem->lobound = s2;
369 rqelem->have_lobound = false;
370 rqelem->have_hibound = true;
371 rqelem->hibound = s2;
373 rqelem->next = *rqlist;
378 * bms_is_subset_singleton
380 * Same result as bms_is_subset(s, bms_make_singleton(x)),
381 * but a little faster and doesn't leak memory.
383 * Is this of use anywhere else? If so move to bitmapset.c ...
386 bms_is_subset_singleton(const Bitmapset *s, int x)
388 switch (bms_membership(s))
393 return bms_is_member(x, s);
397 /* can't get here... */
403 * clause_selectivity -
404 * Compute the selectivity of a general boolean expression clause.
406 * The clause can be either a RestrictInfo or a plain expression. If it's
407 * a RestrictInfo, we try to cache the selectivity for possible re-use,
408 * so passing RestrictInfos is preferred.
410 * varRelid is either 0 or a rangetable index.
412 * When varRelid is not 0, only variables belonging to that relation are
413 * considered in computing selectivity; other vars are treated as constants
414 * of unknown values. This is appropriate for estimating the selectivity of
415 * a join clause that is being used as a restriction clause in a scan of a
416 * nestloop join's inner relation --- varRelid should then be the ID of the
419 * When varRelid is 0, all variables are treated as variables. This
420 * is appropriate for ordinary join clauses and restriction clauses.
422 * jointype is the join type, if the clause is a join clause. Pass JOIN_INNER
423 * if the clause isn't a join clause.
425 * sjinfo is NULL for a non-join clause, otherwise it provides additional
426 * context information about the join being performed. There are some
428 * 1. For a special (not INNER) join, sjinfo is always a member of
429 * root->join_info_list.
430 * 2. For an INNER join, sjinfo is just a transient struct, and only the
431 * relids and jointype fields in it can be trusted.
432 * 3. XXX sjinfo might be NULL even though it really is a join. This case
433 * will go away soon, but fixing it requires API changes for oprjoin and
434 * amcostestimate functions.
435 * It is possible for jointype to be different from sjinfo->jointype.
436 * This indicates we are considering a variant join: either with
437 * the LHS and RHS switched, or with one input unique-ified.
439 * Note: when passing nonzero varRelid, it's normally appropriate to set
440 * jointype == JOIN_INNER, sjinfo == NULL, even if the clause is really a
441 * join clause; because we aren't treating it as a join clause.
444 clause_selectivity(PlannerInfo *root,
448 SpecialJoinInfo *sjinfo)
450 Selectivity s1 = 0.5; /* default for any unhandled clause type */
451 RestrictInfo *rinfo = NULL;
452 bool cacheable = false;
454 if (clause == NULL) /* can this still happen? */
457 if (IsA(clause, RestrictInfo))
459 rinfo = (RestrictInfo *) clause;
462 * If the clause is marked pseudoconstant, then it will be used as a
463 * gating qual and should not affect selectivity estimates; hence
464 * return 1.0. The only exception is that a constant FALSE may be
465 * taken as having selectivity 0.0, since it will surely mean no rows
466 * out of the plan. This case is simple enough that we need not
467 * bother caching the result.
469 if (rinfo->pseudoconstant)
471 if (!IsA(rinfo->clause, Const))
472 return (Selectivity) 1.0;
476 * If possible, cache the result of the selectivity calculation for
477 * the clause. We can cache if varRelid is zero or the clause
478 * contains only vars of that relid --- otherwise varRelid will affect
479 * the result, so mustn't cache.
482 bms_is_subset_singleton(rinfo->clause_relids, varRelid))
484 /* Cacheable --- do we already have the result? */
485 if (rinfo->this_selec >= 0)
486 return rinfo->this_selec;
491 * Proceed with examination of contained clause. If the clause is an
492 * OR-clause, we want to look at the variant with sub-RestrictInfos,
493 * so that per-subclause selectivities can be cached.
496 clause = (Node *) rinfo->orclause;
498 clause = (Node *) rinfo->clause;
501 if (IsA(clause, Var))
503 Var *var = (Var *) clause;
506 * We probably shouldn't ever see an uplevel Var here, but if we do,
507 * return the default selectivity...
509 if (var->varlevelsup == 0 &&
510 (varRelid == 0 || varRelid == (int) var->varno))
512 RangeTblEntry *rte = planner_rt_fetch(var->varno, root);
514 if (rte->rtekind == RTE_SUBQUERY)
517 * XXX not smart about subquery references... any way to do
518 * better than default?
524 * A Var at the top of a clause must be a bool Var. This is
525 * equivalent to the clause reln.attribute = 't', so we
526 * compute the selectivity as if that is what we have.
528 s1 = restriction_selectivity(root,
529 BooleanEqualOperator,
537 else if (IsA(clause, Const))
539 /* bool constant is pretty easy... */
540 Const *con = (Const *) clause;
542 s1 = con->constisnull ? 0.0 :
543 DatumGetBool(con->constvalue) ? 1.0 : 0.0;
545 else if (IsA(clause, Param))
547 /* see if we can replace the Param */
548 Node *subst = estimate_expression_value(root, clause);
550 if (IsA(subst, Const))
552 /* bool constant is pretty easy... */
553 Const *con = (Const *) subst;
555 s1 = con->constisnull ? 0.0 :
556 DatumGetBool(con->constvalue) ? 1.0 : 0.0;
560 /* XXX any way to do better than default? */
563 else if (not_clause(clause))
565 /* inverse of the selectivity of the underlying clause */
566 s1 = 1.0 - clause_selectivity(root,
567 (Node *) get_notclausearg((Expr *) clause),
572 else if (and_clause(clause))
574 /* share code with clauselist_selectivity() */
575 s1 = clauselist_selectivity(root,
576 ((BoolExpr *) clause)->args,
581 else if (or_clause(clause))
584 * Selectivities for an OR clause are computed as s1+s2 - s1*s2 to
585 * account for the probable overlap of selected tuple sets.
587 * XXX is this too conservative?
592 foreach(arg, ((BoolExpr *) clause)->args)
594 Selectivity s2 = clause_selectivity(root,
595 (Node *) lfirst(arg),
600 s1 = s1 + s2 - s1 * s2;
603 else if (is_opclause(clause) || IsA(clause, DistinctExpr))
605 Oid opno = ((OpExpr *) clause)->opno;
611 * If we are considering a nestloop join then all clauses are
612 * restriction clauses, since we are only interested in the one
615 is_join_clause = false;
620 * Otherwise, it's a join if there's more than one relation used.
621 * We can optimize this calculation if an rinfo was passed.
624 is_join_clause = (bms_membership(rinfo->clause_relids) ==
627 is_join_clause = (NumRelids(clause) > 1);
632 /* Estimate selectivity for a join clause. */
633 s1 = join_selectivity(root, opno,
634 ((OpExpr *) clause)->args,
639 /* Estimate selectivity for a restriction clause. */
640 s1 = restriction_selectivity(root, opno,
641 ((OpExpr *) clause)->args,
646 * DistinctExpr has the same representation as OpExpr, but the
647 * contained operator is "=" not "<>", so we must negate the result.
648 * This estimation method doesn't give the right behavior for nulls,
649 * but it's better than doing nothing.
651 if (IsA(clause, DistinctExpr))
654 else if (is_funcclause(clause))
657 * This is not an operator, so we guess at the selectivity. THIS IS A
658 * HACK TO GET V4 OUT THE DOOR. FUNCS SHOULD BE ABLE TO HAVE
659 * SELECTIVITIES THEMSELVES. -- JMH 7/9/92
661 s1 = (Selectivity) 0.3333333;
664 else if (is_subplan(clause))
667 * Just for the moment! FIX ME! - vadim 02/04/98
669 s1 = (Selectivity) 0.5;
672 else if (IsA(clause, ScalarArrayOpExpr))
674 /* First, decide if it's a join clause, same as for OpExpr */
680 * If we are considering a nestloop join then all clauses are
681 * restriction clauses, since we are only interested in the one
684 is_join_clause = false;
689 * Otherwise, it's a join if there's more than one relation used.
690 * We can optimize this calculation if an rinfo was passed.
693 is_join_clause = (bms_membership(rinfo->clause_relids) ==
696 is_join_clause = (NumRelids(clause) > 1);
699 /* Use node specific selectivity calculation function */
700 s1 = scalararraysel(root,
701 (ScalarArrayOpExpr *) clause,
707 else if (IsA(clause, RowCompareExpr))
709 /* Use node specific selectivity calculation function */
710 s1 = rowcomparesel(root,
711 (RowCompareExpr *) clause,
716 else if (IsA(clause, NullTest))
718 /* Use node specific selectivity calculation function */
719 s1 = nulltestsel(root,
720 ((NullTest *) clause)->nulltesttype,
721 (Node *) ((NullTest *) clause)->arg,
726 else if (IsA(clause, BooleanTest))
728 /* Use node specific selectivity calculation function */
729 s1 = booltestsel(root,
730 ((BooleanTest *) clause)->booltesttype,
731 (Node *) ((BooleanTest *) clause)->arg,
736 else if (IsA(clause, CurrentOfExpr))
738 /* CURRENT OF selects at most one row of its table */
739 CurrentOfExpr *cexpr = (CurrentOfExpr *) clause;
740 RelOptInfo *crel = find_base_rel(root, cexpr->cvarno);
742 if (crel->tuples > 0)
743 s1 = 1.0 / crel->tuples;
745 else if (IsA(clause, RelabelType))
747 /* Not sure this case is needed, but it can't hurt */
748 s1 = clause_selectivity(root,
749 (Node *) ((RelabelType *) clause)->arg,
754 else if (IsA(clause, CoerceToDomain))
756 /* Not sure this case is needed, but it can't hurt */
757 s1 = clause_selectivity(root,
758 (Node *) ((CoerceToDomain *) clause)->arg,
764 /* Cache the result if possible */
766 rinfo->this_selec = s1;
768 #ifdef SELECTIVITY_DEBUG
769 elog(DEBUG4, "clause_selectivity: s1 %f", s1);
770 #endif /* SELECTIVITY_DEBUG */