*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.145 2008/08/07 01:11:50 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.146 2008/08/14 18:47:59 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "catalog/pg_operator.h"
#include "executor/executor.h"
#include "miscadmin.h"
+#include "optimizer/clauses.h"
#include "optimizer/cost.h"
#include "optimizer/pathnode.h"
#include "optimizer/paths.h"
#include "optimizer/tlist.h"
+#include "optimizer/var.h"
#include "parser/parse_expr.h"
#include "parser/parsetree.h"
#include "utils/selfuncs.h"
static List *translate_sub_tlist(List *tlist, int relid);
static bool query_is_distinct_for(Query *query, List *colnos, List *opids);
static Oid distinct_col_search(int colno, List *colnos, List *opids);
-static bool hash_safe_operators(List *opids);
/*****************************************************************************
* into different lists, it should be sufficient to use pointer
* comparison to remove duplicates.)
*
- * Always assume the join type is JOIN_INNER; even if some of the join
- * clauses come from other contexts, that's not our problem.
+ * Note that we force the clauses to be treated as non-join clauses
+ * during selectivity estimation.
*/
allclauses = list_union_ptr(rel->baserestrictinfo, allclauses);
pathnode->rows = rel->tuples *
clauselist_selectivity(root,
allclauses,
rel->relid, /* do not use 0! */
- JOIN_INNER);
+ JOIN_INNER,
+ NULL);
/* Like costsize.c, force estimate to be at least one row */
pathnode->rows = clamp_row_est(pathnode->rows);
}
/*
* create_unique_path
* Creates a path representing elimination of distinct rows from the
- * input data.
+ * input data. Distinct-ness is defined according to the needs of the
+ * semijoin represented by sjinfo. If it is not possible to identify
+ * how to make the data unique, NULL is returned.
*
* If used at all, this is likely to be called repeatedly on the same rel;
* and the input subpath should always be the same (the cheapest_total path
* for the rel). So we cache the result.
*/
UniquePath *
-create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath)
+create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath,
+ SpecialJoinInfo *sjinfo)
{
UniquePath *pathnode;
Path sort_path; /* dummy for result of cost_sort */
Path agg_path; /* dummy for result of cost_agg */
MemoryContext oldcontext;
- List *sub_targetlist;
List *in_operators;
- ListCell *l;
+ List *uniq_exprs;
+ bool all_btree;
+ bool all_hash;
int numCols;
+ ListCell *lc;
- /* Caller made a mistake if subpath isn't cheapest_total */
+ /* Caller made a mistake if subpath isn't cheapest_total ... */
Assert(subpath == rel->cheapest_total_path);
+ /* ... or if SpecialJoinInfo is the wrong one */
+ Assert(sjinfo->jointype == JOIN_SEMI);
+ Assert(bms_equal(rel->relids, sjinfo->syn_righthand));
/* If result already cached, return it */
if (rel->cheapest_unique_path)
return (UniquePath *) rel->cheapest_unique_path;
+ /* If we previously failed, return NULL quickly */
+ if (sjinfo->join_quals == NIL)
+ return NULL;
+
/*
- * We must ensure path struct is allocated in main planning context;
- * otherwise GEQO memory management causes trouble. (Compare
- * best_inner_indexscan().)
+ * We must ensure path struct and subsidiary data are allocated in main
+ * planning context; otherwise GEQO memory management causes trouble.
+ * (Compare best_inner_indexscan().)
*/
oldcontext = MemoryContextSwitchTo(root->planner_cxt);
- pathnode = makeNode(UniquePath);
+ /*
+ * Look to see whether the semijoin's join quals consist of AND'ed
+ * equality operators, with (only) RHS variables on one side of
+ * each one. If so, we can figure out how to enforce uniqueness for
+ * the RHS.
+ *
+ * Note that the in_operators list consists of the joinqual operators
+ * themselves (but commuted if needed to put the RHS value on the right).
+ * These could be cross-type operators, in which case the operator
+ * actually needed for uniqueness is a related single-type operator.
+ * We assume here that that operator will be available from the btree
+ * or hash opclass when the time comes ... if not, create_unique_plan()
+ * will fail.
+ */
+ in_operators = NIL;
+ uniq_exprs = NIL;
+ all_btree = true;
+ all_hash = enable_hashagg; /* don't consider hash if not enabled */
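+ /* (either flag may be cleared below as we examine each joinqual) */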
+ foreach(lc, sjinfo->join_quals)
+ {
+ OpExpr *op = (OpExpr *) lfirst(lc);
+ Oid opno;
+ Node *left_expr;
+ Node *right_expr;
+ Relids left_varnos;
+ Relids right_varnos;
+
+ /* must be binary opclause... */
+ if (!IsA(op, OpExpr))
+ goto no_unique_path;
+ if (list_length(op->args) != 2)
+ goto no_unique_path;
+ opno = op->opno;
+ left_expr = linitial(op->args);
+ right_expr = lsecond(op->args);
+
+ /* check rel membership of arguments */
+ left_varnos = pull_varnos(left_expr);
+ right_varnos = pull_varnos(right_expr);
+ if (!bms_is_empty(right_varnos) &&
+ bms_is_subset(right_varnos, sjinfo->syn_righthand) &&
+ !bms_overlap(left_varnos, sjinfo->syn_righthand))
+ {
+ /* typical case, right_expr is RHS variable */
+ }
+ else if (!bms_is_empty(left_varnos) &&
+ bms_is_subset(left_varnos, sjinfo->syn_righthand) &&
+ !bms_overlap(right_varnos, sjinfo->syn_righthand))
+ {
+ /* flipped case, left_expr is RHS variable */
+ opno = get_commutator(opno);
+ if (!OidIsValid(opno))
+ goto no_unique_path;
+ right_expr = left_expr;
+ }
+ else
+ goto no_unique_path;
- /* There is no substructure to allocate, so can switch back right away */
- MemoryContextSwitchTo(oldcontext);
+ /* all operators must be btree equality or hash equality */
+ if (all_btree)
+ {
+ /* oprcanmerge is considered a hint... */
+ if (!op_mergejoinable(opno) ||
+ get_mergejoin_opfamilies(opno) == NIL)
+ all_btree = false;
+ }
+ if (all_hash)
+ {
+ /* ... but oprcanhash had better be correct */
+ if (!op_hashjoinable(opno))
+ all_hash = false;
+ }
+ if (!(all_btree || all_hash))
+ goto no_unique_path;
+
+ /* so far so good, keep building lists */
+ in_operators = lappend_oid(in_operators, opno);
+ uniq_exprs = lappend(uniq_exprs, copyObject(right_expr));
+ }
+
+ /*
+ * The expressions we'd need to unique-ify mustn't be volatile.
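+ * (For example, an expression calling random() could yield a different
+ * value at each evaluation, so "uniqueness" over it is ill-defined.)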
+ */
+ if (contain_volatile_functions((Node *) uniq_exprs))
+ goto no_unique_path;
+
+ /*
+ * If we get here, we can unique-ify using at least one of sorting
+ * and hashing. Start building the result Path object.
+ */
+ pathnode = makeNode(UniquePath);
pathnode->path.pathtype = T_Unique;
pathnode->path.parent = rel;
pathnode->path.pathkeys = NIL;
pathnode->subpath = subpath;
-
- /*
- * Try to identify the targetlist that will actually be unique-ified. In
- * current usage, this routine is only used for sub-selects of IN clauses,
- * so we should be able to find the tlist in in_info_list. Get the IN
- * clause's operators, too, because they determine what "unique" means.
- */
- sub_targetlist = NIL;
- in_operators = NIL;
- foreach(l, root->in_info_list)
- {
- InClauseInfo *ininfo = (InClauseInfo *) lfirst(l);
-
- if (bms_equal(ininfo->righthand, rel->relids))
- {
- sub_targetlist = ininfo->sub_targetlist;
- in_operators = ininfo->in_operators;
- break;
- }
- }
+ pathnode->in_operators = in_operators;
+ pathnode->uniq_exprs = uniq_exprs;
/*
* If the input is a subquery whose output must be unique already, then we
* don't need to do anything. The test for uniqueness has to consider
* exactly which columns we are extracting; for example "SELECT DISTINCT
* x,y" doesn't guarantee that x alone is distinct. So we cannot check for
- * this optimization unless we found our own targetlist above, and it
- * consists only of simple Vars referencing subquery outputs. (Possibly
- * we could do something with expressions in the subquery outputs, too,
- * but for now keep it simple.)
+ * this optimization unless uniq_exprs consists only of simple Vars
+ * referencing subquery outputs. (Possibly we could do something with
+ * expressions in the subquery outputs, too, but for now keep it simple.)
*/
- if (sub_targetlist && rel->rtekind == RTE_SUBQUERY)
+ if (rel->rtekind == RTE_SUBQUERY)
{
RangeTblEntry *rte = planner_rt_fetch(rel->relid, root);
List *sub_tlist_colnos;
- sub_tlist_colnos = translate_sub_tlist(sub_targetlist, rel->relid);
+ sub_tlist_colnos = translate_sub_tlist(uniq_exprs, rel->relid);
if (sub_tlist_colnos &&
query_is_distinct_for(rte->subquery,
rel->cheapest_unique_path = (Path *) pathnode;
+ MemoryContextSwitchTo(oldcontext);
+
return pathnode;
}
}
- /*
- * If we know the targetlist, try to estimate number of result rows;
- * otherwise punt.
- */
- if (sub_targetlist)
- {
- pathnode->rows = estimate_num_groups(root, sub_targetlist, rel->rows);
- numCols = list_length(sub_targetlist);
- }
- else
- {
- pathnode->rows = rel->rows;
- numCols = list_length(rel->reltargetlist);
- }
+ /* Estimate number of output rows */
+ pathnode->rows = estimate_num_groups(root, uniq_exprs, rel->rows);
+ numCols = list_length(uniq_exprs);
- /*
- * Estimate cost for sort+unique implementation
- */
- cost_sort(&sort_path, root, NIL,
- subpath->total_cost,
- rel->rows,
- rel->width,
- -1.0);
+ if (all_btree)
+ {
+ /*
+ * Estimate cost for sort+unique implementation
+ */
+ cost_sort(&sort_path, root, NIL,
+ subpath->total_cost,
+ rel->rows,
+ rel->width,
+ -1.0);
- /*
- * Charge one cpu_operator_cost per comparison per input tuple. We assume
- * all columns get compared at most of the tuples. (XXX probably this is
- * an overestimate.) This should agree with make_unique.
- */
- sort_path.total_cost += cpu_operator_cost * rel->rows * numCols;
+ /*
+ * Charge one cpu_operator_cost per comparison per input tuple.
+ * We assume all columns get compared for most of the tuples. (XXX
+ * probably this is an overestimate.) This should agree with
+ * make_unique.
+ */
+ sort_path.total_cost += cpu_operator_cost * rel->rows * numCols;
+ }
- /*
- * Is it safe to use a hashed implementation? If so, estimate and compare
- * costs. We only try this if we know the IN operators, else we can't
- * check their hashability.
- */
- pathnode->umethod = UNIQUE_PATH_SORT;
- if (enable_hashagg && in_operators && hash_safe_operators(in_operators))
+ if (all_hash)
{
/*
* Estimate the overhead per hashtable entry at 64 bytes (same as in
* planner.c).
*/
int hashentrysize = rel->width + 64;
- if (hashentrysize * pathnode->rows <= work_mem * 1024L)
- {
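+ /* work_mem is measured in kilobytes, hence the 1024 multiplier */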
+ if (hashentrysize * pathnode->rows > work_mem * 1024L)
+ all_hash = false; /* don't try to hash */
+ else
cost_agg(&agg_path, root,
AGG_HASHED, 0,
numCols, pathnode->rows,
subpath->startup_cost,
subpath->total_cost,
rel->rows);
- if (agg_path.total_cost < sort_path.total_cost)
- pathnode->umethod = UNIQUE_PATH_HASH;
- }
}
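+ /* Pick whichever feasible method is cheaper (or the only one left) */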
+ if (all_btree && all_hash)
+ {
+ if (agg_path.total_cost < sort_path.total_cost)
+ pathnode->umethod = UNIQUE_PATH_HASH;
+ else
+ pathnode->umethod = UNIQUE_PATH_SORT;
+ }
+ else if (all_btree)
+ pathnode->umethod = UNIQUE_PATH_SORT;
+ else if (all_hash)
+ pathnode->umethod = UNIQUE_PATH_HASH;
+ else
+ goto no_unique_path;
+
if (pathnode->umethod == UNIQUE_PATH_HASH)
{
pathnode->path.startup_cost = agg_path.startup_cost;
rel->cheapest_unique_path = (Path *) pathnode;
+ MemoryContextSwitchTo(oldcontext);
+
return pathnode;
+
+no_unique_path: /* failure exit */
+
+ /* Mark the SpecialJoinInfo as not unique-able */
+ sjinfo->join_quals = NIL;
+
+ MemoryContextSwitchTo(oldcontext);
+
+ return NULL;
}
/*
return InvalidOid;
}
-/*
- * hash_safe_operators - can all the specified IN operators be hashed?
- *
- * We assume hashed aggregation will work if each IN operator is marked
- * hashjoinable. If the IN operators are cross-type, this could conceivably
- * fail: the aggregation will need a hashable equality operator for the RHS
- * datatype --- but it's pretty hard to conceive of a hash opfamily that has
- * cross-type hashing without support for hashing the individual types, so
- * we don't expend cycles here to support the case. We could check
- * get_compatible_hash_operator() instead of just op_hashjoinable(), but the
- * former is a significantly more expensive test.
- */
-static bool
-hash_safe_operators(List *opids)
-{
- ListCell *lc;
-
- foreach(lc, opids)
- {
- if (!op_hashjoinable(lfirst_oid(lc)))
- return false;
- }
- return true;
-}
-
/*
* create_subqueryscan_path
* Creates a path corresponding to a sequential scan of a subquery,
*
* 'joinrel' is the join relation.
* 'jointype' is the type of join required
+ * 'sjinfo' is extra info about the join for selectivity estimation
* 'outer_path' is the outer path
* 'inner_path' is the inner path
* 'restrict_clauses' are the RestrictInfo nodes to apply at the join
create_nestloop_path(PlannerInfo *root,
RelOptInfo *joinrel,
JoinType jointype,
+ SpecialJoinInfo *sjinfo,
Path *outer_path,
Path *inner_path,
List *restrict_clauses,
pathnode->joinrestrictinfo = restrict_clauses;
pathnode->path.pathkeys = pathkeys;
- cost_nestloop(pathnode, root);
+ cost_nestloop(pathnode, root, sjinfo);
return pathnode;
}
*
* 'joinrel' is the join relation
* 'jointype' is the type of join required
+ * 'sjinfo' is extra info about the join for selectivity estimation
* 'outer_path' is the outer path
* 'inner_path' is the inner path
* 'restrict_clauses' are the RestrictInfo nodes to apply at the join
create_mergejoin_path(PlannerInfo *root,
RelOptInfo *joinrel,
JoinType jointype,
+ SpecialJoinInfo *sjinfo,
Path *outer_path,
Path *inner_path,
List *restrict_clauses,
pathnode->outersortkeys = outersortkeys;
pathnode->innersortkeys = innersortkeys;
- cost_mergejoin(pathnode, root);
+ cost_mergejoin(pathnode, root, sjinfo);
return pathnode;
}
*
* 'joinrel' is the join relation
* 'jointype' is the type of join required
+ * 'sjinfo' is extra info about the join for selectivity estimation
* 'outer_path' is the cheapest outer path
* 'inner_path' is the cheapest inner path
* 'restrict_clauses' are the RestrictInfo nodes to apply at the join
create_hashjoin_path(PlannerInfo *root,
RelOptInfo *joinrel,
JoinType jointype,
+ SpecialJoinInfo *sjinfo,
Path *outer_path,
Path *inner_path,
List *restrict_clauses,
pathnode->jpath.path.pathkeys = NIL;
pathnode->path_hashclauses = hashclauses;
- cost_hashjoin(pathnode, root);
+ cost_hashjoin(pathnode, root, sjinfo);
return pathnode;
}