Implement SEMI and ANTI joins in the planner and executor. (Semijoins replace
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index 655443e9efe03872fa51249005dad923d141e4ba..5996af2b5d548bcbf53589e73fab88e60254d459 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.145 2008/08/07 01:11:50 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.146 2008/08/14 18:47:59 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include "catalog/pg_operator.h"
 #include "executor/executor.h"
 #include "miscadmin.h"
+#include "optimizer/clauses.h"
 #include "optimizer/cost.h"
 #include "optimizer/pathnode.h"
 #include "optimizer/paths.h"
 #include "optimizer/tlist.h"
+#include "optimizer/var.h"
 #include "parser/parse_expr.h"
 #include "parser/parsetree.h"
 #include "utils/selfuncs.h"
@@ -33,7 +35,6 @@
 static List *translate_sub_tlist(List *tlist, int relid);
 static bool query_is_distinct_for(Query *query, List *colnos, List *opids);
 static Oid     distinct_col_search(int colno, List *colnos, List *opids);
-static bool hash_safe_operators(List *opids);
 
 
 /*****************************************************************************
@@ -481,15 +482,16 @@ create_index_path(PlannerInfo *root,
                 * into different lists, it should be sufficient to use pointer
                 * comparison to remove duplicates.)
                 *
-                * Always assume the join type is JOIN_INNER; even if some of the join
-                * clauses come from other contexts, that's not our problem.
+                * Note that we force the clauses to be treated as non-join clauses
+                * during selectivity estimation.
                 */
                allclauses = list_union_ptr(rel->baserestrictinfo, allclauses);
                pathnode->rows = rel->tuples *
                        clauselist_selectivity(root,
                                                                   allclauses,
                                                                   rel->relid,  /* do not use 0! */
-                                                                  JOIN_INNER);
+                                                                  JOIN_INNER,
+                                                                  NULL);
                /* Like costsize.c, force estimate to be at least one row */
                pathnode->rows = clamp_row_est(pathnode->rows);
        }
@@ -719,42 +721,141 @@ create_material_path(RelOptInfo *rel, Path *subpath)
 /*
  * create_unique_path
  *       Creates a path representing elimination of distinct rows from the
- *       input data.
+ *       input data.  Distinct-ness is defined according to the needs of the
+ *       semijoin represented by sjinfo.  If it is not possible to identify
+ *       how to make the data unique, NULL is returned.
  *
  * If used at all, this is likely to be called repeatedly on the same rel;
  * and the input subpath should always be the same (the cheapest_total path
  * for the rel).  So we cache the result.
  */
 UniquePath *
-create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath)
+create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath,
+                                  SpecialJoinInfo *sjinfo)
 {
        UniquePath *pathnode;
        Path            sort_path;              /* dummy for result of cost_sort */
        Path            agg_path;               /* dummy for result of cost_agg */
        MemoryContext oldcontext;
-       List       *sub_targetlist;
        List       *in_operators;
-       ListCell   *l;
+       List       *uniq_exprs;
+       bool            all_btree;
+       bool            all_hash;
        int                     numCols;
+       ListCell   *lc;
 
-       /* Caller made a mistake if subpath isn't cheapest_total */
+       /* Caller made a mistake if subpath isn't cheapest_total ... */
        Assert(subpath == rel->cheapest_total_path);
+       /* ... or if SpecialJoinInfo is the wrong one */
+       Assert(sjinfo->jointype == JOIN_SEMI);
+       Assert(bms_equal(rel->relids, sjinfo->syn_righthand));
 
        /* If result already cached, return it */
        if (rel->cheapest_unique_path)
                return (UniquePath *) rel->cheapest_unique_path;
 
+       /* If we previously failed, return NULL quickly */
+       if (sjinfo->join_quals == NIL)
+               return NULL;
+
        /*
-        * We must ensure path struct is allocated in main planning context;
-        * otherwise GEQO memory management causes trouble.  (Compare
-        * best_inner_indexscan().)
+        * We must ensure path struct and subsidiary data are allocated in main
+        * planning context; otherwise GEQO memory management causes trouble.
+        * (Compare best_inner_indexscan().)
         */
        oldcontext = MemoryContextSwitchTo(root->planner_cxt);
 
-       pathnode = makeNode(UniquePath);
+       /*
+        * Look to see whether the semijoin's join quals consist of AND'ed
+        * equality operators, with (only) RHS variables on only one side of
+        * each one.  If so, we can figure out how to enforce uniqueness for
+        * the RHS.
+        *
+        * Note that the in_operators list consists of the joinqual operators
+        * themselves (but commuted if needed to put the RHS value on the right).
+        * These could be cross-type operators, in which case the operator
+        * actually needed for uniqueness is a related single-type operator.
+        * We assume here that that operator will be available from the btree
+        * or hash opclass when the time comes ... if not, create_unique_plan()
+        * will fail.
+        */
+       in_operators = NIL;
+       uniq_exprs = NIL;
+       all_btree = true;
+       all_hash = enable_hashagg;              /* don't consider hash if not enabled */
+       foreach(lc, sjinfo->join_quals)
+       {
+               OpExpr     *op = (OpExpr *) lfirst(lc);
+               Oid                     opno;
+               Node       *left_expr;
+               Node       *right_expr;
+               Relids          left_varnos;
+               Relids          right_varnos;
+
+               /* must be binary opclause... */
+               if (!IsA(op, OpExpr))
+                       goto no_unique_path;
+               if (list_length(op->args) != 2)
+                       goto no_unique_path;
+               opno = op->opno;
+               left_expr = linitial(op->args);
+               right_expr = lsecond(op->args);
+
+               /* check rel membership of arguments */
+               left_varnos = pull_varnos(left_expr);
+               right_varnos = pull_varnos(right_expr);
+               if (!bms_is_empty(right_varnos) &&
+                       bms_is_subset(right_varnos, sjinfo->syn_righthand) &&
+                       !bms_overlap(left_varnos, sjinfo->syn_righthand))
+               {
+                       /* typical case, right_expr is RHS variable */
+               }
+               else if (!bms_is_empty(left_varnos) &&
+                                bms_is_subset(left_varnos, sjinfo->syn_righthand) &&
+                                !bms_overlap(right_varnos, sjinfo->syn_righthand))
+               {
+                       /* flipped case, left_expr is RHS variable */
+                       opno = get_commutator(opno);
+                       if (!OidIsValid(opno))
+                               goto no_unique_path;
+                       right_expr = left_expr;
+               }
+               else
+                       goto no_unique_path;
 
-       /* There is no substructure to allocate, so can switch back right away */
-       MemoryContextSwitchTo(oldcontext);
+               /* all operators must be btree equality or hash equality */
+               if (all_btree)
+               {
+                       /* oprcanmerge is considered a hint... */
+                       if (!op_mergejoinable(opno) ||
+                               get_mergejoin_opfamilies(opno) == NIL)
+                               all_btree = false;
+               }
+               if (all_hash)
+               {
+                       /* ... but oprcanhash had better be correct */
+                       if (!op_hashjoinable(opno))
+                               all_hash = false;
+               }
+               if (!(all_btree || all_hash))
+                       goto no_unique_path;
+
+               /* so far so good, keep building lists */
+               in_operators = lappend_oid(in_operators, opno);
+               uniq_exprs = lappend(uniq_exprs, copyObject(right_expr));
+       }
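
The loop above boils down to a membership test: a joinqual is usable only when one side of the operator mentions some RHS relation and nothing outside the RHS, while the other side touches no RHS relation at all; when the RHS expression sits on the left, the operator has to be commuted. Below is a minimal standalone sketch of that classification rule, not part of the patch, using plain bitmasks in place of Relids; the type, function, and relation numbers are all invented for illustration.

/* Toy model of the side-classification above; bitmasks stand in for Relids. */
#include <stdbool.h>
#include <stdio.h>

typedef struct ToyOpClause
{
	const char *opname;			/* e.g. "=" */
	unsigned	left_varnos;	/* bitmask of rels referenced on the left */
	unsigned	right_varnos;	/* bitmask of rels referenced on the right */
} ToyOpClause;

/*
 * Returns 1 if the RHS expression is already on the right, -1 if it is on
 * the left (so the caller would need get_commutator()), 0 if the clause
 * cannot be used for unique-ification at all.
 */
static int
classify_clause(const ToyOpClause *c, unsigned rhs_rels)
{
	bool		right_is_rhs = c->right_varnos != 0 &&
		(c->right_varnos & ~rhs_rels) == 0 &&
		(c->left_varnos & rhs_rels) == 0;
	bool		left_is_rhs = c->left_varnos != 0 &&
		(c->left_varnos & ~rhs_rels) == 0 &&
		(c->right_varnos & rhs_rels) == 0;

	if (right_is_rhs)
		return 1;				/* typical case */
	if (left_is_rhs)
		return -1;				/* flipped case: commute the operator */
	return 0;					/* mixed or RHS-free clause: give up */
}

int
main(void)
{
	unsigned	rhs = 1u << 2;	/* pretend relation 2 is the semijoin's RHS */
	ToyOpClause a = {"=", 1u << 1, 1u << 2};	/* outer.x = sub.y */
	ToyOpClause b = {"=", 1u << 2, 1u << 1};	/* sub.y = outer.x */
	ToyOpClause c = {"=", (1u << 1) | (1u << 2), 0};	/* (outer.x + sub.y) = 42 */

	printf("%d %d %d\n",		/* prints: 1 -1 0 */
		   classify_clause(&a, rhs),
		   classify_clause(&b, rhs),
		   classify_clause(&c, rhs));
	return 0;
}

In the patch, the flipped case is where get_commutator() is consulted; a missing commutator, an operator that is neither mergejoinable nor hashable, or a volatile uniq_expr all end up at the same no_unique_path exit.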
+
+       /*
+        * The expressions we'd need to unique-ify mustn't be volatile.
+        */
+       if (contain_volatile_functions((Node *) uniq_exprs))
+               goto no_unique_path;
+
+       /*
+        * If we get here, we can unique-ify using at least one of sorting
+        * and hashing.  Start building the result Path object.
+        */
+       pathnode = makeNode(UniquePath);
 
        pathnode->path.pathtype = T_Unique;
        pathnode->path.parent = rel;
@@ -766,43 +867,24 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath)
        pathnode->path.pathkeys = NIL;
 
        pathnode->subpath = subpath;
-
-       /*
-        * Try to identify the targetlist that will actually be unique-ified. In
-        * current usage, this routine is only used for sub-selects of IN clauses,
-        * so we should be able to find the tlist in in_info_list.      Get the IN
-        * clause's operators, too, because they determine what "unique" means.
-        */
-       sub_targetlist = NIL;
-       in_operators = NIL;
-       foreach(l, root->in_info_list)
-       {
-               InClauseInfo *ininfo = (InClauseInfo *) lfirst(l);
-
-               if (bms_equal(ininfo->righthand, rel->relids))
-               {
-                       sub_targetlist = ininfo->sub_targetlist;
-                       in_operators = ininfo->in_operators;
-                       break;
-               }
-       }
+       pathnode->in_operators = in_operators;
+       pathnode->uniq_exprs = uniq_exprs;
 
        /*
         * If the input is a subquery whose output must be unique already, then we
         * don't need to do anything.  The test for uniqueness has to consider
         * exactly which columns we are extracting; for example "SELECT DISTINCT
         * x,y" doesn't guarantee that x alone is distinct. So we cannot check for
-        * this optimization unless we found our own targetlist above, and it
-        * consists only of simple Vars referencing subquery outputs.  (Possibly
-        * we could do something with expressions in the subquery outputs, too,
-        * but for now keep it simple.)
+        * this optimization unless uniq_exprs consists only of simple Vars
+        * referencing subquery outputs.  (Possibly we could do something with
+        * expressions in the subquery outputs, too, but for now keep it simple.)
         */
-       if (sub_targetlist && rel->rtekind == RTE_SUBQUERY)
+       if (rel->rtekind == RTE_SUBQUERY)
        {
                RangeTblEntry *rte = planner_rt_fetch(rel->relid, root);
                List       *sub_tlist_colnos;
 
-               sub_tlist_colnos = translate_sub_tlist(sub_targetlist, rel->relid);
+               sub_tlist_colnos = translate_sub_tlist(uniq_exprs, rel->relid);
 
                if (sub_tlist_colnos &&
                        query_is_distinct_for(rte->subquery,
@@ -816,48 +898,37 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath)
 
                        rel->cheapest_unique_path = (Path *) pathnode;
 
+                       MemoryContextSwitchTo(oldcontext);
+
                        return pathnode;
                }
        }
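
The "exactly which columns" point in the comment above is easy to see with concrete data: rows can be pairwise distinct as (x,y) while x alone repeats. A throwaway check, unrelated to the planner code, with made-up values:

/* Toy data: three distinct (x,y) rows, yet x is not unique on its own. */
#include <stdio.h>

int
main(void)
{
	int			xy[3][2] = {{1, 1}, {1, 2}, {2, 1}};
	int			i,
				j,
				x_dups = 0;

	for (i = 0; i < 3; i++)
		for (j = i + 1; j < 3; j++)
			if (xy[i][0] == xy[j][0])
				x_dups++;

	printf("3 distinct (x,y) rows, %d duplicated x pairing(s)\n", x_dups);
	return 0;
}

That is why query_is_distinct_for() is asked about the translated uniq_exprs columns specifically, rather than about the subquery's output as a whole.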
 
-       /*
-        * If we know the targetlist, try to estimate number of result rows;
-        * otherwise punt.
-        */
-       if (sub_targetlist)
-       {
-               pathnode->rows = estimate_num_groups(root, sub_targetlist, rel->rows);
-               numCols = list_length(sub_targetlist);
-       }
-       else
-       {
-               pathnode->rows = rel->rows;
-               numCols = list_length(rel->reltargetlist);
-       }
+       /* Estimate number of output rows */
+       pathnode->rows = estimate_num_groups(root, uniq_exprs, rel->rows);
+       numCols = list_length(uniq_exprs);
 
-       /*
-        * Estimate cost for sort+unique implementation
-        */
-       cost_sort(&sort_path, root, NIL,
-                         subpath->total_cost,
-                         rel->rows,
-                         rel->width,
-                         -1.0);
+       if (all_btree)
+       {
+               /*
+                * Estimate cost for sort+unique implementation
+                */
+               cost_sort(&sort_path, root, NIL,
+                                 subpath->total_cost,
+                                 rel->rows,
+                                 rel->width,
+                                 -1.0);
 
-       /*
-        * Charge one cpu_operator_cost per comparison per input tuple. We assume
-        * all columns get compared at most of the tuples.      (XXX probably this is
-        * an overestimate.)  This should agree with make_unique.
-        */
-       sort_path.total_cost += cpu_operator_cost * rel->rows * numCols;
+               /*
+                * Charge one cpu_operator_cost per comparison per input tuple.
+                * We assume all columns get compared at most of the tuples. (XXX
+                * probably this is an overestimate.)  This should agree with
+                * make_unique.
+                */
+               sort_path.total_cost += cpu_operator_cost * rel->rows * numCols;
+       }
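
The surcharge above is plain arithmetic: cpu_operator_cost * rel->rows * numCols added to the sort cost. A standalone sketch with the stock 0.0025 default for cpu_operator_cost (assumed here) and invented row and column counts, not part of the patch:

/* Toy arithmetic; variable names mirror the patch, values are invented. */
#include <stdio.h>

int
main(void)
{
	double		cpu_operator_cost = 0.0025; /* assumed default GUC value */
	double		rows = 100000.0;	/* hypothetical rel->rows */
	int			numCols = 2;		/* hypothetical list_length(uniq_exprs) */

	/* same adjustment the patch applies to sort_path.total_cost */
	printf("extra comparison cost: %g\n", cpu_operator_cost * rows * numCols);
	return 0;
}

With those numbers the sort-based estimate picks up 500 extra cost units before being weighed against the hashed alternative below.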
 
-       /*
-        * Is it safe to use a hashed implementation?  If so, estimate and compare
-        * costs.  We only try this if we know the IN operators, else we can't
-        * check their hashability.
-        */
-       pathnode->umethod = UNIQUE_PATH_SORT;
-       if (enable_hashagg && in_operators && hash_safe_operators(in_operators))
+       if (all_hash)
        {
                /*
                 * Estimate the overhead per hashtable entry at 64 bytes (same as in
@@ -865,19 +936,31 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath)
                 */
                int                     hashentrysize = rel->width + 64;
 
-               if (hashentrysize * pathnode->rows <= work_mem * 1024L)
-               {
+               if (hashentrysize * pathnode->rows > work_mem * 1024L)
+                       all_hash = false;       /* don't try to hash */
+               else
                        cost_agg(&agg_path, root,
                                         AGG_HASHED, 0,
                                         numCols, pathnode->rows,
                                         subpath->startup_cost,
                                         subpath->total_cost,
                                         rel->rows);
-                       if (agg_path.total_cost < sort_path.total_cost)
-                               pathnode->umethod = UNIQUE_PATH_HASH;
-               }
        }
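
The gate above keeps the hashed plan honest about memory: work_mem is expressed in kilobytes, hence the * 1024L, and each entry is assumed to need rel->width plus 64 bytes of overhead. A standalone sketch of the same test with invented width and row counts, assuming the stock 1024 kB work_mem default; not part of the patch:

/* Toy version of the work_mem gate; numbers are invented. */
#include <stdio.h>

int
main(void)
{
	long		work_mem = 1024;	/* kB; assumed default setting */
	double		rows = 100000.0;	/* hypothetical pathnode->rows */
	int			width = 40;			/* hypothetical rel->width */
	int			hashentrysize = width + 64; /* same overhead as the patch */

	if (hashentrysize * rows > work_mem * 1024L)
		printf("~%.1f MB needed vs %ld kB budget: hashing is ruled out\n",
			   hashentrysize * rows / (1024.0 * 1024.0), work_mem);
	else
		printf("fits in work_mem: cost_agg() would be compared with the sort\n");
	return 0;
}

With those numbers the table would need roughly 9.9 MB, so all_hash is cleared and only the sort-based estimate survives into the umethod decision below.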
 
+       if (all_btree && all_hash)
+       {
+               if (agg_path.total_cost < sort_path.total_cost)
+                       pathnode->umethod = UNIQUE_PATH_HASH;
+               else
+                       pathnode->umethod = UNIQUE_PATH_SORT;
+       }
+       else if (all_btree)
+               pathnode->umethod = UNIQUE_PATH_SORT;
+       else if (all_hash)
+               pathnode->umethod = UNIQUE_PATH_HASH;
+       else
+               goto no_unique_path;
+
        if (pathnode->umethod == UNIQUE_PATH_HASH)
        {
                pathnode->path.startup_cost = agg_path.startup_cost;
@@ -891,7 +974,18 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath)
 
        rel->cheapest_unique_path = (Path *) pathnode;
 
+       MemoryContextSwitchTo(oldcontext);
+
        return pathnode;
+
+no_unique_path:                                        /* failure exit */
+
+       /* Mark the SpecialJoinInfo as not unique-able */
+       sjinfo->join_quals = NIL;
+
+       MemoryContextSwitchTo(oldcontext);
+
+       return NULL;
 }
 
 /*
@@ -1068,31 +1162,6 @@ distinct_col_search(int colno, List *colnos, List *opids)
        return InvalidOid;
 }
 
-/*
- * hash_safe_operators - can all the specified IN operators be hashed?
- *
- * We assume hashed aggregation will work if each IN operator is marked
- * hashjoinable.  If the IN operators are cross-type, this could conceivably
- * fail: the aggregation will need a hashable equality operator for the RHS
- * datatype --- but it's pretty hard to conceive of a hash opfamily that has
- * cross-type hashing without support for hashing the individual types, so
- * we don't expend cycles here to support the case.  We could check
- * get_compatible_hash_operator() instead of just op_hashjoinable(), but the
- * former is a significantly more expensive test.
- */
-static bool
-hash_safe_operators(List *opids)
-{
-       ListCell   *lc;
-
-       foreach(lc, opids)
-       {
-               if (!op_hashjoinable(lfirst_oid(lc)))
-                       return false;
-       }
-       return true;
-}
-
 /*
  * create_subqueryscan_path
  *       Creates a path corresponding to a sequential scan of a subquery,
@@ -1157,6 +1226,7 @@ create_valuesscan_path(PlannerInfo *root, RelOptInfo *rel)
  *
  * 'joinrel' is the join relation.
  * 'jointype' is the type of join required
+ * 'sjinfo' is extra info about the join for selectivity estimation
  * 'outer_path' is the outer path
  * 'inner_path' is the inner path
  * 'restrict_clauses' are the RestrictInfo nodes to apply at the join
@@ -1168,6 +1238,7 @@ NestPath *
 create_nestloop_path(PlannerInfo *root,
                                         RelOptInfo *joinrel,
                                         JoinType jointype,
+                                        SpecialJoinInfo *sjinfo,
                                         Path *outer_path,
                                         Path *inner_path,
                                         List *restrict_clauses,
@@ -1183,7 +1254,7 @@ create_nestloop_path(PlannerInfo *root,
        pathnode->joinrestrictinfo = restrict_clauses;
        pathnode->path.pathkeys = pathkeys;
 
-       cost_nestloop(pathnode, root);
+       cost_nestloop(pathnode, root, sjinfo);
 
        return pathnode;
 }
@@ -1195,6 +1266,7 @@ create_nestloop_path(PlannerInfo *root,
  *
  * 'joinrel' is the join relation
  * 'jointype' is the type of join required
+ * 'sjinfo' is extra info about the join for selectivity estimation
  * 'outer_path' is the outer path
  * 'inner_path' is the inner path
  * 'restrict_clauses' are the RestrictInfo nodes to apply at the join
@@ -1208,6 +1280,7 @@ MergePath *
 create_mergejoin_path(PlannerInfo *root,
                                          RelOptInfo *joinrel,
                                          JoinType jointype,
+                                         SpecialJoinInfo *sjinfo,
                                          Path *outer_path,
                                          Path *inner_path,
                                          List *restrict_clauses,
@@ -1256,7 +1329,7 @@ create_mergejoin_path(PlannerInfo *root,
        pathnode->outersortkeys = outersortkeys;
        pathnode->innersortkeys = innersortkeys;
 
-       cost_mergejoin(pathnode, root);
+       cost_mergejoin(pathnode, root, sjinfo);
 
        return pathnode;
 }
@@ -1267,6 +1340,7 @@ create_mergejoin_path(PlannerInfo *root,
  *
  * 'joinrel' is the join relation
  * 'jointype' is the type of join required
+ * 'sjinfo' is extra info about the join for selectivity estimation
  * 'outer_path' is the cheapest outer path
  * 'inner_path' is the cheapest inner path
  * 'restrict_clauses' are the RestrictInfo nodes to apply at the join
@@ -1277,6 +1351,7 @@ HashPath *
 create_hashjoin_path(PlannerInfo *root,
                                         RelOptInfo *joinrel,
                                         JoinType jointype,
+                                        SpecialJoinInfo *sjinfo,
                                         Path *outer_path,
                                         Path *inner_path,
                                         List *restrict_clauses,
@@ -1294,7 +1369,7 @@ create_hashjoin_path(PlannerInfo *root,
        pathnode->jpath.path.pathkeys = NIL;
        pathnode->path_hashclauses = hashclauses;
 
-       cost_hashjoin(pathnode, root);
+       cost_hashjoin(pathnode, root, sjinfo);
 
        return pathnode;
 }