Improve relation width estimation for subqueries.

author Tom Lane <tgl@sss.pgh.pa.us>

Fri, 19 Nov 2010 22:31:50 +0000 (17:31 -0500)

committer Tom Lane <tgl@sss.pgh.pa.us>

Fri, 19 Nov 2010 22:31:50 +0000 (17:31 -0500)
author Tom Lane <tgl@sss.pgh.pa.us>
Fri, 19 Nov 2010 22:31:50 +0000 (17:31 -0500)
committer Tom Lane <tgl@sss.pgh.pa.us>
Fri, 19 Nov 2010 22:31:50 +0000 (17:31 -0500)
diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c

index aa9a90cbfa2e67ef572b683f9b8938725ff271b8..ce893a77be79134b2de6039a610194496be22cfd 100644 (file)
--- a/src/backend/optimizer/path/allpaths.c
+++ b/src/backend/optimizer/path/allpaths.c
@@ -758,11 +758,8 @@ set_subquery_pathlist(PlannerInfo *root, RelOptInfo *rel,
         rel->subrtable = subroot->parse->rtable;
         rel->subrowmark = subroot->rowMarks;
  
-       /* Copy number of output rows from subplan */
-       rel->tuples = rel->subplan->plan_rows;
-
         /* Mark rel with estimated output rows, width, etc */
-       set_baserel_size_estimates(root, rel);
+       set_subquery_size_estimates(root, rel, subroot);
  
         /* Convert subquery pathkeys to outer representation */
         pathkeys = convert_subquery_pathkeys(root, rel, subroot->query_pathkeys);
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c

index 16a5d0a3ca277a8ad4d735eb43940bc6f46560f7..0724f9a6c9cefdfac57661556b71d24c1b2d94ac 100644 (file)
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -76,6 +76,7 @@
  #include "optimizer/cost.h"
  #include "optimizer/pathnode.h"
  #include "optimizer/placeholder.h"
+#include "optimizer/plancat.h"
  #include "optimizer/planmain.h"
  #include "optimizer/restrictinfo.h"
  #include "parser/parsetree.h"
@@ -2986,7 +2987,7 @@ approx_tuple_count(PlannerInfo *root, JoinPath *path, List *quals)
   *             Set the size estimates for the given base relation.
   *
   * The rel's targetlist and restrictinfo list must have been constructed
- * already.
+ * already, and rel->tuples must be set.
   *
   * We set the following fields of the rel node:
   *     rows: the estimated number of output tuples (after applying
@@ -3151,6 +3152,76 @@ set_joinrel_size_estimates(PlannerInfo *root, RelOptInfo *rel,
         rel->rows = clamp_row_est(nrows);
  }
  
+/*
+ * set_subquery_size_estimates
+ *             Set the size estimates for a base relation that is a subquery.
+ *
+ * The rel's targetlist and restrictinfo list must have been constructed
+ * already, and the plan for the subquery (rel->subplan) must be complete.
+ * We take rel->tuples from the subplan and per-column widths from subroot
+ * (the subquery's PlannerInfo), then call set_baserel_size_estimates, so
+ * the same fields get set as in that function.
+ */
+void
+set_subquery_size_estimates(PlannerInfo *root, RelOptInfo *rel,
+                                                       PlannerInfo *subroot)
+{
+       RangeTblEntry *rte;
+       ListCell   *lc;
+
+       /* Should only be applied to base relations that are subqueries */
+       Assert(rel->relid > 0);
+       rte = planner_rt_fetch(rel->relid, root);
+       Assert(rte->rtekind == RTE_SUBQUERY);
+
+       /* Copy raw number of output rows from the subquery's completed plan */
+       rel->tuples = rel->subplan->plan_rows;
+
+       /*
+        * Compute per-output-column width estimates by examining the subquery's
+        * targetlist.  For any output that is a plain Var, reuse the width that
+        * was cached for that column while planning the subquery.  Otherwise,
+        * fall back on a datatype-based estimate (get_typavgwidth).
+        */
+       foreach(lc, subroot->parse->targetList)
+       {
+               TargetEntry *te = (TargetEntry *) lfirst(lc);
+               Node       *texpr = (Node *) te->expr;
+               int32           item_width;
+
+               Assert(IsA(te, TargetEntry));
+               /* junk columns aren't visible to upper query */
+               if (te->resjunk)
+                       continue;
+
+               /*
+                * XXX This doesn't work for subqueries containing set operations:
+                * the Vars in their tlists are bogus references to the first leaf
+                * subquery, which wouldn't give the right answer even if we could
+                * still reach its PlannerInfo, so use the datatype there.
+                * NOTE(review): also assumes var->varlevelsup == 0 -- confirm.
+                */
+               if (IsA(texpr, Var) &&
+                       subroot->parse->setOperations == NULL)
+               {
+                       Var        *var = (Var *) texpr;
+                       RelOptInfo *subrel = find_base_rel(subroot, var->varno);
+
+                       item_width = subrel->attr_widths[var->varattno - subrel->min_attr];
+               }
+               else
+               {
+                       item_width = get_typavgwidth(exprType(texpr), exprTypmod(texpr));
+               }
+               Assert(item_width > 0);
+               Assert(te->resno >= rel->min_attr && te->resno <= rel->max_attr);
+               rel->attr_widths[te->resno - rel->min_attr] = item_width;
+       }
+
+       /* Now estimate number of output rows, etc */
+       set_baserel_size_estimates(root, rel);
+}
+
+
  /*
   * set_function_size_estimates
   *             Set the size estimates for a base relation that is a function call.
@@ -3251,11 +3322,17 @@ set_cte_size_estimates(PlannerInfo *root, RelOptInfo *rel, Plan *cteplan)
   * set_rel_width
   *             Set the estimated output width of a base relation.
   *
+ * The estimated output width is the sum of the per-attribute width estimates
+ * for the actually-referenced columns, plus any PHVs or other expressions
+ * that have to be calculated at this relation.  This is the amount of data
+ * we'd need to pass upwards in case of a sort, hash, etc.
+ *
   * NB: this works best on plain relations because it prefers to look at
- * real Vars.  It will fail to make use of pg_statistic info when applied
- * to a subquery relation, even if the subquery outputs are simple vars
- * that we could have gotten info for. Is it worth trying to be smarter
- * about subqueries?
+ * real Vars.  For subqueries, set_subquery_size_estimates will already have
+ * copied up whatever per-column estimates were made within the subquery,
+ * and for other types of rels there isn't much we can do anyway.  We fall
+ * back on (fairly stupid) datatype-based width estimates if we can't get
+ * any better number.
   *
   * The per-attribute width estimates are cached for possible re-use while
   * building join relations.
@@ -3265,6 +3342,7 @@ set_rel_width(PlannerInfo *root, RelOptInfo *rel)
  {
         Oid                     reloid = planner_rt_fetch(rel->relid, root)->relid;
         int32           tuple_width = 0;
+       bool            have_wholerow_var = false;
         ListCell   *lc;
  
         foreach(lc, rel->reltargetlist)
@@ -3284,8 +3362,18 @@ set_rel_width(PlannerInfo *root, RelOptInfo *rel)
                         ndx = var->varattno - rel->min_attr;
  
                         /*
-                        * The width probably hasn't been cached yet, but may as well
-                        * check
+                        * If it's a whole-row Var, we'll deal with it below after we
+                        * have already cached as many attr widths as possible.
+                        */
+                       if (var->varattno == 0)
+                       {
+                               have_wholerow_var = true;
+                               continue;
+                       }
+
+                       /*
+                        * The width may have been cached already (especially if it's
+                        * a subquery), so don't duplicate effort.
                          */
                         if (rel->attr_widths[ndx] > 0)
                         {
@@ -3294,7 +3382,7 @@ set_rel_width(PlannerInfo *root, RelOptInfo *rel)
                         }
  
                         /* Try to get column width from statistics */
-                       if (reloid != InvalidOid)
+                       if (reloid != InvalidOid && var->varattno > 0)
                         {
                                 item_width = get_attavgwidth(reloid, var->varattno);
                                 if (item_width > 0)
@@ -3335,6 +3423,39 @@ set_rel_width(PlannerInfo *root, RelOptInfo *rel)
                         tuple_width += item_width;
                 }
         }
+
+       /*
+        * If we have a whole-row reference, estimate its width as the sum of
+        * per-column widths plus sizeof(HeapTupleHeaderData).
+        */
+       if (have_wholerow_var)
+       {
+               int32   wholerow_width = sizeof(HeapTupleHeaderData);
+
+               if (reloid != InvalidOid)
+               {
+                       /* Real relation, so estimate true tuple width */
+                       wholerow_width += get_relation_data_width(reloid,
+                                                                                                         rel->attr_widths - rel->min_attr);
+               }
+               else
+               {
+                       /* Do what we can with info for a phony rel */
+                       AttrNumber      i;
+
+                       for (i = 1; i <= rel->max_attr; i++)
+                               wholerow_width += rel->attr_widths[i - rel->min_attr];
+               }
+
+               rel->attr_widths[0 - rel->min_attr] = wholerow_width;
+
+               /*
+                * Include the whole-row Var as part of the output tuple.  Yes,
+                * that really is what happens at runtime.
+                */
+               tuple_width += wholerow_width;
+       }
+
         Assert(tuple_width >= 0);
         rel->width = tuple_width;
  }
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c

index 6324bce2403068a70d5495f61fb40fc68de6f9b5..a1e59005921eb9fe7b076c24d8c8c25317d3958e 100644 (file)
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -3102,7 +3102,7 @@ plan_cluster_use_sort(Oid tableOid, Oid indexOid)
          * set_baserel_size_estimates, just do a quick hack for rows and width.
          */
         rel->rows = rel->tuples;
-       rel->width = get_relation_data_width(tableOid);
+       rel->width = get_relation_data_width(tableOid, NULL);
  
         root->total_table_pages = rel->pages;
  
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c

index 7ffa11588d7d2fc5b9b6131ff4f3d131b7fc15be..aafaf843fcc290a971589324c261a6fe08eb4c98 100644 (file)
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -322,7 +322,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
   * estimate_rel_size - estimate # pages and # tuples in a table or index
   *
   * If attr_widths isn't NULL, it points to the zero-index entry of the
- * relation's attr_width[] cache; we fill this in if we have need to compute
+ * relation's attr_widths[] cache; we fill this in if we have need to compute
   * the attribute widths for estimation purposes.
   */
  void
@@ -435,8 +435,9 @@ estimate_rel_size(Relation rel, int32 *attr_widths,
   * get_rel_data_width
   *
   * Estimate the average width of (the data part of) the relation's tuples.
- * If attr_widths isn't NULL, also store per-column width estimates into
- * that array.
+ *
+ * If attr_widths isn't NULL, it points to the zero-index entry of the
+ * relation's attr_widths[] cache; use and update that cache as appropriate.
   *
   * Currently we ignore dropped columns.  Ideally those should be included
   * in the result, but we haven't got any way to get info about them; and
@@ -456,6 +457,14 @@ get_rel_data_width(Relation rel, int32 *attr_widths)
  
                 if (att->attisdropped)
                         continue;
+
+               /* use previously cached data, if any */
+               if (attr_widths != NULL && attr_widths[i] > 0)
+               {
+                       tuple_width += attr_widths[i];
+                       continue;
+               }
+
                 /* This should match set_rel_width() in costsize.c */
                 item_width = get_attavgwidth(RelationGetRelid(rel), i);
                 if (item_width <= 0)
@@ -474,10 +483,11 @@ get_rel_data_width(Relation rel, int32 *attr_widths)
  /*
   * get_relation_data_width
   *
- * External API for get_rel_data_width
+ * External API for get_rel_data_width: same behavior except we have to
+ * open the relcache entry.
   */
  int32
-get_relation_data_width(Oid relid)
+get_relation_data_width(Oid relid, int32 *attr_widths)
  {
         int32           result;
         Relation        relation;
@@ -485,7 +495,7 @@ get_relation_data_width(Oid relid)
         /* As above, assume relation is already locked */
         relation = heap_open(relid, NoLock);
  
-       result = get_rel_data_width(relation, NULL);
+       result = get_rel_data_width(relation, attr_widths);
  
         heap_close(relation, NoLock);
  
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h

index e1dcd6df140ca5ad22a3d9fefae871e6811efa8b..8df1b95abe7fc7e9e378591a0af4826d9a78a439 100644 (file)
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -121,6 +121,8 @@ extern void set_joinrel_size_estimates(PlannerInfo *root, RelOptInfo *rel,
                                                    RelOptInfo *inner_rel,
                                                    SpecialJoinInfo *sjinfo,
                                                    List *restrictlist);
+extern void set_subquery_size_estimates(PlannerInfo *root, RelOptInfo *rel,
+                                                       PlannerInfo *subroot);
  extern void set_function_size_estimates(PlannerInfo *root, RelOptInfo *rel);
  extern void set_values_size_estimates(PlannerInfo *root, RelOptInfo *rel);
  extern void set_cte_size_estimates(PlannerInfo *root, RelOptInfo *rel,
diff --git a/src/include/optimizer/plancat.h b/src/include/optimizer/plancat.h

index de7de84cb3b7fd4fc883cfe8e13cb2662ca79cd6..ca7b2c64698c6b0eaef2c548f93fa6d56e92fc82 100644 (file)
--- a/src/include/optimizer/plancat.h
+++ b/src/include/optimizer/plancat.h
@@ -31,7 +31,7 @@ extern void get_relation_info(PlannerInfo *root, Oid relationObjectId,
  extern void estimate_rel_size(Relation rel, int32 *attr_widths,
                                   BlockNumber *pages, double *tuples);
  
-extern int32 get_relation_data_width(Oid relid);
+extern int32 get_relation_data_width(Oid relid, int32 *attr_widths);
  
  extern bool relation_excluded_by_constraints(PlannerInfo *root,
                                                                  RelOptInfo *rel, RangeTblEntry *rte);
author	Tom Lane <tgl@sss.pgh.pa.us>
	Fri, 19 Nov 2010 22:31:50 +0000 (17:31 -0500)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Fri, 19 Nov 2010 22:31:50 +0000 (17:31 -0500)
src/backend/optimizer/path/allpaths.c		patch \| blob \| history
src/backend/optimizer/path/costsize.c		patch \| blob \| history
src/backend/optimizer/plan/planner.c		patch \| blob \| history
src/backend/optimizer/util/plancat.c		patch \| blob \| history
src/include/optimizer/cost.h		patch \| blob \| history
src/include/optimizer/plancat.h		patch \| blob \| history