granicus.if.org Git - postgresql/commitdiff
Improve relation width estimation for subqueries.
author Tom Lane <tgl@sss.pgh.pa.us>
Fri, 19 Nov 2010 22:31:50 +0000 (17:31 -0500)
committer Tom Lane <tgl@sss.pgh.pa.us>
Fri, 19 Nov 2010 22:31:50 +0000 (17:31 -0500)
As per the ancient comment for set_rel_width, it really wasn't much good
for relations that aren't plain tables: it would never find any stats and
would always fall back on datatype-based estimates, which are often pretty
silly.  Fix that by copying up width estimates from the subquery planning
process.

At some point we might want to do this for CTEs too, but that would be a
significantly more invasive patch because the sub-PlannerInfo is no longer
accessible by the time it's needed.  I refrained from doing anything about
that, partly for fear of breaking the unmerged CTE-related patches.

In passing, also generate less bogus width estimates for whole-row Vars.

Per a gripe from Jon Nelson.

src/backend/optimizer/path/allpaths.c
src/backend/optimizer/path/costsize.c
src/backend/optimizer/plan/planner.c
src/backend/optimizer/util/plancat.c
src/include/optimizer/cost.h
src/include/optimizer/plancat.h

index aa9a90cbfa2e67ef572b683f9b8938725ff271b8..ce893a77be79134b2de6039a610194496be22cfd 100644 (file)
@@ -758,11 +758,8 @@ set_subquery_pathlist(PlannerInfo *root, RelOptInfo *rel,
        rel->subrtable = subroot->parse->rtable;
        rel->subrowmark = subroot->rowMarks;
 
-       /* Copy number of output rows from subplan */
-       rel->tuples = rel->subplan->plan_rows;
-
        /* Mark rel with estimated output rows, width, etc */
-       set_baserel_size_estimates(root, rel);
+       set_subquery_size_estimates(root, rel, subroot);
 
        /* Convert subquery pathkeys to outer representation */
        pathkeys = convert_subquery_pathkeys(root, rel, subroot->query_pathkeys);
index 16a5d0a3ca277a8ad4d735eb43940bc6f46560f7..0724f9a6c9cefdfac57661556b71d24c1b2d94ac 100644 (file)
@@ -76,6 +76,7 @@
 #include "optimizer/cost.h"
 #include "optimizer/pathnode.h"
 #include "optimizer/placeholder.h"
+#include "optimizer/plancat.h"
 #include "optimizer/planmain.h"
 #include "optimizer/restrictinfo.h"
 #include "parser/parsetree.h"
@@ -2986,7 +2987,7 @@ approx_tuple_count(PlannerInfo *root, JoinPath *path, List *quals)
  *             Set the size estimates for the given base relation.
  *
  * The rel's targetlist and restrictinfo list must have been constructed
- * already.
+ * already, and rel->tuples must be set.
  *
  * We set the following fields of the rel node:
  *     rows: the estimated number of output tuples (after applying
@@ -3151,6 +3152,76 @@ set_joinrel_size_estimates(PlannerInfo *root, RelOptInfo *rel,
        rel->rows = clamp_row_est(nrows);
 }
 
+/*
+ * set_subquery_size_estimates
+ *             Set the size estimates for a base relation that is a subquery.
+ *
+ * The rel's targetlist and restrictinfo list must have been constructed
+ * already, and the plan for the subquery must have been completed.
+ * We read rel->subplan and subroot (the subquery's PlannerInfo) for data.
+ *
+ * Sets rel->tuples and attr_widths[], then runs set_baserel_size_estimates.
+ */
+void
+set_subquery_size_estimates(PlannerInfo *root, RelOptInfo *rel,
+                                                       PlannerInfo *subroot)
+{
+       RangeTblEntry *rte;
+       ListCell   *lc;
+
+       /* Should only be applied to base relations that are subqueries */
+       Assert(rel->relid > 0);
+       rte = planner_rt_fetch(rel->relid, root);
+       Assert(rte->rtekind == RTE_SUBQUERY);
+
+       /* Copy raw number of output rows from subplan */
+       rel->tuples = rel->subplan->plan_rows;
+
+       /*
+        * Fill rel->attr_widths[] from the subquery's targetlist.  A plain-Var
+        * output reuses the width cached in the sub-rel it references; anything
+        * else gets a datatype-based estimate.  attr_widths[] is indexed by
+        * attno - min_attr, on both the outer rel and the sub-rel.
+        */
+       foreach(lc, subroot->parse->targetList)
+       {
+               TargetEntry *te = (TargetEntry *) lfirst(lc);
+               Node       *texpr = (Node *) te->expr;
+               int32           item_width;
+
+               Assert(IsA(te, TargetEntry));
+               /* junk columns aren't visible to upper query */
+               if (te->resjunk)
+                       continue;
+
+               /*
+                * XXX This currently doesn't work for subqueries containing set
+                * operations, because the Vars in their tlists are bogus references
+                * to the first leaf subquery, which wouldn't give the right answer
+                * even if we could still get to its PlannerInfo.  So fall back on
+                * datatype in that case.
+                */
+               if (IsA(texpr, Var) &&
+                       subroot->parse->setOperations == NULL)
+               {
+                       Var        *var = (Var *) texpr;
+                       RelOptInfo *subrel = find_base_rel(subroot, var->varno);
+
+                       item_width = subrel->attr_widths[var->varattno - subrel->min_attr];
+               }
+               else
+               {
+                       item_width = get_typavgwidth(exprType(texpr), exprTypmod(texpr));
+               }
+               Assert(item_width > 0);
+               Assert(te->resno >= rel->min_attr && te->resno <= rel->max_attr);
+               rel->attr_widths[te->resno - rel->min_attr] = item_width;
+       }
+
+       /* rel->tuples is now set, as set_baserel_size_estimates requires */
+       set_baserel_size_estimates(root, rel);
+}
+
+
 /*
  * set_function_size_estimates
  *             Set the size estimates for a base relation that is a function call.
@@ -3251,11 +3322,17 @@ set_cte_size_estimates(PlannerInfo *root, RelOptInfo *rel, Plan *cteplan)
  * set_rel_width
  *             Set the estimated output width of a base relation.
  *
+ * The estimated output width is the sum of the per-attribute width estimates
+ * for the actually-referenced columns, plus any PHVs or other expressions
+ * that have to be calculated at this relation.  This is the amount of data
+ * we'd need to pass upwards in case of a sort, hash, etc.
+ *
  * NB: this works best on plain relations because it prefers to look at
- * real Vars.  It will fail to make use of pg_statistic info when applied
- * to a subquery relation, even if the subquery outputs are simple vars
- * that we could have gotten info for. Is it worth trying to be smarter
- * about subqueries?
+ * real Vars.  For subqueries, set_subquery_size_estimates will already have
+ * copied up whatever per-column estimates were made within the subquery,
+ * and for other types of rels there isn't much we can do anyway.  We fall
+ * back on (fairly stupid) datatype-based width estimates if we can't get
+ * any better number.
  *
  * The per-attribute width estimates are cached for possible re-use while
  * building join relations.
@@ -3265,6 +3342,7 @@ set_rel_width(PlannerInfo *root, RelOptInfo *rel)
 {
        Oid                     reloid = planner_rt_fetch(rel->relid, root)->relid;
        int32           tuple_width = 0;
+       bool            have_wholerow_var = false;
        ListCell   *lc;
 
        foreach(lc, rel->reltargetlist)
@@ -3284,8 +3362,18 @@ set_rel_width(PlannerInfo *root, RelOptInfo *rel)
                        ndx = var->varattno - rel->min_attr;
 
                        /*
-                        * The width probably hasn't been cached yet, but may as well
-                        * check
+                        * If it's a whole-row Var, we'll deal with it below after we
+                        * have already cached as many attr widths as possible.
+                        */
+                       if (var->varattno == 0)
+                       {
+                               have_wholerow_var = true;
+                               continue;
+                       }
+
+                       /*
+                        * The width may have been cached already (especially if it's
+                        * a subquery), so don't duplicate effort.
                         */
                        if (rel->attr_widths[ndx] > 0)
                        {
@@ -3294,7 +3382,7 @@ set_rel_width(PlannerInfo *root, RelOptInfo *rel)
                        }
 
                        /* Try to get column width from statistics */
-                       if (reloid != InvalidOid)
+                       if (reloid != InvalidOid && var->varattno > 0)
                        {
                                item_width = get_attavgwidth(reloid, var->varattno);
                                if (item_width > 0)
@@ -3335,6 +3423,39 @@ set_rel_width(PlannerInfo *root, RelOptInfo *rel)
                        tuple_width += item_width;
                }
        }
+
+       /*
+        * If we have a whole-row reference, estimate its width as the sum of
+        * per-column widths plus sizeof(HeapTupleHeaderData).
+        */
+       if (have_wholerow_var)
+       {
+               int32   wholerow_width = sizeof(HeapTupleHeaderData);
+
+               if (reloid != InvalidOid)
+               {
+                       /* Real relation, so estimate true tuple width */
+                       wholerow_width += get_relation_data_width(reloid,
+                                                                                                         rel->attr_widths - rel->min_attr);
+               }
+               else
+               {
+                       /* Do what we can with info for a phony rel */
+                       AttrNumber      i;
+
+                       for (i = 1; i <= rel->max_attr; i++)
+                               wholerow_width += rel->attr_widths[i - rel->min_attr];
+               }
+
+               rel->attr_widths[0 - rel->min_attr] = wholerow_width;
+
+               /*
+                * Include the whole-row Var as part of the output tuple.  Yes,
+                * that really is what happens at runtime.
+                */
+               tuple_width += wholerow_width;
+       }
+
        Assert(tuple_width >= 0);
        rel->width = tuple_width;
 }
index 6324bce2403068a70d5495f61fb40fc68de6f9b5..a1e59005921eb9fe7b076c24d8c8c25317d3958e 100644 (file)
@@ -3102,7 +3102,7 @@ plan_cluster_use_sort(Oid tableOid, Oid indexOid)
         * set_baserel_size_estimates, just do a quick hack for rows and width.
         */
        rel->rows = rel->tuples;
-       rel->width = get_relation_data_width(tableOid);
+       rel->width = get_relation_data_width(tableOid, NULL);
 
        root->total_table_pages = rel->pages;
 
index 7ffa11588d7d2fc5b9b6131ff4f3d131b7fc15be..aafaf843fcc290a971589324c261a6fe08eb4c98 100644 (file)
@@ -322,7 +322,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
  * estimate_rel_size - estimate # pages and # tuples in a table or index
  *
  * If attr_widths isn't NULL, it points to the zero-index entry of the
- * relation's attr_width[] cache; we fill this in if we have need to compute
+ * relation's attr_widths[] cache; we fill this in if we have need to compute
  * the attribute widths for estimation purposes.
  */
 void
@@ -435,8 +435,9 @@ estimate_rel_size(Relation rel, int32 *attr_widths,
  * get_rel_data_width
  *
  * Estimate the average width of (the data part of) the relation's tuples.
- * If attr_widths isn't NULL, also store per-column width estimates into
- * that array.
+ *
+ * If attr_widths isn't NULL, it points to the zero-index entry of the
+ * relation's attr_widths[] cache; use and update that cache as appropriate.
  *
  * Currently we ignore dropped columns.  Ideally those should be included
  * in the result, but we haven't got any way to get info about them; and
@@ -456,6 +457,14 @@ get_rel_data_width(Relation rel, int32 *attr_widths)
 
                if (att->attisdropped)
                        continue;
+
+               /* use previously cached data, if any */
+               if (attr_widths != NULL && attr_widths[i] > 0)
+               {
+                       tuple_width += attr_widths[i];
+                       continue;
+               }
+
                /* This should match set_rel_width() in costsize.c */
                item_width = get_attavgwidth(RelationGetRelid(rel), i);
                if (item_width <= 0)
@@ -474,10 +483,11 @@ get_rel_data_width(Relation rel, int32 *attr_widths)
 /*
  * get_relation_data_width
  *
- * External API for get_rel_data_width
+ * External API for get_rel_data_width: same behavior except we have to
+ * open the relcache entry.
  */
 int32
-get_relation_data_width(Oid relid)
+get_relation_data_width(Oid relid, int32 *attr_widths)
 {
        int32           result;
        Relation        relation;
@@ -485,7 +495,7 @@ get_relation_data_width(Oid relid)
        /* As above, assume relation is already locked */
        relation = heap_open(relid, NoLock);
 
-       result = get_rel_data_width(relation, NULL);
+       result = get_rel_data_width(relation, attr_widths);
 
        heap_close(relation, NoLock);
 
index e1dcd6df140ca5ad22a3d9fefae871e6811efa8b..8df1b95abe7fc7e9e378591a0af4826d9a78a439 100644 (file)
@@ -121,6 +121,8 @@ extern void set_joinrel_size_estimates(PlannerInfo *root, RelOptInfo *rel,
                                                   RelOptInfo *inner_rel,
                                                   SpecialJoinInfo *sjinfo,
                                                   List *restrictlist);
+extern void set_subquery_size_estimates(PlannerInfo *root, RelOptInfo *rel,
+                                                       PlannerInfo *subroot);
 extern void set_function_size_estimates(PlannerInfo *root, RelOptInfo *rel);
 extern void set_values_size_estimates(PlannerInfo *root, RelOptInfo *rel);
 extern void set_cte_size_estimates(PlannerInfo *root, RelOptInfo *rel,
index de7de84cb3b7fd4fc883cfe8e13cb2662ca79cd6..ca7b2c64698c6b0eaef2c548f93fa6d56e92fc82 100644 (file)
@@ -31,7 +31,7 @@ extern void get_relation_info(PlannerInfo *root, Oid relationObjectId,
 extern void estimate_rel_size(Relation rel, int32 *attr_widths,
                                  BlockNumber *pages, double *tuples);
 
-extern int32 get_relation_data_width(Oid relid);
+extern int32 get_relation_data_width(Oid relid, int32 *attr_widths);
 
 extern bool relation_excluded_by_constraints(PlannerInfo *root,
                                                                 RelOptInfo *rel, RangeTblEntry *rte);