Fix mergejoin cost estimation so that we consider the statistical ranges of the two join variables at both ends (start and end of each input scan).

author Tom Lane <tgl@sss.pgh.pa.us>
Sat, 8 Dec 2007 21:05:11 +0000 (21:05 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Sat, 8 Dec 2007 21:05:11 +0000 (21:05 +0000)
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index fc95399b396ff410d8de9f8b4ee695f04a93a214..c1e1651c7989afce8ad7ac0242cf87bc8570cbdb 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -54,7 +54,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.189 2007/11/15 22:25:15 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.190 2007/12/08 21:05:11 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -1372,12 +1372,16 @@ cost_mergejoin(MergePath *path, PlannerInfo *root)
         double          outer_path_rows = PATH_ROWS(outer_path);
         double          inner_path_rows = PATH_ROWS(inner_path);
         double          outer_rows,
-                               inner_rows;
+                               inner_rows,
+                               outer_skip_rows,
+                               inner_skip_rows;
         double          mergejointuples,
                                 rescannedtuples;
         double          rescanratio;
-       Selectivity outerscansel,
-                               innerscansel;
+       Selectivity outerstartsel,
+                               outerendsel,
+                               innerstartsel,
+                               innerendsel;
         Selectivity joininfactor;
         Path            sort_path;              /* dummy for result of cost_sort */
  
@@ -1444,10 +1448,12 @@ cost_mergejoin(MergePath *path, PlannerInfo *root)
          * A merge join will stop as soon as it exhausts either input stream
          * (unless it's an outer join, in which case the outer side has to be
          * scanned all the way anyway).  Estimate fraction of the left and right
-        * inputs that will actually need to be scanned. We use only the first
-        * (most significant) merge clause for this purpose.  Since
-        * mergejoinscansel() is a fairly expensive computation, we cache the
-        * results in the merge clause RestrictInfo.
+        * inputs that will actually need to be scanned.  Likewise, we can
+        * estimate the number of rows that will be skipped before the first
+        * join pair is found, which should be factored into startup cost.
+        * We use only the first (most significant) merge clause for this purpose.
+        * Since mergejoinscansel() is a fairly expensive computation, we cache
+        * the results in the merge clause RestrictInfo.
          */
         if (mergeclauses && path->jpath.jointype != JOIN_FULL)
         {
@@ -1478,37 +1484,61 @@ cost_mergejoin(MergePath *path, PlannerInfo *root)
                                                   outer_path->parent->relids))
                 {
                         /* left side of clause is outer */
-                       outerscansel = cache->leftscansel;
-                       innerscansel = cache->rightscansel;
+                       outerstartsel = cache->leftstartsel;
+                       outerendsel = cache->leftendsel;
+                       innerstartsel = cache->rightstartsel;
+                       innerendsel = cache->rightendsel;
                 }
                 else
                 {
                         /* left side of clause is inner */
-                       outerscansel = cache->rightscansel;
-                       innerscansel = cache->leftscansel;
+                       outerstartsel = cache->rightstartsel;
+                       outerendsel = cache->rightendsel;
+                       innerstartsel = cache->leftstartsel;
+                       innerendsel = cache->leftendsel;
                 }
                 if (path->jpath.jointype == JOIN_LEFT)
-                       outerscansel = 1.0;
+               {
+                       outerstartsel = 0.0;
+                       outerendsel = 1.0;
+               }
                 else if (path->jpath.jointype == JOIN_RIGHT)
-                       innerscansel = 1.0;
+               {
+                       innerstartsel = 0.0;
+                       innerendsel = 1.0;
+               }
         }
         else
         {
                 /* cope with clauseless or full mergejoin */
-               outerscansel = innerscansel = 1.0;
+               outerstartsel = innerstartsel = 0.0;
+               outerendsel = innerendsel = 1.0;
         }
  
-       /* convert selectivity to row count; must scan at least one row */
-       outer_rows = clamp_row_est(outer_path_rows * outerscansel);
-       inner_rows = clamp_row_est(inner_path_rows * innerscansel);
+       /*
+        * Convert selectivities to row counts.  We force outer_rows and
+        * inner_rows to be at least 1, but the skip_rows estimates can be zero.
+        */
+       outer_skip_rows = rint(outer_path_rows * outerstartsel);
+       inner_skip_rows = rint(inner_path_rows * innerstartsel);
+       outer_rows = clamp_row_est(outer_path_rows * outerendsel);
+       inner_rows = clamp_row_est(inner_path_rows * innerendsel);
+
+       Assert(outer_skip_rows <= outer_rows);
+       Assert(inner_skip_rows <= inner_rows);
  
         /*
          * Readjust scan selectivities to account for above rounding.  This is
          * normally an insignificant effect, but when there are only a few rows in
          * the inputs, failing to do this makes for a large percentage error.
          */
-       outerscansel = outer_rows / outer_path_rows;
-       innerscansel = inner_rows / inner_path_rows;
+       outerstartsel = outer_skip_rows / outer_path_rows;
+       innerstartsel = inner_skip_rows / inner_path_rows;
+       outerendsel = outer_rows / outer_path_rows;
+       innerendsel = inner_rows / inner_path_rows;
+
+       Assert(outerstartsel <= outerendsel);
+       Assert(innerstartsel <= innerendsel);
  
         /* cost of source data */
  
@@ -1522,14 +1552,18 @@ cost_mergejoin(MergePath *path, PlannerInfo *root)
                                   outer_path->parent->width,
                                   -1.0);
                 startup_cost += sort_path.startup_cost;
+               startup_cost += (sort_path.total_cost - sort_path.startup_cost)
+                       * outerstartsel;
                 run_cost += (sort_path.total_cost - sort_path.startup_cost)
-                       * outerscansel;
+                       * (outerendsel - outerstartsel);
         }
         else
         {
                 startup_cost += outer_path->startup_cost;
+               startup_cost += (outer_path->total_cost - outer_path->startup_cost)
+                       * outerstartsel;
                 run_cost += (outer_path->total_cost - outer_path->startup_cost)
-                       * outerscansel;
+                       * (outerendsel - outerstartsel);
         }
  
         if (innersortkeys)                      /* do we need to sort inner? */
@@ -1542,14 +1576,18 @@ cost_mergejoin(MergePath *path, PlannerInfo *root)
                                   inner_path->parent->width,
                                   -1.0);
                 startup_cost += sort_path.startup_cost;
+               startup_cost += (sort_path.total_cost - sort_path.startup_cost)
+                       * innerstartsel * rescanratio;
                 run_cost += (sort_path.total_cost - sort_path.startup_cost)
-                       * innerscansel * rescanratio;
+                       * (innerendsel - innerstartsel) * rescanratio;
         }
         else
         {
                 startup_cost += inner_path->startup_cost;
+               startup_cost += (inner_path->total_cost - inner_path->startup_cost)
+                       * innerstartsel * rescanratio;
                 run_cost += (inner_path->total_cost - inner_path->startup_cost)
-                       * innerscansel * rescanratio;
+                       * (innerendsel - innerstartsel) * rescanratio;
         }
  
         /* CPU costs */
@@ -1571,8 +1609,11 @@ cost_mergejoin(MergePath *path, PlannerInfo *root)
          * joininfactor.
          */
         startup_cost += merge_qual_cost.startup;
+       startup_cost += merge_qual_cost.per_tuple *
+               (outer_skip_rows + inner_skip_rows * rescanratio);
         run_cost += merge_qual_cost.per_tuple *
-               (outer_rows + inner_rows * rescanratio);
+               ((outer_rows - outer_skip_rows) +
+                (inner_rows - inner_skip_rows) * rescanratio);
  
         /*
          * For each tuple that gets through the mergejoin proper, we charge
@@ -1597,8 +1638,10 @@ cached_scansel(PlannerInfo *root, RestrictInfo *rinfo, PathKey *pathkey)
  {
         MergeScanSelCache *cache;
         ListCell   *lc;
-       Selectivity leftscansel,
-                               rightscansel;
+       Selectivity leftstartsel,
+                               leftendsel,
+                               rightstartsel,
+                               rightendsel;
         MemoryContext oldcontext;
  
         /* Do we have this result already? */
@@ -1617,8 +1660,10 @@ cached_scansel(PlannerInfo *root, RestrictInfo *rinfo, PathKey *pathkey)
                                          pathkey->pk_opfamily,
                                          pathkey->pk_strategy,
                                          pathkey->pk_nulls_first,
-                                        &leftscansel,
-                                        &rightscansel);
+                                        &leftstartsel,
+                                        &leftendsel,
+                                        &rightstartsel,
+                                        &rightendsel);
  
         /* Cache the result in suitably long-lived workspace */
         oldcontext = MemoryContextSwitchTo(root->planner_cxt);
@@ -1627,8 +1672,10 @@ cached_scansel(PlannerInfo *root, RestrictInfo *rinfo, PathKey *pathkey)
         cache->opfamily = pathkey->pk_opfamily;
         cache->strategy = pathkey->pk_strategy;
         cache->nulls_first = pathkey->pk_nulls_first;
-       cache->leftscansel = leftscansel;
-       cache->rightscansel = rightscansel;
+       cache->leftstartsel = leftstartsel;
+       cache->leftendsel = leftendsel;
+       cache->rightstartsel = rightstartsel;
+       cache->rightendsel = rightendsel;
  
         rinfo->scansel_cache = lappend(rinfo->scansel_cache, cache);
  
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 103f4dc9d7620b8cd3ca62d712e0829ccf4ffb52..0b0d992a3b19d2133c07a747bc5df46304ce17e4 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -15,7 +15,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.241 2007/11/15 22:25:16 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.242 2007/12/08 21:05:11 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -128,8 +128,8 @@ static double convert_one_bytea_to_scalar(unsigned char *value, int valuelen,
                                                         int rangelo, int rangehi);
  static char *convert_string_datum(Datum value, Oid typid);
  static double convert_timevalue_to_scalar(Datum value, Oid typid);
-static bool get_variable_maximum(PlannerInfo *root, VariableStatData *vardata,
-                                        Oid sortop, Datum *max);
+static bool get_variable_range(PlannerInfo *root, VariableStatData *vardata,
+                                        Oid sortop, Datum *min, Datum *max);
  static Selectivity prefix_selectivity(VariableStatData *vardata,
                                    Oid vartype, Oid opfamily, Const *prefixcon);
  static Selectivity pattern_selectivity(Const *patt, Pattern_Type ptype);
@@ -2172,18 +2172,24 @@ icnlikejoinsel(PG_FUNCTION_ARGS)
   * we can estimate how much of the input will actually be read.  This
   * can have a considerable impact on the cost when using indexscans.
   *
+ * Also, we can estimate how much of each input has to be read before the
+ * first join pair is found, which will affect the join's startup time.
+ *
   * clause should be a clause already known to be mergejoinable.  opfamily,
   * strategy, and nulls_first specify the sort ordering being used.
   *
- * *leftscan is set to the fraction of the left-hand variable expected
- * to be scanned (0 to 1), and similarly *rightscan for the right-hand
- * variable.
+ * The outputs are:
+ *             *leftstart is set to the fraction of the left-hand variable expected
+ *              to be scanned before the first join pair is found (0 to 1).
+ *             *leftend is set to the fraction of the left-hand variable expected
+ *              to be scanned before the join terminates (0 to 1).
+ *             *rightstart, *rightend similarly for the right-hand variable.
   */
  void
  mergejoinscansel(PlannerInfo *root, Node *clause,
                                  Oid opfamily, int strategy, bool nulls_first,
-                                Selectivity *leftscan,
-                                Selectivity *rightscan)
+                                Selectivity *leftstart, Selectivity *leftend,
+                                Selectivity *rightstart, Selectivity *rightend)
  {
         Node       *left,
                            *right;
@@ -2196,14 +2202,23 @@ mergejoinscansel(PlannerInfo *root, Node *clause,
         Oid                     opno,
                                 lsortop,
                                 rsortop,
+                               lstatop,
+                               rstatop,
+                               ltop,
                                 leop,
+                               revltop,
                                 revleop;
-       Datum           leftmax,
+       bool            isgt;
+       Datum           leftmin,
+                               leftmax,
+                               rightmin,
                                 rightmax;
         double          selec;
  
         /* Set default results if we can't figure anything out. */
-       *leftscan = *rightscan = 1.0;
+       /* XXX should default "start" fraction be a bit more than 0? */
+       *leftstart = *rightstart = 0.0;
+       *leftend = *rightend = 1.0;
  
         /* Deconstruct the merge clause */
         if (!is_opclause(clause))
@@ -2229,30 +2244,103 @@ mergejoinscansel(PlannerInfo *root, Node *clause,
  
         /*
          * Look up the various operators we need.  If we don't find them all, it
-        * probably means the opfamily is broken, but we cope anyway.
+        * probably means the opfamily is broken, but we just fail silently.
+        *
+        * Note: we expect that pg_statistic histograms will be sorted by the
+        * '<' operator, regardless of which sort direction we are considering.
          */
         switch (strategy)
         {
                 case BTLessStrategyNumber:
-                       lsortop = get_opfamily_member(opfamily, op_lefttype, op_lefttype,
-                                                                                 BTLessStrategyNumber);
-                       rsortop = get_opfamily_member(opfamily, op_righttype, op_righttype,
-                                                                                 BTLessStrategyNumber);
-                       leop = get_opfamily_member(opfamily, op_lefttype, op_righttype,
-                                                                          BTLessEqualStrategyNumber);
-                       revleop = get_opfamily_member(opfamily, op_righttype, op_lefttype,
-                                                                                 BTLessEqualStrategyNumber);
+                       isgt = false;
+                       if (op_lefttype == op_righttype)
+                       {
+                               /* easy case */
+                               ltop = get_opfamily_member(opfamily,
+                                                                                  op_lefttype, op_righttype,
+                                                                                  BTLessStrategyNumber);
+                               leop = get_opfamily_member(opfamily,
+                                                                                  op_lefttype, op_righttype,
+                                                                                  BTLessEqualStrategyNumber);
+                               lsortop = ltop;
+                               rsortop = ltop;
+                               lstatop = lsortop;
+                               rstatop = rsortop;
+                               revltop = ltop;
+                               revleop = leop;
+                       }
+                       else
+                       {
+                               ltop = get_opfamily_member(opfamily,
+                                                                                  op_lefttype, op_righttype,
+                                                                                  BTLessStrategyNumber);
+                               leop = get_opfamily_member(opfamily,
+                                                                                  op_lefttype, op_righttype,
+                                                                                  BTLessEqualStrategyNumber);
+                               lsortop = get_opfamily_member(opfamily,
+                                                                                         op_lefttype, op_lefttype,
+                                                                                         BTLessStrategyNumber);
+                               rsortop = get_opfamily_member(opfamily,
+                                                                                         op_righttype, op_righttype,
+                                                                                         BTLessStrategyNumber);
+                               lstatop = lsortop;
+                               rstatop = rsortop;
+                               revltop = get_opfamily_member(opfamily,
+                                                                                         op_righttype, op_lefttype,
+                                                                                         BTLessStrategyNumber);
+                               revleop = get_opfamily_member(opfamily,
+                                                                                         op_righttype, op_lefttype,
+                                                                                         BTLessEqualStrategyNumber);
+                       }
                         break;
                 case BTGreaterStrategyNumber:
                         /* descending-order case */
-                       lsortop = get_opfamily_member(opfamily, op_lefttype, op_lefttype,
-                                                                                 BTGreaterStrategyNumber);
-                       rsortop = get_opfamily_member(opfamily, op_righttype, op_righttype,
-                                                                                 BTGreaterStrategyNumber);
-                       leop = get_opfamily_member(opfamily, op_lefttype, op_righttype,
-                                                                          BTGreaterEqualStrategyNumber);
-                       revleop = get_opfamily_member(opfamily, op_righttype, op_lefttype,
-                                                                                 BTGreaterEqualStrategyNumber);
+                       isgt = true;
+                       if (op_lefttype == op_righttype)
+                       {
+                               /* easy case */
+                               ltop = get_opfamily_member(opfamily,
+                                                                                  op_lefttype, op_righttype,
+                                                                                  BTGreaterStrategyNumber);
+                               leop = get_opfamily_member(opfamily,
+                                                                                  op_lefttype, op_righttype,
+                                                                                  BTGreaterEqualStrategyNumber);
+                               lsortop = ltop;
+                               rsortop = ltop;
+                               lstatop = get_opfamily_member(opfamily,
+                                                                                         op_lefttype, op_lefttype,
+                                                                                         BTLessStrategyNumber);
+                               rstatop = lstatop;
+                               revltop = ltop;
+                               revleop = leop;
+                       }
+                       else
+                       {
+                               ltop = get_opfamily_member(opfamily,
+                                                                                  op_lefttype, op_righttype,
+                                                                                  BTGreaterStrategyNumber);
+                               leop = get_opfamily_member(opfamily,
+                                                                                  op_lefttype, op_righttype,
+                                                                                  BTGreaterEqualStrategyNumber);
+                               lsortop = get_opfamily_member(opfamily,
+                                                                                         op_lefttype, op_lefttype,
+                                                                                         BTGreaterStrategyNumber);
+                               rsortop = get_opfamily_member(opfamily,
+                                                                                         op_righttype, op_righttype,
+                                                                                         BTGreaterStrategyNumber);
+                               lstatop = get_opfamily_member(opfamily,
+                                                                                         op_lefttype, op_lefttype,
+                                                                                         BTLessStrategyNumber);
+                               rstatop = get_opfamily_member(opfamily,
+                                                                                         op_righttype, op_righttype,
+                                                                                         BTLessStrategyNumber);
+                               revltop = get_opfamily_member(opfamily,
+                                                                                         op_righttype, op_lefttype,
+                                                                                         BTGreaterStrategyNumber);
+                               revleop = get_opfamily_member(opfamily,
+                                                                                         op_righttype, op_lefttype,
+                                                                                         BTGreaterEqualStrategyNumber);
+                       }
                         break;
                 default:
                         goto fail;                      /* shouldn't get here */
@@ -2260,66 +2348,133 @@ mergejoinscansel(PlannerInfo *root, Node *clause,
  
         if (!OidIsValid(lsortop) ||
                 !OidIsValid(rsortop) ||
+               !OidIsValid(lstatop) ||
+               !OidIsValid(rstatop) ||
+               !OidIsValid(ltop) ||
                 !OidIsValid(leop) ||
+               !OidIsValid(revltop) ||
                 !OidIsValid(revleop))
                 goto fail;                              /* insufficient info in catalogs */
  
-       /* Try to get maximum values of both inputs */
-       if (!get_variable_maximum(root, &leftvar, lsortop, &leftmax))
-               goto fail;                              /* no max available from stats */
-
-       if (!get_variable_maximum(root, &rightvar, rsortop, &rightmax))
-               goto fail;                              /* no max available from stats */
+       /* Try to get ranges of both inputs */
+       if (!isgt)
+       {
+               if (!get_variable_range(root, &leftvar, lstatop,
+                                                               &leftmin, &leftmax))
+                       goto fail;                      /* no range available from stats */
+               if (!get_variable_range(root, &rightvar, rstatop,
+                                                               &rightmin, &rightmax))
+                       goto fail;                      /* no range available from stats */
+       }
+       else
+       {
+               /* need to swap the max and min */
+               if (!get_variable_range(root, &leftvar, lstatop,
+                                                               &leftmax, &leftmin))
+                       goto fail;                      /* no range available from stats */
+               if (!get_variable_range(root, &rightvar, rstatop,
+                                                               &rightmax, &rightmin))
+                       goto fail;                      /* no range available from stats */
+       }
  
         /*
          * Now, the fraction of the left variable that will be scanned is the
          * fraction that's <= the right-side maximum value.  But only believe
-        * non-default estimates, else stick with our 1.0.      Also, if the sort
-        * order is nulls-first, we're going to have to read over any nulls too.
+        * non-default estimates, else stick with our 1.0.
          */
-       selec = scalarineqsel(root, leop, false, &leftvar,
+       selec = scalarineqsel(root, leop, isgt, &leftvar,
                                                   rightmax, op_righttype);
         if (selec != DEFAULT_INEQ_SEL)
-       {
-               if (nulls_first && HeapTupleIsValid(leftvar.statsTuple))
-               {
-                       Form_pg_statistic stats;
-
-                       stats = (Form_pg_statistic) GETSTRUCT(leftvar.statsTuple);
-                       selec += stats->stanullfrac;
-                       CLAMP_PROBABILITY(selec);
-               }
-               *leftscan = selec;
-       }
+               *leftend = selec;
  
         /* And similarly for the right variable. */
-       selec = scalarineqsel(root, revleop, false, &rightvar,
+       selec = scalarineqsel(root, revleop, isgt, &rightvar,
                                                   leftmax, op_lefttype);
         if (selec != DEFAULT_INEQ_SEL)
+               *rightend = selec;
+
+       /*
+        * Only one of the two "end" fractions can really be less than 1.0;
+        * believe the smaller estimate and reset the other one to exactly 1.0.
+        * If we get exactly equal estimates (as can easily happen with
+        * self-joins), believe neither.
+        */
+       if (*leftend > *rightend)
+               *leftend = 1.0;
+       else if (*leftend < *rightend)
+               *rightend = 1.0;
+       else
+               *leftend = *rightend = 1.0;
+
+       /*
+        * Also, the fraction of the left variable that will be scanned before
+        * the first join pair is found is the fraction that's < the right-side
+        * minimum value.  But only believe non-default estimates, else stick with
+        * our own default.
+        */
+       selec = scalarineqsel(root, ltop, isgt, &leftvar,
+                                                 rightmin, op_righttype);
+       if (selec != DEFAULT_INEQ_SEL)
+               *leftstart = selec;
+
+       /* And similarly for the right variable. */
+       selec = scalarineqsel(root, revltop, isgt, &rightvar,
+                                                 leftmin, op_lefttype);
+       if (selec != DEFAULT_INEQ_SEL)
+               *rightstart = selec;
+
+       /*
+        * Only one of the two "start" fractions can really be more than zero;
+        * believe the larger estimate and reset the other one to exactly 0.0.
+        * If we get exactly equal estimates (as can easily happen with
+        * self-joins), believe neither.
+        */
+       if (*leftstart < *rightstart)
+               *leftstart = 0.0;
+       else if (*leftstart > *rightstart)
+               *rightstart = 0.0;
+       else
+               *leftstart = *rightstart = 0.0;
+
+       /*
+        * If the sort order is nulls-first, we're going to have to skip over any
+        * nulls too.  These would not have been counted by scalarineqsel, and
+        * we can safely add in this fraction regardless of whether we believe
+        * scalarineqsel's results or not.  But be sure to clamp the sum to 1.0!
+        */
+       if (nulls_first)
         {
-               if (nulls_first && HeapTupleIsValid(rightvar.statsTuple))
-               {
-                       Form_pg_statistic stats;
+               Form_pg_statistic stats;
  
+               if (HeapTupleIsValid(leftvar.statsTuple))
+               {
+                       stats = (Form_pg_statistic) GETSTRUCT(leftvar.statsTuple);
+                       *leftstart += stats->stanullfrac;
+                       CLAMP_PROBABILITY(*leftstart);
+                       *leftend += stats->stanullfrac;
+                       CLAMP_PROBABILITY(*leftend);
+               }
+               if (HeapTupleIsValid(rightvar.statsTuple))
+               {
                         stats = (Form_pg_statistic) GETSTRUCT(rightvar.statsTuple);
-                       selec += stats->stanullfrac;
-                       CLAMP_PROBABILITY(selec);
+                       *rightstart += stats->stanullfrac;
+                       CLAMP_PROBABILITY(*rightstart);
+                       *rightend += stats->stanullfrac;
+                       CLAMP_PROBABILITY(*rightend);
                 }
-               *rightscan = selec;
         }
  
-       /*
-        * Only one of the two fractions can really be less than 1.0; believe the
-        * smaller estimate and reset the other one to exactly 1.0.  If we get
-        * exactly equal estimates (as can easily happen with self-joins), believe
-        * neither.
-        */
-       if (*leftscan > *rightscan)
-               *leftscan = 1.0;
-       else if (*leftscan < *rightscan)
-               *rightscan = 1.0;
-       else
-               *leftscan = *rightscan = 1.0;
+       /* Disbelieve start >= end, just in case that can happen */
+       if (*leftstart >= *leftend)
+       {
+               *leftstart = 0.0;
+               *leftend = 1.0;
+       }
+       if (*rightstart >= *rightend)
+       {
+               *rightstart = 0.0;
+               *rightend = 1.0;
+       }
  
  fail:
         ReleaseVariableStats(leftvar);
@@ -3778,20 +3933,21 @@ get_variable_numdistinct(VariableStatData *vardata)
  }
  
  /*
- * get_variable_maximum
- *             Estimate the maximum value of the specified variable.
- *             If successful, store value in *max and return TRUE.
+ * get_variable_range
+ *             Estimate the minimum and maximum value of the specified variable.
+ *             If successful, store values in *min and *max, and return TRUE.
   *             If no data available, return FALSE.
   *
- * sortop is the "<" comparison operator to use.  (To extract the
- * minimum instead of the maximum, just pass the ">" operator instead.)
+ * sortop is the "<" comparison operator to use.  This should generally
+ * be "<" not ">", as only the former is likely to be found in pg_statistic.
   */
  static bool
-get_variable_maximum(PlannerInfo *root, VariableStatData *vardata,
-                                        Oid sortop, Datum *max)
+get_variable_range(PlannerInfo *root, VariableStatData *vardata, Oid sortop,
+                                  Datum *min, Datum *max)
  {
+       Datum           tmin = 0;
         Datum           tmax = 0;
-       bool            have_max = false;
+       bool            have_data = false;
         Form_pg_statistic stats;
         int16           typLen;
         bool            typByVal;
@@ -3809,7 +3965,7 @@ get_variable_maximum(PlannerInfo *root, VariableStatData *vardata,
         get_typlenbyval(vardata->atttype, &typLen, &typByVal);
  
         /*
-        * If there is a histogram, grab the last or first value as appropriate.
+        * If there is a histogram, grab the first and last values.
          *
          * If there is a histogram that is sorted with some other operator than
          * the one we want, fail --- this suggests that there is data we can't
@@ -3823,42 +3979,24 @@ get_variable_maximum(PlannerInfo *root, VariableStatData *vardata,
         {
                 if (nvalues > 0)
                 {
+                       tmin = datumCopy(values[0], typByVal, typLen);
                         tmax = datumCopy(values[nvalues - 1], typByVal, typLen);
-                       have_max = true;
+                       have_data = true;
                 }
                 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
         }
-       else
+       else if (get_attstatsslot(vardata->statsTuple,
+                                                         vardata->atttype, vardata->atttypmod,
+                                                         STATISTIC_KIND_HISTOGRAM, InvalidOid,
+                                                         &values, &nvalues,
+                                                         NULL, NULL))
         {
-               Oid                     rsortop = get_commutator(sortop);
-
-               if (OidIsValid(rsortop) &&
-                       get_attstatsslot(vardata->statsTuple,
-                                                        vardata->atttype, vardata->atttypmod,
-                                                        STATISTIC_KIND_HISTOGRAM, rsortop,
-                                                        &values, &nvalues,
-                                                        NULL, NULL))
-               {
-                       if (nvalues > 0)
-                       {
-                               tmax = datumCopy(values[0], typByVal, typLen);
-                               have_max = true;
-                       }
-                       free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
-               }
-               else if (get_attstatsslot(vardata->statsTuple,
-                                                                 vardata->atttype, vardata->atttypmod,
-                                                                 STATISTIC_KIND_HISTOGRAM, InvalidOid,
-                                                                 &values, &nvalues,
-                                                                 NULL, NULL))
-               {
-                       free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
-                       return false;
-               }
+               free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
+               return false;
         }
  
         /*
-        * If we have most-common-values info, look for a large MCV.  This is
+        * If we have most-common-values info, look for extreme MCVs.  This is
          * needed even if we also have a histogram, since the histogram excludes
          * the MCVs.  However, usually the MCVs will not be the extreme values, so
          * avoid unnecessary data copying.
@@ -3869,31 +4007,41 @@ get_variable_maximum(PlannerInfo *root, VariableStatData *vardata,
                                                  &values, &nvalues,
                                                  NULL, NULL))
         {
-               bool            large_mcv = false;
+               bool            tmin_is_mcv = false;
+               bool            tmax_is_mcv = false;
                 FmgrInfo        opproc;
  
                 fmgr_info(get_opcode(sortop), &opproc);
  
                 for (i = 0; i < nvalues; i++)
                 {
-                       if (!have_max)
+                       if (!have_data)
                         {
-                               tmax = values[i];
-                               large_mcv = have_max = true;
+                               tmin = tmax = values[i];
+                               tmin_is_mcv = tmax_is_mcv = have_data = true;
+                               continue;
+                       }
+                       if (DatumGetBool(FunctionCall2(&opproc, values[i], tmin)))
+                       {
+                               tmin = values[i];
+                               tmin_is_mcv = true;
                         }
-                       else if (DatumGetBool(FunctionCall2(&opproc, tmax, values[i])))
+                       if (DatumGetBool(FunctionCall2(&opproc, tmax, values[i])))
                         {
                                 tmax = values[i];
-                               large_mcv = true;
+                               tmax_is_mcv = true;
                         }
                 }
-               if (large_mcv)
+               if (tmin_is_mcv)
+                       tmin = datumCopy(tmin, typByVal, typLen);
+               if (tmax_is_mcv)
                         tmax = datumCopy(tmax, typByVal, typLen);
                 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
         }
  
+       *min = tmin;
         *max = tmax;
-       return have_max;
+       return have_data;
  }
  
  
diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h

index 2dce223d3a7a2b162fa64c488d6bfdd79d5bc9e4..e82b00cdd46649a7cde81663b9abdb305d00d32e 100644 (file)
--- a/src/include/nodes/relation.h
+++ b/src/include/nodes/relation.h
@@ -7,7 +7,7 @@
   * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.150 2007/11/15 22:25:17 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.151 2007/12/08 21:05:11 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -993,8 +993,10 @@ typedef struct MergeScanSelCache
         int                     strategy;               /* sort direction (ASC or DESC) */
         bool            nulls_first;    /* do NULLs come before normal values? */
         /* Results */
-       Selectivity leftscansel;        /* scan fraction for clause left side */
-       Selectivity rightscansel;       /* scan fraction for clause right side */
+       Selectivity leftstartsel;       /* first-join fraction for clause left side */
+       Selectivity leftendsel;         /* last-join fraction for clause left side */
+       Selectivity rightstartsel;      /* first-join fraction for clause right side */
+       Selectivity rightendsel;        /* last-join fraction for clause right side */
  } MergeScanSelCache;
  
  /*
diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h

index f92bb16d07981fab39ffd5477af6284da122657f..ededfdf3c6293455db92ad5c4d23d89b3f81393f 100644 (file)
--- a/src/include/utils/selfuncs.h
+++ b/src/include/utils/selfuncs.h
@@ -8,7 +8,7 @@
   * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.41 2007/11/07 22:37:24 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.42 2007/12/08 21:05:11 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -161,8 +161,8 @@ extern Selectivity rowcomparesel(PlannerInfo *root,
  
  extern void mergejoinscansel(PlannerInfo *root, Node *clause,
                                  Oid opfamily, int strategy, bool nulls_first,
-                                Selectivity *leftscan,
-                                Selectivity *rightscan);
+                                Selectivity *leftstart, Selectivity *leftend,
+                                Selectivity *rightstart, Selectivity *rightend);
  
  extern double estimate_num_groups(PlannerInfo *root, List *groupExprs,
                                         double input_rows);
author	Tom Lane <tgl@sss.pgh.pa.us>
	Sat, 8 Dec 2007 21:05:11 +0000 (21:05 +0000)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Sat, 8 Dec 2007 21:05:11 +0000 (21:05 +0000)
src/backend/optimizer/path/costsize.c		patch | blob | history
src/backend/utils/adt/selfuncs.c		patch | blob | history
src/include/nodes/relation.h		patch | blob | history
src/include/utils/selfuncs.h		patch | blob | history