Further repair of eqjoinsel ndistinct-clamping logic.

author Tom Lane <tgl@sss.pgh.pa.us>

Thu, 1 Sep 2011 04:18:34 +0000 (00:18 -0400)

committer Tom Lane <tgl@sss.pgh.pa.us>

Thu, 1 Sep 2011 04:19:54 +0000 (00:19 -0400)
author Tom Lane <tgl@sss.pgh.pa.us>
Thu, 1 Sep 2011 04:18:34 +0000 (00:18 -0400)
committer Tom Lane <tgl@sss.pgh.pa.us>
Thu, 1 Sep 2011 04:19:54 +0000 (00:19 -0400)
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c

index dc2a11256f289755e4e731705f14ac134cd137e8..8e304097db8fc6cbfd622ea3161f682eee83c718 100644 (file)
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -141,11 +141,10 @@ static double ineq_histogram_selectivity(PlannerInfo *root,
                                                    FmgrInfo *opproc, bool isgt,
                                                    Datum constval, Oid consttype);
  static double eqjoinsel_inner(Oid operator,
-                               VariableStatData *vardata1, VariableStatData *vardata2,
-                               RelOptInfo *rel1, RelOptInfo *rel2);
+                               VariableStatData *vardata1, VariableStatData *vardata2);
  static double eqjoinsel_semi(Oid operator,
                            VariableStatData *vardata1, VariableStatData *vardata2,
-                          RelOptInfo *rel1, RelOptInfo *rel2);
+                          RelOptInfo *inner_rel);
  static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
                                   Datum lobound, Datum hibound, Oid boundstypid,
                                   double *scaledlobound, double *scaledhibound);
@@ -2010,47 +2009,35 @@ eqjoinsel(PG_FUNCTION_ARGS)
         VariableStatData vardata1;
         VariableStatData vardata2;
         bool            join_is_reversed;
-       RelOptInfo *rel1;
-       RelOptInfo *rel2;
+       RelOptInfo *inner_rel;
  
         get_join_variables(root, args, sjinfo,
                                            &vardata1, &vardata2, &join_is_reversed);
  
-       /*
-        * Identify the join's direct input relations.  We use the min lefthand
-        * and min righthand as the inputs, even though the join might actually
-        * get done with larger input relations.  The min inputs are guaranteed to
-        * have been formed by now, though, and always using them ensures
-        * consistency of estimates.
-        */
-       if (!join_is_reversed)
-       {
-               rel1 = find_join_input_rel(root, sjinfo->min_lefthand);
-               rel2 = find_join_input_rel(root, sjinfo->min_righthand);
-       }
-       else
-       {
-               rel1 = find_join_input_rel(root, sjinfo->min_righthand);
-               rel2 = find_join_input_rel(root, sjinfo->min_lefthand);
-       }
-
         switch (sjinfo->jointype)
         {
                 case JOIN_INNER:
                 case JOIN_LEFT:
                 case JOIN_FULL:
-                       selec = eqjoinsel_inner(operator, &vardata1, &vardata2,
-                                                                       rel1, rel2);
+                       selec = eqjoinsel_inner(operator, &vardata1, &vardata2);
                         break;
                 case JOIN_SEMI:
                 case JOIN_ANTI:
+                       /*
+                        * Look up the join's inner relation.  min_righthand is sufficient
+                        * information because neither SEMI nor ANTI joins permit any
+                        * reassociation into or out of their RHS, so the righthand will
+                        * always be exactly that set of rels.
+                        */
+                       inner_rel = find_join_input_rel(root, sjinfo->min_righthand);
+
                         if (!join_is_reversed)
                                 selec = eqjoinsel_semi(operator, &vardata1, &vardata2,
-                                                                          rel1, rel2);
+                                                                          inner_rel);
                         else
                                 selec = eqjoinsel_semi(get_commutator(operator),
                                                                            &vardata2, &vardata1,
-                                                                          rel2, rel1);
+                                                                          inner_rel);
                         break;
                 default:
                         /* other values not expected here */
@@ -2076,8 +2063,7 @@ eqjoinsel(PG_FUNCTION_ARGS)
   */
  static double
  eqjoinsel_inner(Oid operator,
-                               VariableStatData *vardata1, VariableStatData *vardata2,
-                               RelOptInfo *rel1, RelOptInfo *rel2)
+                               VariableStatData *vardata1, VariableStatData *vardata2)
  {
         double          selec;
         double          nd1;
@@ -2272,26 +2258,10 @@ eqjoinsel_inner(Oid operator,
                  * XXX Can we be smarter if we have an MCV list for just one side? It
                  * seems that if we assume equal distribution for the other side, we
                  * end up with the same answer anyway.
-                *
-                * An additional hack we use here is to clamp the nd1 and nd2 values
-                * to not more than what we are estimating the input relation sizes to
-                * be, providing a crude correction for the selectivity of restriction
-                * clauses on those relations.  (We don't do that in the other path
-                * since there we are comparing the nd values to stats for the whole
-                * relations.)  We can apply this clamp both with respect to the base
-                * relations from which the join variables come, and to the immediate
-                * input relations of the current join.
                  */
                 double          nullfrac1 = stats1 ? stats1->stanullfrac : 0.0;
                 double          nullfrac2 = stats2 ? stats2->stanullfrac : 0.0;
  
-               if (vardata1->rel)
-                       nd1 = Min(nd1, vardata1->rel->rows);
-               nd1 = Min(nd1, rel1->rows);
-               if (vardata2->rel)
-                       nd2 = Min(nd2, vardata2->rel->rows);
-               nd2 = Min(nd2, rel2->rows);
-
                 selec = (1.0 - nullfrac1) * (1.0 - nullfrac2);
                 if (nd1 > nd2)
                         selec /= nd1;
@@ -2318,7 +2288,7 @@ eqjoinsel_inner(Oid operator,
  static double
  eqjoinsel_semi(Oid operator,
                            VariableStatData *vardata1, VariableStatData *vardata2,
-                          RelOptInfo *rel1, RelOptInfo *rel2)
+                          RelOptInfo *inner_rel)
  {
         double          selec;
         double          nd1;
@@ -2338,6 +2308,25 @@ eqjoinsel_semi(Oid operator,
         nd1 = get_variable_numdistinct(vardata1);
         nd2 = get_variable_numdistinct(vardata2);
  
+       /*
+        * We clamp nd2 to be not more than what we estimate the inner relation's
+        * size to be.  This is intuitively somewhat reasonable since obviously
+        * there can't be more than that many distinct values coming from the
+        * inner rel.  The reason for the asymmetry (ie, that we don't clamp nd1
+        * likewise) is that this is the only pathway by which restriction clauses
+        * applied to the inner rel will affect the join result size estimate,
+        * since set_joinrel_size_estimates will multiply SEMI/ANTI selectivity by
+        * only the outer rel's size.  If we clamped nd1 we'd be double-counting
+        * the selectivity of outer-rel restrictions.
+        *
+        * We can apply this clamping both with respect to the base relation from
+        * which the join variable comes (if there is just one), and to the
+        * immediate inner input relation of the current join.
+        */
+       if (vardata2->rel)
+               nd2 = Min(nd2, vardata2->rel->rows);
+       nd2 = Min(nd2, inner_rel->rows);
+
         if (HeapTupleIsValid(vardata1->statsTuple))
         {
                 stats1 = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple);
@@ -2381,11 +2370,21 @@ eqjoinsel_semi(Oid operator,
                                         uncertainfrac,
                                         uncertain;
                 int                     i,
-                                       nmatches;
+                                       nmatches,
+                                       clamped_nvalues2;
+
+               /*
+                * The clamping above could have resulted in nd2 being less than
+                * nvalues2; in which case, we assume that precisely the nd2 most
+                * common values in the relation will appear in the join input, and so
+                * compare to only the first nd2 members of the MCV list.  Of course
+                * this is frequently wrong, but it's the best bet we can make.
+                */
+               clamped_nvalues2 = Min(nvalues2, nd2);
  
                 fmgr_info(get_opcode(operator), &eqproc);
                 hasmatch1 = (bool *) palloc0(nvalues1 * sizeof(bool));
-               hasmatch2 = (bool *) palloc0(nvalues2 * sizeof(bool));
+               hasmatch2 = (bool *) palloc0(clamped_nvalues2 * sizeof(bool));
  
                 /*
                  * Note we assume that each MCV will match at most one member of the
@@ -2398,7 +2397,7 @@ eqjoinsel_semi(Oid operator,
                 {
                         int                     j;
  
-                       for (j = 0; j < nvalues2; j++)
+                       for (j = 0; j < clamped_nvalues2; j++)
                         {
                                 if (hasmatch2[j])
                                         continue;
@@ -2443,7 +2442,7 @@ eqjoinsel_semi(Oid operator,
                 {
                         nd1 -= nmatches;
                         nd2 -= nmatches;
-                       if (nd1 <= nd2 || nd2 <= 0)
+                       if (nd1 <= nd2 || nd2 < 0)
                                 uncertainfrac = 1.0;
                         else
                                 uncertainfrac = nd2 / nd1;
@@ -2464,14 +2463,7 @@ eqjoinsel_semi(Oid operator,
  
                 if (nd1 != DEFAULT_NUM_DISTINCT && nd2 != DEFAULT_NUM_DISTINCT)
                 {
-                       if (vardata1->rel)
-                               nd1 = Min(nd1, vardata1->rel->rows);
-                       nd1 = Min(nd1, rel1->rows);
-                       if (vardata2->rel)
-                               nd2 = Min(nd2, vardata2->rel->rows);
-                       nd2 = Min(nd2, rel2->rows);
-
-                       if (nd1 <= nd2 || nd2 <= 0)
+                       if (nd1 <= nd2 || nd2 < 0)
                                 selec = 1.0 - nullfrac1;
                         else
                                 selec = (nd2 / nd1) * (1.0 - nullfrac1);
author	Tom Lane <tgl@sss.pgh.pa.us>
	Thu, 1 Sep 2011 04:18:34 +0000 (00:18 -0400)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Thu, 1 Sep 2011 04:19:54 +0000 (00:19 -0400)