]> granicus.if.org Git - postgresql/commitdiff
Fix neqjoinsel's behavior for semi/anti join cases.
authorTom Lane <tgl@sss.pgh.pa.us>
Thu, 30 Nov 2017 03:00:29 +0000 (22:00 -0500)
committerTom Lane <tgl@sss.pgh.pa.us>
Thu, 30 Nov 2017 03:00:37 +0000 (22:00 -0500)
Previously, this function estimated the selectivity as 1 minus eqjoinsel()
for the negator equality operator, regardless of join type (I think there
was an expectation that eqjoinsel would handle the join type).  But
actually this is completely wrong for semijoin cases: the fraction of the
LHS that has a non-matching row is not one minus the fraction of the LHS
that has a matching row.  In reality a semijoin with <> will nearly always
succeed: it can only fail when the RHS is empty, or it contains a single
distinct value that is equal to the particular LHS value, or the LHS value
is null.  The only one of those things we should have much confidence in
estimating is the fraction of LHS values that are null, so let's just take
the selectivity as 1 minus outer nullfrac.

Per coding convention, antijoin should be estimated the same as semijoin.

Arguably this is a bug fix, but in view of the lack of field complaints
and the risk of destabilizing plans, no back-patch.

Thomas Munro, reviewed by Ashutosh Bapat

Discussion: https://postgr.es/m/CAEepm=270ze2hVxWkJw-5eKzc3AB4C9KpH3L2kih75R5pdSogg@mail.gmail.com

src/backend/utils/adt/selfuncs.c
src/test/regress/expected/join.out
src/test/regress/sql/join.sql

index edff6da41090f81b64d8e4338840d368a64b779d..ea95b8068d9f9478557bed2a860bd895486d3f7f 100644 (file)
@@ -2767,29 +2767,67 @@ neqjoinsel(PG_FUNCTION_ARGS)
        List       *args = (List *) PG_GETARG_POINTER(2);
        JoinType        jointype = (JoinType) PG_GETARG_INT16(3);
        SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4);
-       Oid                     eqop;
        float8          result;
 
-       /*
-        * We want 1 - eqjoinsel() where the equality operator is the one
-        * associated with this != operator, that is, its negator.
-        */
-       eqop = get_negator(operator);
-       if (eqop)
+       if (jointype == JOIN_SEMI || jointype == JOIN_ANTI)
        {
-               result = DatumGetFloat8(DirectFunctionCall5(eqjoinsel,
-                                                                                                       PointerGetDatum(root),
-                                                                                                       ObjectIdGetDatum(eqop),
-                                                                                                       PointerGetDatum(args),
-                                                                                                       Int16GetDatum(jointype),
-                                                                                                       PointerGetDatum(sjinfo)));
+               /*
+                * For semi-joins, if there is more than one distinct value in the RHS
+                * relation then every non-null LHS row must find a row to join since
+                * it can only be equal to one of them.  We'll assume that there is
+                * always more than one distinct RHS value for the sake of stability,
+                * though in theory we could have special cases for empty RHS
+                * (selectivity = 0) and single-distinct-value RHS (selectivity =
+                * fraction of LHS that has the same value as the single RHS value).
+                *
+                * For anti-joins, if we use the same assumption that there is more
+                * than one distinct key in the RHS relation, then every non-null LHS
+                * row must be suppressed by the anti-join.
+                *
+                * So either way, the selectivity estimate should be 1 - nullfrac.
+                */
+               VariableStatData leftvar;
+               VariableStatData rightvar;
+               bool            reversed;
+               HeapTuple       statsTuple;
+               double          nullfrac;
+
+               get_join_variables(root, args, sjinfo, &leftvar, &rightvar, &reversed);
+               statsTuple = reversed ? rightvar.statsTuple : leftvar.statsTuple;
+               if (HeapTupleIsValid(statsTuple))
+                       nullfrac = ((Form_pg_statistic) GETSTRUCT(statsTuple))->stanullfrac;
+               else
+                       nullfrac = 0.0;
+               ReleaseVariableStats(leftvar);
+               ReleaseVariableStats(rightvar);
+
+               result = 1.0 - nullfrac;
        }
        else
        {
-               /* Use default selectivity (should we raise an error instead?) */
-               result = DEFAULT_EQ_SEL;
+               /*
+                * We want 1 - eqjoinsel() where the equality operator is the one
+                * associated with this != operator, that is, its negator.
+                */
+               Oid                     eqop = get_negator(operator);
+
+               if (eqop)
+               {
+                       result = DatumGetFloat8(DirectFunctionCall5(eqjoinsel,
+                                                                                                               PointerGetDatum(root),
+                                                                                                               ObjectIdGetDatum(eqop),
+                                                                                                               PointerGetDatum(args),
+                                                                                                               Int16GetDatum(jointype),
+                                                                                                               PointerGetDatum(sjinfo)));
+               }
+               else
+               {
+                       /* Use default selectivity (should we raise an error instead?) */
+                       result = DEFAULT_EQ_SEL;
+               }
+               result = 1.0 - result;
        }
-       result = 1.0 - result;
+
        PG_RETURN_FLOAT8(result);
 }
 
index f02ef3f9787c2420ed31fea166e5c7c128d2ec36..b7d1790097869fbc2bee7f54cad7ed04c1d632be 100644 (file)
@@ -1845,6 +1845,28 @@ SELECT '' AS "xxx", *
      | 1 | 4 | one | -1
 (1 row)
 
+--
+-- semijoin selectivity for <>
+--
+explain (costs off)
+select * from int4_tbl i4, tenk1 a
+where exists(select * from tenk1 b
+             where a.twothousand = b.twothousand and a.fivethous <> b.fivethous)
+      and i4.f1 = a.tenthous;
+                  QUERY PLAN                  
+----------------------------------------------
+ Hash Semi Join
+   Hash Cond: (a.twothousand = b.twothousand)
+   Join Filter: (a.fivethous <> b.fivethous)
+   ->  Hash Join
+         Hash Cond: (a.tenthous = i4.f1)
+         ->  Seq Scan on tenk1 a
+         ->  Hash
+               ->  Seq Scan on int4_tbl i4
+   ->  Hash
+         ->  Seq Scan on tenk1 b
+(10 rows)
+
 --
 -- More complicated constructs
 --
index cbb71c5b1bea7e19c0318db158192dccf5435ded..c6d4a513e86e7497b1715c666455ddceb69a94e4 100644 (file)
@@ -193,6 +193,15 @@ SELECT '' AS "xxx", *
 SELECT '' AS "xxx", *
   FROM J1_TBL LEFT JOIN J2_TBL USING (i) WHERE (i = 1);
 
+--
+-- semijoin selectivity for <>
+--
+explain (costs off)
+select * from int4_tbl i4, tenk1 a
+where exists(select * from tenk1 b
+             where a.twothousand = b.twothousand and a.fivethous <> b.fivethous)
+      and i4.f1 = a.tenthous;
+
 
 --
 -- More complicated constructs