FmgrInfo *opproc, bool isgt,
Datum constval, Oid consttype);
static double eqjoinsel_inner(Oid operator,
- VariableStatData *vardata1, VariableStatData *vardata2,
- RelOptInfo *rel1, RelOptInfo *rel2);
+ VariableStatData *vardata1, VariableStatData *vardata2);
static double eqjoinsel_semi(Oid operator,
VariableStatData *vardata1, VariableStatData *vardata2,
- RelOptInfo *rel1, RelOptInfo *rel2);
+ RelOptInfo *inner_rel);
static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
Datum lobound, Datum hibound, Oid boundstypid,
double *scaledlobound, double *scaledhibound);
VariableStatData vardata1;
VariableStatData vardata2;
bool join_is_reversed;
- RelOptInfo *rel1;
- RelOptInfo *rel2;
+ RelOptInfo *inner_rel;
get_join_variables(root, args, sjinfo,
&vardata1, &vardata2, &join_is_reversed);
- /*
- * Identify the join's direct input relations. We use the min lefthand
- * and min righthand as the inputs, even though the join might actually
- * get done with larger input relations. The min inputs are guaranteed to
- * have been formed by now, though, and always using them ensures
- * consistency of estimates.
- */
- if (!join_is_reversed)
- {
- rel1 = find_join_input_rel(root, sjinfo->min_lefthand);
- rel2 = find_join_input_rel(root, sjinfo->min_righthand);
- }
- else
- {
- rel1 = find_join_input_rel(root, sjinfo->min_righthand);
- rel2 = find_join_input_rel(root, sjinfo->min_lefthand);
- }
-
switch (sjinfo->jointype)
{
case JOIN_INNER:
case JOIN_LEFT:
case JOIN_FULL:
- selec = eqjoinsel_inner(operator, &vardata1, &vardata2,
- rel1, rel2);
+ selec = eqjoinsel_inner(operator, &vardata1, &vardata2);
break;
case JOIN_SEMI:
case JOIN_ANTI:
+ /*
+ * Look up the join's inner relation. min_righthand is sufficient
+ * information because neither SEMI nor ANTI joins permit any
+ * reassociation into or out of their RHS, so the righthand will
+ * always be exactly that set of rels.
+ */
+ inner_rel = find_join_input_rel(root, sjinfo->min_righthand);
+
if (!join_is_reversed)
selec = eqjoinsel_semi(operator, &vardata1, &vardata2,
- rel1, rel2);
+ inner_rel);
else
selec = eqjoinsel_semi(get_commutator(operator),
&vardata2, &vardata1,
- rel2, rel1);
+ inner_rel);
break;
default:
/* other values not expected here */
*/
static double
eqjoinsel_inner(Oid operator,
- VariableStatData *vardata1, VariableStatData *vardata2,
- RelOptInfo *rel1, RelOptInfo *rel2)
+ VariableStatData *vardata1, VariableStatData *vardata2)
{
double selec;
double nd1;
* XXX Can we be smarter if we have an MCV list for just one side? It
* seems that if we assume equal distribution for the other side, we
* end up with the same answer anyway.
- *
- * An additional hack we use here is to clamp the nd1 and nd2 values
- * to not more than what we are estimating the input relation sizes to
- * be, providing a crude correction for the selectivity of restriction
- * clauses on those relations. (We don't do that in the other path
- * since there we are comparing the nd values to stats for the whole
- * relations.) We can apply this clamp both with respect to the base
- * relations from which the join variables come, and to the immediate
- * input relations of the current join.
*/
double nullfrac1 = stats1 ? stats1->stanullfrac : 0.0;
double nullfrac2 = stats2 ? stats2->stanullfrac : 0.0;
- if (vardata1->rel)
- nd1 = Min(nd1, vardata1->rel->rows);
- nd1 = Min(nd1, rel1->rows);
- if (vardata2->rel)
- nd2 = Min(nd2, vardata2->rel->rows);
- nd2 = Min(nd2, rel2->rows);
-
selec = (1.0 - nullfrac1) * (1.0 - nullfrac2);
if (nd1 > nd2)
selec /= nd1;
static double
eqjoinsel_semi(Oid operator,
VariableStatData *vardata1, VariableStatData *vardata2,
- RelOptInfo *rel1, RelOptInfo *rel2)
+ RelOptInfo *inner_rel)
{
double selec;
double nd1;
nd1 = get_variable_numdistinct(vardata1);
nd2 = get_variable_numdistinct(vardata2);
+ /*
+ * We clamp nd2 to be not more than what we estimate the inner relation's
+ * size to be. This is intuitively somewhat reasonable since obviously
+ * there can't be more than that many distinct values coming from the
+ * inner rel. The reason for the asymmetry (ie, that we don't clamp nd1
+ * likewise) is that this is the only pathway by which restriction clauses
+ * applied to the inner rel will affect the join result size estimate,
+ * since set_joinrel_size_estimates will multiply SEMI/ANTI selectivity by
+ * only the outer rel's size. If we clamped nd1 we'd be double-counting
+ * the selectivity of outer-rel restrictions.
+ *
+ * We can apply this clamping both with respect to the base relation from
+ * which the join variable comes (if there is just one), and to the
+ * immediate inner input relation of the current join.
+ */
+ if (vardata2->rel)
+ nd2 = Min(nd2, vardata2->rel->rows);
+ nd2 = Min(nd2, inner_rel->rows);
+
if (HeapTupleIsValid(vardata1->statsTuple))
{
stats1 = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple);
uncertainfrac,
uncertain;
int i,
- nmatches;
+ nmatches,
+ clamped_nvalues2;
+
+ /*
+ * The clamping above could have resulted in nd2 being less than
+ * nvalues2; in which case, we assume that precisely the nd2 most
+ * common values in the relation will appear in the join input, and so
+ * compare to only the first nd2 members of the MCV list. Of course
+ * this is frequently wrong, but it's the best bet we can make.
+ */
+ clamped_nvalues2 = Min(nvalues2, nd2);
fmgr_info(get_opcode(operator), &eqproc);
hasmatch1 = (bool *) palloc0(nvalues1 * sizeof(bool));
- hasmatch2 = (bool *) palloc0(nvalues2 * sizeof(bool));
+ hasmatch2 = (bool *) palloc0(clamped_nvalues2 * sizeof(bool));
/*
* Note we assume that each MCV will match at most one member of the
{
int j;
- for (j = 0; j < nvalues2; j++)
+ for (j = 0; j < clamped_nvalues2; j++)
{
if (hasmatch2[j])
continue;
{
nd1 -= nmatches;
nd2 -= nmatches;
- if (nd1 <= nd2 || nd2 <= 0)
+ if (nd1 <= nd2 || nd2 < 0)
uncertainfrac = 1.0;
else
uncertainfrac = nd2 / nd1;
if (nd1 != DEFAULT_NUM_DISTINCT && nd2 != DEFAULT_NUM_DISTINCT)
{
- if (vardata1->rel)
- nd1 = Min(nd1, vardata1->rel->rows);
- nd1 = Min(nd1, rel1->rows);
- if (vardata2->rel)
- nd2 = Min(nd2, vardata2->rel->rows);
- nd2 = Min(nd2, rel2->rows);
-
- if (nd1 <= nd2 || nd2 <= 0)
+ if (nd1 <= nd2 || nd2 < 0)
selec = 1.0 - nullfrac1;
else
selec = (nd2 / nd1) * (1.0 - nullfrac1);