author     Tom Lane <tgl@sss.pgh.pa.us>   Tue, 17 Feb 2004 00:52:53 +0000 (00:52 +0000)
committer  Tom Lane <tgl@sss.pgh.pa.us>   Tue, 17 Feb 2004 00:52:53 +0000 (00:52 +0000)

Make use of statistics on index expressions.  There are still some
corner cases that could stand improvement, but it does all the basic
stuff.  A byproduct is that the selectivity routines are no longer
constrained to working on simple Vars; we might in future be able to
improve the behavior for subexpressions that don't match indexes.

src/backend/optimizer/path/costsize.c
src/backend/optimizer/util/relnode.c
src/backend/utils/adt/selfuncs.c
src/include/optimizer/pathnode.h
src/include/utils/selfuncs.h
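
The commit message's point about no longer being restricted to simple Vars shows up in selfuncs.c below as a new get_restriction_variable()/VariableStatData calling convention. A minimal sketch of that convention as it would look inside selfuncs.c, modeled on the eqsel and scalarltsel hunks further down (the function name and the placeholder math are invented for illustration; the usual selfuncs.c includes are assumed):

Datum
example_restriction_sel(PG_FUNCTION_ARGS)
{
	Query	   *root = (Query *) PG_GETARG_POINTER(0);
	List	   *args = (List *) PG_GETARG_POINTER(2);
	int			varRelid = PG_GETARG_INT32(3);
	VariableStatData vardata;
	Node	   *other;
	bool		varonleft;
	double		selec;

	/* Works for any expression, not just a simple Var of a real relation */
	if (!get_restriction_variable(root, args, varRelid,
								  &vardata, &other, &varonleft))
		PG_RETURN_FLOAT8(0.005);		/* punt to a default estimate */

	if (HeapTupleIsValid(vardata.statsTuple))
	{
		/* pg_statistic entry was found for the variable or expression */
		Form_pg_statistic stats;

		stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
		selec = 1.0 - stats->stanullfrac;	/* placeholder estimate */
	}
	else
		selec = 1.0 / get_variable_numdistinct(&vardata);

	ReleaseVariableStats(vardata);		/* releases statsTuple, if any */

	PG_RETURN_FLOAT8((float8) selec);
}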

index c23cf4d23246d5f3dd26bcd8ce02920c702c3eac..79674ac4b94881a74c2efe343459d07c6f40183d 100644 (file)
@@ -49,7 +49,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.124 2004/02/03 17:34:03 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.125 2004/02/17 00:52:53 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -102,8 +102,6 @@ bool                enable_mergejoin = true;
 bool           enable_hashjoin = true;
 
 
-static Selectivity estimate_hash_bucketsize(Query *root, Var *var,
-                                                int nbuckets);
 static bool cost_qual_eval_walker(Node *node, QualCost *total);
 static Selectivity approx_selectivity(Query *root, List *quals,
                                   JoinType jointype);
@@ -1152,7 +1150,7 @@ cost_hashjoin(HashPath *path, Query *root)
                                        /* not cached yet */
                                        thisbucketsize =
                                                estimate_hash_bucketsize(root,
-                                                          (Var *) get_rightop(restrictinfo->clause),
+                                                                                                get_rightop(restrictinfo->clause),
                                                                                                 virtualbuckets);
                                        restrictinfo->right_bucketsize = thisbucketsize;
                                }
@@ -1168,7 +1166,7 @@ cost_hashjoin(HashPath *path, Query *root)
                                        /* not cached yet */
                                        thisbucketsize =
                                                estimate_hash_bucketsize(root,
-                                                               (Var *) get_leftop(restrictinfo->clause),
+                                                                                                get_leftop(restrictinfo->clause),
                                                                                                 virtualbuckets);
                                        restrictinfo->left_bucketsize = thisbucketsize;
                                }
@@ -1249,179 +1247,6 @@ cost_hashjoin(HashPath *path, Query *root)
        path->jpath.path.total_cost = startup_cost + run_cost;
 }
 
-/*
- * Estimate hash bucketsize fraction (ie, number of entries in a bucket
- * divided by total tuples in relation) if the specified Var is used
- * as a hash key.
- *
- * XXX This is really pretty bogus since we're effectively assuming that the
- * distribution of hash keys will be the same after applying restriction
- * clauses as it was in the underlying relation.  However, we are not nearly
- * smart enough to figure out how the restrict clauses might change the
- * distribution, so this will have to do for now.
- *
- * We are passed the number of buckets the executor will use for the given
- * input relation.     If the data were perfectly distributed, with the same
- * number of tuples going into each available bucket, then the bucketsize
- * fraction would be 1/nbuckets.  But this happy state of affairs will occur
- * only if (a) there are at least nbuckets distinct data values, and (b)
- * we have a not-too-skewed data distribution. Otherwise the buckets will
- * be nonuniformly occupied.  If the other relation in the join has a key
- * distribution similar to this one's, then the most-loaded buckets are
- * exactly those that will be probed most often.  Therefore, the "average"
- * bucket size for costing purposes should really be taken as something close
- * to the "worst case" bucket size.  We try to estimate this by adjusting the
- * fraction if there are too few distinct data values, and then scaling up
- * by the ratio of the most common value's frequency to the average frequency.
- *
- * If no statistics are available, use a default estimate of 0.1.  This will
- * discourage use of a hash rather strongly if the inner relation is large,
- * which is what we want.  We do not want to hash unless we know that the
- * inner rel is well-dispersed (or the alternatives seem much worse).
- */
-static Selectivity
-estimate_hash_bucketsize(Query *root, Var *var, int nbuckets)
-{
-       Oid                     relid;
-       RelOptInfo *rel;
-       HeapTuple       tuple;
-       Form_pg_statistic stats;
-       double          estfract,
-                               ndistinct,
-                               mcvfreq,
-                               avgfreq;
-       float4     *numbers;
-       int                     nnumbers;
-
-       /* Ignore any binary-compatible relabeling */
-       if (var && IsA(var, RelabelType))
-               var = (Var *) ((RelabelType *) var)->arg;
-
-       /*
-        * Lookup info about var's relation and attribute; if none available,
-        * return default estimate.
-        */
-       if (var == NULL || !IsA(var, Var))
-               return 0.1;
-
-       relid = getrelid(var->varno, root->rtable);
-       if (relid == InvalidOid)
-               return 0.1;
-
-       rel = find_base_rel(root, var->varno);
-
-       if (rel->tuples <= 0.0 || rel->rows <= 0.0)
-               return 0.1;                             /* ensure we can divide below */
-
-       tuple = SearchSysCache(STATRELATT,
-                                                  ObjectIdGetDatum(relid),
-                                                  Int16GetDatum(var->varattno),
-                                                  0, 0);
-       if (!HeapTupleIsValid(tuple))
-       {
-               /*
-                * If the attribute is known unique because of an index,
-                * we can treat it as well-distributed.
-                */
-               if (has_unique_index(rel, var->varattno))
-                       return 1.0 / (double) nbuckets;
-
-               /*
-                * Perhaps the Var is a system attribute; if so, it will have no
-                * entry in pg_statistic, but we may be able to guess something
-                * about its distribution anyway.
-                */
-               switch (var->varattno)
-               {
-                       case ObjectIdAttributeNumber:
-                       case SelfItemPointerAttributeNumber:
-                               /* these are unique, so buckets should be well-distributed */
-                               return 1.0 / (double) nbuckets;
-                       case TableOidAttributeNumber:
-                               /* hashing this is a terrible idea... */
-                               return 1.0;
-               }
-               return 0.1;
-       }
-       stats = (Form_pg_statistic) GETSTRUCT(tuple);
-
-       /*
-        * Obtain number of distinct data values in raw relation.
-        */
-       ndistinct = stats->stadistinct;
-       if (ndistinct < 0.0)
-               ndistinct = -ndistinct * rel->tuples;
-
-       if (ndistinct <= 0.0)           /* ensure we can divide */
-       {
-               ReleaseSysCache(tuple);
-               return 0.1;
-       }
-
-       /* Also compute avg freq of all distinct data values in raw relation */
-       avgfreq = (1.0 - stats->stanullfrac) / ndistinct;
-
-       /*
-        * Adjust ndistinct to account for restriction clauses.  Observe we
-        * are assuming that the data distribution is affected uniformly by
-        * the restriction clauses!
-        *
-        * XXX Possibly better way, but much more expensive: multiply by
-        * selectivity of rel's restriction clauses that mention the target
-        * Var.
-        */
-       ndistinct *= rel->rows / rel->tuples;
-
-       /*
-        * Initial estimate of bucketsize fraction is 1/nbuckets as long as
-        * the number of buckets is less than the expected number of distinct
-        * values; otherwise it is 1/ndistinct.
-        */
-       if (ndistinct > (double) nbuckets)
-               estfract = 1.0 / (double) nbuckets;
-       else
-               estfract = 1.0 / ndistinct;
-
-       /*
-        * Look up the frequency of the most common value, if available.
-        */
-       mcvfreq = 0.0;
-
-       if (get_attstatsslot(tuple, var->vartype, var->vartypmod,
-                                                STATISTIC_KIND_MCV, InvalidOid,
-                                                NULL, NULL, &numbers, &nnumbers))
-       {
-               /*
-                * The first MCV stat is for the most common value.
-                */
-               if (nnumbers > 0)
-                       mcvfreq = numbers[0];
-               free_attstatsslot(var->vartype, NULL, 0,
-                                                 numbers, nnumbers);
-       }
-
-       /*
-        * Adjust estimated bucketsize upward to account for skewed
-        * distribution.
-        */
-       if (avgfreq > 0.0 && mcvfreq > avgfreq)
-               estfract *= mcvfreq / avgfreq;
-
-       /*
-        * Clamp bucketsize to sane range (the above adjustment could easily
-        * produce an out-of-range result).  We set the lower bound a little
-        * above zero, since zero isn't a very sane result.
-        */
-       if (estfract < 1.0e-6)
-               estfract = 1.0e-6;
-       else if (estfract > 1.0)
-               estfract = 1.0;
-
-       ReleaseSysCache(tuple);
-
-       return (Selectivity) estfract;
-}
-
 
 /*
  * cost_qual_eval
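
The comment block removed above spells out the bucketsize-fraction algorithm; as a compact reference, here is a sketch of just that arithmetic as a standalone helper with invented names, omitting the syscache lookup, system-attribute, and no-stats cases the full function handles:

static double
sketch_bucketsize_fraction(double tuples, double rows, double stadistinct,
						   double stanullfrac, double mcvfreq, int nbuckets)
{
	double		ndistinct = stadistinct;
	double		avgfreq;
	double		estfract;

	/* negative stadistinct means "fraction of rows", per pg_statistic */
	if (ndistinct < 0.0)
		ndistinct = -ndistinct * tuples;
	if (ndistinct <= 0.0 || tuples <= 0.0 || rows <= 0.0)
		return 0.1;				/* default when we cannot divide */

	/* average frequency of distinct values in the raw relation */
	avgfreq = (1.0 - stanullfrac) / ndistinct;

	/* assume restriction clauses thin out distinct values uniformly */
	ndistinct *= rows / tuples;

	/* 1/nbuckets while distinct values outnumber buckets, else 1/ndistinct */
	if (ndistinct > (double) nbuckets)
		estfract = 1.0 / (double) nbuckets;
	else
		estfract = 1.0 / ndistinct;

	/* scale up for skew: most common value's frequency vs. the average */
	if (avgfreq > 0.0 && mcvfreq > avgfreq)
		estfract *= mcvfreq / avgfreq;

	/* clamp to a sane range */
	if (estfract < 1.0e-6)
		estfract = 1.0e-6;
	else if (estfract > 1.0)
		estfract = 1.0;

	return estfract;
}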
index d6d093ea467666acddfbbbfde4f4114c50eac3c4..d5a5480c62e94a3c37411c724db60b9691ff836c 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/optimizer/util/relnode.c,v 1.54 2003/12/08 18:19:58 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/optimizer/util/relnode.c,v 1.55 2004/02/17 00:52:53 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -214,12 +214,8 @@ find_base_rel(Query *root, int relid)
  * find_join_rel
  *       Returns relation entry corresponding to 'relids' (a set of RT indexes),
  *       or NULL if none exists.  This is for join relations.
- *
- * Note: there is probably no good reason for this to be called from
- * anywhere except build_join_rel, but keep it as a separate routine
- * just in case.
  */
-static RelOptInfo *
+RelOptInfo *
 find_join_rel(Query *root, Relids relids)
 {
        List       *joinrels;
index 84f18dc935993a70a20459a4bfe4df0314223f05..054739140970a5de8805e59ee5f37e293d2778bc 100644 (file)
@@ -15,7 +15,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.156 2004/02/02 03:07:08 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.157 2004/02/17 00:52:53 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 /* default selectivity estimate for boolean and null test nodes */
 #define DEFAULT_UNK_SEL                        0.005
 #define DEFAULT_NOT_UNK_SEL            (1.0 - DEFAULT_UNK_SEL)
-#define DEFAULT_BOOL_SEL               0.5
 
 /*
  * Clamp a computed probability estimate (which may suffer from roundoff or
        } while (0)
 
 
-static bool get_var_maximum(Query *root, Var *var, Oid sortop, Datum *max);
+/* Return data from examine_variable and friends */
+typedef struct
+{
+       Node       *var;                        /* the Var or expression tree */
+       RelOptInfo *rel;                        /* Relation, or NULL if not identifiable */
+       HeapTuple       statsTuple;             /* pg_statistic tuple, or NULL if none */
+       /* NB: if statsTuple!=NULL, it must be freed when caller is done */
+       Oid                     atttype;                /* type to pass to get_attstatsslot */
+       int32           atttypmod;              /* typmod to pass to get_attstatsslot */
+       bool            isunique;               /* true if matched to a unique index */
+} VariableStatData;
+
+#define ReleaseVariableStats(vardata)  \
+       do { \
+               if (HeapTupleIsValid((vardata).statsTuple)) \
+                       ReleaseSysCache((vardata).statsTuple); \
+       } while(0)
+
+
 static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
                                  Datum lobound, Datum hibound, Oid boundstypid,
                                  double *scaledlobound, double *scaledhibound);
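
For reference, the typical life cycle of the struct added above, as used by booltestsel and nulltestsel later in this file (the wrapper function name is invented, and freq_null stands in for whatever the caller actually computes):

static double
example_null_fraction(Query *root, Node *arg, int varRelid)
{
	VariableStatData vardata;
	double		freq_null = 0.0;

	/* arg may be a simple Var or an arbitrary expression tree */
	examine_variable(root, arg, varRelid, &vardata);

	if (HeapTupleIsValid(vardata.statsTuple))
	{
		Form_pg_statistic stats;

		stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
		freq_null = stats->stanullfrac;
		/* further stats come from get_attstatsslot() using
		 * vardata.atttype and vardata.atttypmod */
	}

	ReleaseVariableStats(vardata);		/* drops the syscache tuple, if any */

	return freq_null;
}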
@@ -174,13 +191,18 @@ static double convert_one_bytea_to_scalar(unsigned char *value, int valuelen,
                                                        int rangelo, int rangehi);
 static unsigned char *convert_string_datum(Datum value, Oid typid);
 static double convert_timevalue_to_scalar(Datum value, Oid typid);
-static double get_att_numdistinct(Query *root, Var *var,
-                                       Form_pg_statistic stats);
-static bool get_restriction_var(List *args, int varRelid,
-                                       Var **var, Node **other,
+static bool get_restriction_variable(Query *root, List *args, int varRelid,
+                                       VariableStatData *vardata, Node **other,
                                        bool *varonleft);
-static void get_join_vars(List *args, Var **var1, Var **var2);
-static Selectivity prefix_selectivity(Query *root, Var *var,
+static void get_join_variables(Query *root, List *args,
+                                                          VariableStatData *vardata1,
+                                                          VariableStatData *vardata2);
+static void examine_variable(Query *root, Node *node, int varRelid,
+                                                        VariableStatData *vardata);
+static double get_variable_numdistinct(VariableStatData *vardata);
+static bool get_variable_maximum(Query *root, VariableStatData *vardata,
+                                                                Oid sortop, Datum *max);
+static Selectivity prefix_selectivity(Query *root, VariableStatData *vardata,
                                   Oid opclass, Const *prefix);
 static Selectivity pattern_selectivity(Const *patt, Pattern_Type ptype);
 static Datum string_to_datum(const char *str, Oid datatype);
@@ -203,11 +225,9 @@ eqsel(PG_FUNCTION_ARGS)
        Oid                     operator = PG_GETARG_OID(1);
        List       *args = (List *) PG_GETARG_POINTER(2);
        int                     varRelid = PG_GETARG_INT32(3);
-       Var                *var;
+       VariableStatData vardata;
        Node       *other;
        bool            varonleft;
-       Oid                     relid;
-       HeapTuple       statsTuple;
        Datum      *values;
        int                     nvalues;
        float4     *numbers;
@@ -215,15 +235,11 @@ eqsel(PG_FUNCTION_ARGS)
        double          selec;
 
        /*
-        * If expression is not var = something or something = var for a
-        * simple var of a real relation (no subqueries, for now), then punt
-        * and return a default estimate.
+        * If expression is not variable = something or something = variable,
+        * then punt and return a default estimate.
         */
-       if (!get_restriction_var(args, varRelid,
-                                                        &var, &other, &varonleft))
-               PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
-       relid = getrelid(var->varno, root->rtable);
-       if (relid == InvalidOid)
+       if (!get_restriction_variable(root, args, varRelid,
+                                                                 &vardata, &other, &varonleft))
                PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
 
        /*
@@ -232,22 +248,20 @@ eqsel(PG_FUNCTION_ARGS)
         */
        if (IsA(other, Const) &&
                ((Const *) other)->constisnull)
+       {
+               ReleaseVariableStats(vardata);
                PG_RETURN_FLOAT8(0.0);
+       }
 
-       /* get stats for the attribute, if available */
-       statsTuple = SearchSysCache(STATRELATT,
-                                                               ObjectIdGetDatum(relid),
-                                                               Int16GetDatum(var->varattno),
-                                                               0, 0);
-       if (HeapTupleIsValid(statsTuple))
+       if (HeapTupleIsValid(vardata.statsTuple))
        {
                Form_pg_statistic stats;
 
-               stats = (Form_pg_statistic) GETSTRUCT(statsTuple);
+               stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
 
                if (IsA(other, Const))
                {
-                       /* Var is being compared to a known non-null constant */
+                       /* Variable is being compared to a known non-null constant */
                        Datum           constval = ((Const *) other)->constvalue;
                        bool            match = false;
                        int                     i;
@@ -259,7 +273,8 @@ eqsel(PG_FUNCTION_ARGS)
                         * an appropriate test.  If you don't like this, maybe you
                         * shouldn't be using eqsel for your operator...)
                         */
-                       if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
+                       if (get_attstatsslot(vardata.statsTuple,
+                                                                vardata.atttype, vardata.atttypmod,
                                                                 STATISTIC_KIND_MCV, InvalidOid,
                                                                 &values, &nvalues,
                                                                 &numbers, &nnumbers))
@@ -321,7 +336,7 @@ eqsel(PG_FUNCTION_ARGS)
                                 * remaining fraction equally, so we divide by the number
                                 * of other distinct values.
                                 */
-                               otherdistinct = get_att_numdistinct(root, var, stats)
+                               otherdistinct = get_variable_numdistinct(&vardata)
                                        - nnumbers;
                                if (otherdistinct > 1)
                                        selec /= otherdistinct;
@@ -334,7 +349,7 @@ eqsel(PG_FUNCTION_ARGS)
                                        selec = numbers[nnumbers - 1];
                        }
 
-                       free_attstatsslot(var->vartype, values, nvalues,
+                       free_attstatsslot(vardata.atttype, values, nvalues,
                                                          numbers, nnumbers);
                }
                else
@@ -352,7 +367,7 @@ eqsel(PG_FUNCTION_ARGS)
                         * frequency in the table.      Is that a good idea?)
                         */
                        selec = 1.0 - stats->stanullfrac;
-                       ndistinct = get_att_numdistinct(root, var, stats);
+                       ndistinct = get_variable_numdistinct(&vardata);
                        if (ndistinct > 1)
                                selec /= ndistinct;
 
@@ -360,18 +375,17 @@ eqsel(PG_FUNCTION_ARGS)
                         * Cross-check: selectivity should never be estimated as more
                         * than the most common value's.
                         */
-                       if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
+                       if (get_attstatsslot(vardata.statsTuple,
+                                                                vardata.atttype, vardata.atttypmod,
                                                                 STATISTIC_KIND_MCV, InvalidOid,
                                                                 NULL, NULL,
                                                                 &numbers, &nnumbers))
                        {
                                if (nnumbers > 0 && selec > numbers[0])
                                        selec = numbers[0];
-                               free_attstatsslot(var->vartype, NULL, 0, numbers, nnumbers);
+                               free_attstatsslot(vardata.atttype, NULL, 0, numbers, nnumbers);
                        }
                }
-
-               ReleaseSysCache(statsTuple);
        }
        else
        {
@@ -381,9 +395,11 @@ eqsel(PG_FUNCTION_ARGS)
                 * equally common.      (The guess is unlikely to be very good, but we
                 * do know a few special cases.)
                 */
-               selec = 1.0 / get_att_numdistinct(root, var, NULL);
+               selec = 1.0 / get_variable_numdistinct(&vardata);
        }
 
+       ReleaseVariableStats(vardata);
+
        /* result should be in range, but make sure... */
        CLAMP_PROBABILITY(selec);
 
@@ -433,7 +449,7 @@ neqsel(PG_FUNCTION_ARGS)
  *     scalarineqsel           - Selectivity of "<", "<=", ">", ">=" for scalars.
  *
  * This is the guts of both scalarltsel and scalargtsel.  The caller has
- * commuted the clause, if necessary, so that we can treat the Var as
+ * commuted the clause, if necessary, so that we can treat the variable as
  * being on the left.  The caller must also make sure that the other side
  * of the clause is a non-null Const, and dissect same into a value and
  * datatype.
@@ -444,10 +460,8 @@ neqsel(PG_FUNCTION_ARGS)
  */
 static double
 scalarineqsel(Query *root, Oid operator, bool isgt,
-                         Var *var, Datum constval, Oid consttype)
+                         VariableStatData *vardata, Datum constval, Oid consttype)
 {
-       Oid                     relid;
-       HeapTuple       statsTuple;
        Form_pg_statistic stats;
        FmgrInfo        opproc;
        Datum      *values;
@@ -460,26 +474,12 @@ scalarineqsel(Query *root, Oid operator, bool isgt,
        double          selec;
        int                     i;
 
-       /*
-        * If expression is not var op something or something op var for a
-        * simple var of a real relation (no subqueries, for now), then punt
-        * and return a default estimate.
-        */
-       relid = getrelid(var->varno, root->rtable);
-       if (relid == InvalidOid)
-               return DEFAULT_INEQ_SEL;
-
-       /* get stats for the attribute */
-       statsTuple = SearchSysCache(STATRELATT,
-                                                               ObjectIdGetDatum(relid),
-                                                               Int16GetDatum(var->varattno),
-                                                               0, 0);
-       if (!HeapTupleIsValid(statsTuple))
+       if (!HeapTupleIsValid(vardata->statsTuple))
        {
                /* no stats available, so default result */
                return DEFAULT_INEQ_SEL;
        }
-       stats = (Form_pg_statistic) GETSTRUCT(statsTuple);
+       stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
 
        fmgr_info(get_opcode(operator), &opproc);
 
@@ -492,7 +492,8 @@ scalarineqsel(Query *root, Oid operator, bool isgt,
        mcv_selec = 0.0;
        sumcommon = 0.0;
 
-       if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
+       if (get_attstatsslot(vardata->statsTuple,
+                                                vardata->atttype, vardata->atttypmod,
                                                 STATISTIC_KIND_MCV, InvalidOid,
                                                 &values, &nvalues,
                                                 &numbers, &nnumbers))
@@ -505,7 +506,8 @@ scalarineqsel(Query *root, Oid operator, bool isgt,
                                mcv_selec += numbers[i];
                        sumcommon += numbers[i];
                }
-               free_attstatsslot(var->vartype, values, nvalues, numbers, nnumbers);
+               free_attstatsslot(vardata->atttype, values, nvalues,
+                                                 numbers, nnumbers);
        }
 
        /*
@@ -523,7 +525,8 @@ scalarineqsel(Query *root, Oid operator, bool isgt,
         */
        hist_selec = 0.0;
 
-       if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
+       if (get_attstatsslot(vardata->statsTuple,
+                                                vardata->atttype, vardata->atttypmod,
                                                 STATISTIC_KIND_HISTOGRAM, InvalidOid,
                                                 &values, &nvalues,
                                                 NULL, NULL))
@@ -582,7 +585,7 @@ scalarineqsel(Query *root, Oid operator, bool isgt,
                                         */
                                        if (convert_to_scalar(constval, consttype, &val,
                                                                                  values[i - 1], values[i],
-                                                                                 var->vartype,
+                                                                                 vardata->atttype,
                                                                                  &low, &high))
                                        {
                                                if (high <= low)
@@ -653,7 +656,7 @@ scalarineqsel(Query *root, Oid operator, bool isgt,
                                hist_selec = 0.9999;
                }
 
-               free_attstatsslot(var->vartype, values, nvalues, NULL, 0);
+               free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
        }
 
        /*
@@ -676,8 +679,6 @@ scalarineqsel(Query *root, Oid operator, bool isgt,
 
        selec += mcv_selec;
 
-       ReleaseSysCache(statsTuple);
-
        /* result should be in range, but make sure... */
        CLAMP_PROBABILITY(selec);
 
@@ -694,21 +695,20 @@ scalarltsel(PG_FUNCTION_ARGS)
        Oid                     operator = PG_GETARG_OID(1);
        List       *args = (List *) PG_GETARG_POINTER(2);
        int                     varRelid = PG_GETARG_INT32(3);
-       Var                *var;
+       VariableStatData vardata;
        Node       *other;
+       bool            varonleft;
        Datum           constval;
        Oid                     consttype;
-       bool            varonleft;
        bool            isgt;
        double          selec;
 
        /*
-        * If expression is not var op something or something op var for a
-        * simple var of a real relation (no subqueries, for now), then punt
-        * and return a default estimate.
+        * If expression is not variable op something or something op variable,
+        * then punt and return a default estimate.
         */
-       if (!get_restriction_var(args, varRelid,
-                                                        &var, &other, &varonleft))
+       if (!get_restriction_variable(root, args, varRelid,
+                                                                 &vardata, &other, &varonleft))
                PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 
        /*
@@ -716,14 +716,20 @@ scalarltsel(PG_FUNCTION_ARGS)
         * either.
         */
        if (!IsA(other, Const))
+       {
+               ReleaseVariableStats(vardata);
                PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
+       }
 
        /*
         * If the constant is NULL, assume operator is strict and return zero,
         * ie, operator will never return TRUE.
         */
        if (((Const *) other)->constisnull)
+       {
+               ReleaseVariableStats(vardata);
                PG_RETURN_FLOAT8(0.0);
+       }
        constval = ((Const *) other)->constvalue;
        consttype = ((Const *) other)->consttype;
 
@@ -742,12 +748,15 @@ scalarltsel(PG_FUNCTION_ARGS)
                if (!operator)
                {
                        /* Use default selectivity (should we raise an error instead?) */
+                       ReleaseVariableStats(vardata);
                        PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
                }
                isgt = true;
        }
 
-       selec = scalarineqsel(root, operator, isgt, var, constval, consttype);
+       selec = scalarineqsel(root, operator, isgt, &vardata, constval, consttype);
+
+       ReleaseVariableStats(vardata);
 
        PG_RETURN_FLOAT8((float8) selec);
 }
@@ -762,21 +771,20 @@ scalargtsel(PG_FUNCTION_ARGS)
        Oid                     operator = PG_GETARG_OID(1);
        List       *args = (List *) PG_GETARG_POINTER(2);
        int                     varRelid = PG_GETARG_INT32(3);
-       Var                *var;
+       VariableStatData vardata;
        Node       *other;
+       bool            varonleft;
        Datum           constval;
        Oid                     consttype;
-       bool            varonleft;
        bool            isgt;
        double          selec;
 
        /*
-        * If expression is not var op something or something op var for a
-        * simple var of a real relation (no subqueries, for now), then punt
-        * and return a default estimate.
+        * If expression is not variable op something or something op variable,
+        * then punt and return a default estimate.
         */
-       if (!get_restriction_var(args, varRelid,
-                                                        &var, &other, &varonleft))
+       if (!get_restriction_variable(root, args, varRelid,
+                                                                 &vardata, &other, &varonleft))
                PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 
        /*
@@ -784,14 +792,20 @@ scalargtsel(PG_FUNCTION_ARGS)
         * either.
         */
        if (!IsA(other, Const))
+       {
+               ReleaseVariableStats(vardata);
                PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
+       }
 
        /*
         * If the constant is NULL, assume operator is strict and return zero,
         * ie, operator will never return TRUE.
         */
        if (((Const *) other)->constisnull)
+       {
+               ReleaseVariableStats(vardata);
                PG_RETURN_FLOAT8(0.0);
+       }
        constval = ((Const *) other)->constvalue;
        consttype = ((Const *) other)->consttype;
 
@@ -810,12 +824,15 @@ scalargtsel(PG_FUNCTION_ARGS)
                if (!operator)
                {
                        /* Use default selectivity (should we raise an error instead?) */
+                       ReleaseVariableStats(vardata);
                        PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
                }
                isgt = false;
        }
 
-       selec = scalarineqsel(root, operator, isgt, var, constval, consttype);
+       selec = scalarineqsel(root, operator, isgt, &vardata, constval, consttype);
+
+       ReleaseVariableStats(vardata);
 
        PG_RETURN_FLOAT8((float8) selec);
 }
@@ -833,10 +850,9 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
 #endif
        List       *args = (List *) PG_GETARG_POINTER(2);
        int                     varRelid = PG_GETARG_INT32(3);
-       Var                *var;
+       VariableStatData vardata;
        Node       *other;
        bool            varonleft;
-       Oid                     relid;
        Datum           constval;
        Oid                     consttype;
        Oid                     vartype;
@@ -848,25 +864,27 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
        double          result;
 
        /*
-        * If expression is not var op constant for a simple var of a real
-        * relation (no subqueries, for now), then punt and return a default
-        * estimate.
+        * If expression is not variable op constant, then punt and return a
+        * default estimate.
         */
-       if (!get_restriction_var(args, varRelid,
-                                                        &var, &other, &varonleft))
+       if (!get_restriction_variable(root, args, varRelid,
+                                                                 &vardata, &other, &varonleft))
                return DEFAULT_MATCH_SEL;
        if (!varonleft || !IsA(other, Const))
+       {
+               ReleaseVariableStats(vardata);
                return DEFAULT_MATCH_SEL;
-       relid = getrelid(var->varno, root->rtable);
-       if (relid == InvalidOid)
-               return DEFAULT_MATCH_SEL;
+       }
 
        /*
         * If the constant is NULL, assume operator is strict and return zero,
         * ie, operator will never return TRUE.
         */
        if (((Const *) other)->constisnull)
+       {
+               ReleaseVariableStats(vardata);
                return 0.0;
+       }
        constval = ((Const *) other)->constvalue;
        consttype = ((Const *) other)->consttype;
 
@@ -877,14 +895,17 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
         * match the operator's declared type.
         */
        if (consttype != TEXTOID && consttype != BYTEAOID)
+       {
+               ReleaseVariableStats(vardata);
                return DEFAULT_MATCH_SEL;
+       }
 
        /*
         * The var, on the other hand, might be a binary-compatible type;
         * particularly a domain.  Try to fold it if it's not recognized
         * immediately.
         */
-       vartype = var->vartype;
+       vartype = vardata.atttype;
        if (vartype != consttype)
                vartype = getBaseType(vartype);
 
@@ -915,6 +936,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
                        opclass = BYTEA_BTREE_OPS_OID;
                        break;
                default:
+                       ReleaseVariableStats(vardata);
                        return DEFAULT_MATCH_SEL;
        }
 
@@ -943,6 +965,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
                        default:
                                elog(ERROR, "unrecognized consttype: %u",
                                         prefix->consttype);
+                               ReleaseVariableStats(vardata);
                                return DEFAULT_MATCH_SEL;
                }
                prefix = string_to_const(prefixstr, vartype);
@@ -960,7 +983,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
 
                if (eqopr == InvalidOid)
                        elog(ERROR, "no = operator for opclass %u", opclass);
-               eqargs = makeList2(var, prefix);
+               eqargs = makeList2(vardata.var, prefix);
                result = DatumGetFloat8(DirectFunctionCall4(eqsel,
                                                                                                        PointerGetDatum(root),
                                                                                                 ObjectIdGetDatum(eqopr),
@@ -979,7 +1002,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
                Selectivity selec;
 
                if (pstatus == Pattern_Prefix_Partial)
-                       prefixsel = prefix_selectivity(root, var, opclass, prefix);
+                       prefixsel = prefix_selectivity(root, &vardata, opclass, prefix);
                else
                        prefixsel = 1.0;
                restsel = pattern_selectivity(rest, ptype);
@@ -995,6 +1018,8 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
                pfree(prefix);
        }
 
+       ReleaseVariableStats(vardata);
+
        return result;
 }
 
@@ -1093,80 +1118,25 @@ Selectivity
 booltestsel(Query *root, BoolTestType booltesttype, Node *arg,
                        int varRelid, JoinType jointype)
 {
-       Var                *var;
-       Oid                     relid;
-       HeapTuple       statsTuple;
-       Datum      *values;
-       int                     nvalues;
-       float4     *numbers;
-       int                     nnumbers;
+       VariableStatData vardata;
        double          selec;
 
-       /*
-        * Ignore any binary-compatible relabeling (probably unnecessary, but
-        * can't hurt)
-        */
-       if (IsA(arg, RelabelType))
-               arg = (Node *) ((RelabelType *) arg)->arg;
-
-       if (IsA(arg, Var) &&
-               (varRelid == 0 || varRelid == ((Var *) arg)->varno))
-               var = (Var *) arg;
-       else
-       {
-               /*
-                * If argument is not a Var, we can't get statistics for it, but
-                * perhaps clause_selectivity can do something with it.  We ignore
-                * the possibility of a NULL value when using clause_selectivity,
-                * and just assume the value is either TRUE or FALSE.
-                */
-               switch (booltesttype)
-               {
-                       case IS_UNKNOWN:
-                               selec = DEFAULT_UNK_SEL;
-                               break;
-                       case IS_NOT_UNKNOWN:
-                               selec = DEFAULT_NOT_UNK_SEL;
-                               break;
-                       case IS_TRUE:
-                       case IS_NOT_FALSE:
-                               selec = (double) clause_selectivity(root, arg,
-                                                                                                       varRelid, jointype);
-                               break;
-                       case IS_FALSE:
-                       case IS_NOT_TRUE:
-                               selec = 1.0 - (double) clause_selectivity(root, arg,
-                                                                                                        varRelid, jointype);
-                               break;
-                       default:
-                               elog(ERROR, "unrecognized booltesttype: %d",
-                                        (int) booltesttype);
-                               selec = 0.0;    /* Keep compiler quiet */
-                               break;
-               }
-               return (Selectivity) selec;
-       }
-
-       /* get stats for the attribute, if available */
-       relid = getrelid(var->varno, root->rtable);
-       if (relid == InvalidOid)
-               statsTuple = NULL;
-       else
-               statsTuple = SearchSysCache(STATRELATT,
-                                                                       ObjectIdGetDatum(relid),
-                                                                       Int16GetDatum(var->varattno),
-                                                                       0, 0);
+       examine_variable(root, arg, varRelid, &vardata);
 
-       if (HeapTupleIsValid(statsTuple))
+       if (HeapTupleIsValid(vardata.statsTuple))
        {
                Form_pg_statistic stats;
                double          freq_null;
+               Datum      *values;
+               int                     nvalues;
+               float4     *numbers;
+               int                     nnumbers;
 
-               stats = (Form_pg_statistic) GETSTRUCT(statsTuple);
-
+               stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
                freq_null = stats->stanullfrac;
 
-               if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
+               if (get_attstatsslot(vardata.statsTuple,
+                                                        vardata.atttype, vardata.atttypmod,
                                                         STATISTIC_KIND_MCV, InvalidOid,
                                                         &values, &nvalues,
                                                         &numbers, &nnumbers)
@@ -1184,7 +1154,7 @@ booltestsel(Query *root, BoolTestType booltesttype, Node *arg,
                                freq_true = 1.0 - numbers[0] - freq_null;
 
                        /*
-                        * Next derive freqency for false. Then use these as
+                        * Next derive frequency for false. Then use these as
                         * appropriate to derive frequency for each case.
                         */
                        freq_false = 1.0 - freq_true - freq_null;
@@ -1222,7 +1192,7 @@ booltestsel(Query *root, BoolTestType booltesttype, Node *arg,
                                        break;
                        }
 
-                       free_attstatsslot(var->vartype, values, nvalues,
+                       free_attstatsslot(vardata.atttype, values, nvalues,
                                                          numbers, nnumbers);
                }
                else
@@ -1263,14 +1233,14 @@ booltestsel(Query *root, BoolTestType booltesttype, Node *arg,
                                        break;
                        }
                }
-
-               ReleaseSysCache(statsTuple);
        }
        else
        {
                /*
-                * No VACUUM ANALYZE stats available, so use a default value.
-                * (Note: not much point in recursing to clause_selectivity here.)
+                * If we can't get variable statistics for the argument, perhaps
+                * clause_selectivity can do something with it.  We ignore
+                * the possibility of a NULL value when using clause_selectivity,
+                * and just assume the value is either TRUE or FALSE.
                 */
                switch (booltesttype)
                {
@@ -1281,10 +1251,14 @@ booltestsel(Query *root, BoolTestType booltesttype, Node *arg,
                                selec = DEFAULT_NOT_UNK_SEL;
                                break;
                        case IS_TRUE:
-                       case IS_NOT_TRUE:
-                       case IS_FALSE:
                        case IS_NOT_FALSE:
-                               selec = DEFAULT_BOOL_SEL;
+                               selec = (double) clause_selectivity(root, arg,
+                                                                                                       varRelid, jointype);
+                               break;
+                       case IS_FALSE:
+                       case IS_NOT_TRUE:
+                               selec = 1.0 - (double) clause_selectivity(root, arg,
+                                                                                                                 varRelid, jointype);
                                break;
                        default:
                                elog(ERROR, "unrecognized booltesttype: %d",
@@ -1294,6 +1268,8 @@ booltestsel(Query *root, BoolTestType booltesttype, Node *arg,
                }
        }
 
+       ReleaseVariableStats(vardata);
+
        /* result should be in range, but make sure... */
        CLAMP_PROBABILITY(selec);
 
@@ -1306,56 +1282,17 @@ booltestsel(Query *root, BoolTestType booltesttype, Node *arg,
 Selectivity
 nulltestsel(Query *root, NullTestType nulltesttype, Node *arg, int varRelid)
 {
-       Var                *var;
-       Oid                     relid;
-       HeapTuple       statsTuple;
+       VariableStatData vardata;
        double          selec;
-       double          defselec;
-       double          freq_null;
-
-       switch (nulltesttype)
-       {
-               case IS_NULL:
-                       defselec = DEFAULT_UNK_SEL;
-                       break;
-               case IS_NOT_NULL:
-                       defselec = DEFAULT_NOT_UNK_SEL;
-                       break;
-               default:
-                       elog(ERROR, "unrecognized nulltesttype: %d",
-                                (int) nulltesttype);
-                       return (Selectivity) 0;         /* keep compiler quiet */
-       }
-
-       /*
-        * Ignore any binary-compatible relabeling
-        */
-       if (IsA(arg, RelabelType))
-               arg = (Node *) ((RelabelType *) arg)->arg;
-
-       if (IsA(arg, Var) &&
-               (varRelid == 0 || varRelid == ((Var *) arg)->varno))
-               var = (Var *) arg;
-       else
-       {
-               /* punt if non-Var argument */
-               return (Selectivity) defselec;
-       }
 
-       relid = getrelid(var->varno, root->rtable);
-       if (relid == InvalidOid)
-               return (Selectivity) defselec;
+       examine_variable(root, arg, varRelid, &vardata);
 
-       /* get stats for the attribute, if available */
-       statsTuple = SearchSysCache(STATRELATT,
-                                                               ObjectIdGetDatum(relid),
-                                                               Int16GetDatum(var->varattno),
-                                                               0, 0);
-       if (HeapTupleIsValid(statsTuple))
+       if (HeapTupleIsValid(vardata.statsTuple))
        {
                Form_pg_statistic stats;
+               double          freq_null;
 
-               stats = (Form_pg_statistic) GETSTRUCT(statsTuple);
+               stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
                freq_null = stats->stanullfrac;
 
                switch (nulltesttype)
@@ -1380,17 +1317,29 @@ nulltestsel(Query *root, NullTestType nulltesttype, Node *arg, int varRelid)
                                         (int) nulltesttype);
                                return (Selectivity) 0; /* keep compiler quiet */
                }
-
-               ReleaseSysCache(statsTuple);
        }
        else
        {
                /*
                 * No VACUUM ANALYZE stats available, so make a guess
                 */
-               selec = defselec;
+               switch (nulltesttype)
+               {
+                       case IS_NULL:
+                               selec = DEFAULT_UNK_SEL;
+                               break;
+                       case IS_NOT_NULL:
+                               selec = DEFAULT_NOT_UNK_SEL;
+                               break;
+                       default:
+                               elog(ERROR, "unrecognized nulltesttype: %d",
+                                        (int) nulltesttype);
+                               return (Selectivity) 0;         /* keep compiler quiet */
+               }
        }
 
+       ReleaseVariableStats(vardata);
+
        /* result should be in range, but make sure... */
        CLAMP_PROBABILITY(selec);
 
@@ -1407,293 +1356,257 @@ eqjoinsel(PG_FUNCTION_ARGS)
        Oid                     operator = PG_GETARG_OID(1);
        List       *args = (List *) PG_GETARG_POINTER(2);
        JoinType        jointype = (JoinType) PG_GETARG_INT16(3);
-       Var                *var1;
-       Var                *var2;
        double          selec;
+       VariableStatData vardata1;
+       VariableStatData vardata2;
+       double          nd1;
+       double          nd2;
+       Form_pg_statistic stats1 = NULL;
+       Form_pg_statistic stats2 = NULL;
+       bool            have_mcvs1 = false;
+       Datum      *values1 = NULL;
+       int                     nvalues1 = 0;
+       float4     *numbers1 = NULL;
+       int                     nnumbers1 = 0;
+       bool            have_mcvs2 = false;
+       Datum      *values2 = NULL;
+       int                     nvalues2 = 0;
+       float4     *numbers2 = NULL;
+       int                     nnumbers2 = 0;
+
+       get_join_variables(root, args, &vardata1, &vardata2);
+
+       nd1 = get_variable_numdistinct(&vardata1);
+       nd2 = get_variable_numdistinct(&vardata2);
+
+       if (HeapTupleIsValid(vardata1.statsTuple))
+       {
+               stats1 = (Form_pg_statistic) GETSTRUCT(vardata1.statsTuple);
+               have_mcvs1 = get_attstatsslot(vardata1.statsTuple,
+                                                                         vardata1.atttype,
+                                                                         vardata1.atttypmod,
+                                                                         STATISTIC_KIND_MCV,
+                                                                         InvalidOid,
+                                                                         &values1, &nvalues1,
+                                                                         &numbers1, &nnumbers1);
+       }
 
-       get_join_vars(args, &var1, &var2);
+       if (HeapTupleIsValid(vardata2.statsTuple))
+       {
+               stats2 = (Form_pg_statistic) GETSTRUCT(vardata2.statsTuple);
+               have_mcvs2 = get_attstatsslot(vardata2.statsTuple,
+                                                                         vardata2.atttype,
+                                                                         vardata2.atttypmod,
+                                                                         STATISTIC_KIND_MCV,
+                                                                         InvalidOid,
+                                                                         &values2, &nvalues2,
+                                                                         &numbers2, &nnumbers2);
+       }
 
-       if (var1 == NULL && var2 == NULL)
-               selec = DEFAULT_EQ_SEL;
-       else
+       if (have_mcvs1 && have_mcvs2)
        {
-               HeapTuple       statsTuple1 = NULL;
-               HeapTuple       statsTuple2 = NULL;
-               Form_pg_statistic stats1 = NULL;
-               Form_pg_statistic stats2 = NULL;
-               double          nd1 = DEFAULT_NUM_DISTINCT;
-               double          nd2 = DEFAULT_NUM_DISTINCT;
-               bool            have_mcvs1 = false;
-               Datum      *values1 = NULL;
-               int                     nvalues1 = 0;
-               float4     *numbers1 = NULL;
-               int                     nnumbers1 = 0;
-               bool            have_mcvs2 = false;
-               Datum      *values2 = NULL;
-               int                     nvalues2 = 0;
-               float4     *numbers2 = NULL;
-               int                     nnumbers2 = 0;
-
-               if (var1 != NULL)
-               {
-                       /* get stats for the attribute, if available */
-                       Oid                     relid1 = getrelid(var1->varno, root->rtable);
+               /*
+                * We have most-common-value lists for both relations.  Run
+                * through the lists to see which MCVs actually join to each
+                * other with the given operator.  This allows us to determine
+                * the exact join selectivity for the portion of the relations
+                * represented by the MCV lists.  We still have to estimate
+                * for the remaining population, but in a skewed distribution
+                * this gives us a big leg up in accuracy.      For motivation see
+                * the analysis in Y. Ioannidis and S. Christodoulakis, "On
+                * the propagation of errors in the size of join results",
+                * Technical Report 1018, Computer Science Dept., University
+                * of Wisconsin, Madison, March 1991 (available from
+                * ftp.cs.wisc.edu).
+                */
+               FmgrInfo        eqproc;
+               bool       *hasmatch1;
+               bool       *hasmatch2;
+               double          nullfrac1 = stats1->stanullfrac;
+               double          nullfrac2 = stats2->stanullfrac;
+               double          matchprodfreq,
+                                       matchfreq1,
+                                       matchfreq2,
+                                       unmatchfreq1,
+                                       unmatchfreq2,
+                                       otherfreq1,
+                                       otherfreq2,
+                                       totalsel1,
+                                       totalsel2;
+               int                     i,
+                                       nmatches;
+
+               fmgr_info(get_opcode(operator), &eqproc);
+               hasmatch1 = (bool *) palloc0(nvalues1 * sizeof(bool));
+               hasmatch2 = (bool *) palloc0(nvalues2 * sizeof(bool));
 
-                       if (relid1 != InvalidOid)
-                       {
-                               statsTuple1 = SearchSysCache(STATRELATT,
-                                                                                        ObjectIdGetDatum(relid1),
-                                                                                  Int16GetDatum(var1->varattno),
-                                                                                        0, 0);
-                               if (HeapTupleIsValid(statsTuple1))
-                               {
-                                       stats1 = (Form_pg_statistic) GETSTRUCT(statsTuple1);
-                                       have_mcvs1 = get_attstatsslot(statsTuple1,
-                                                                                                 var1->vartype,
-                                                                                                 var1->vartypmod,
-                                                                                                 STATISTIC_KIND_MCV,
-                                                                                                 InvalidOid,
-                                                                                                 &values1, &nvalues1,
-                                                                                                 &numbers1, &nnumbers1);
-                               }
+               /*
+                * If we are doing any variant of JOIN_IN, pretend all the
+                * values of the righthand relation are unique (ie, act as if
+                * it's been DISTINCT'd).
+                *
+                * NOTE: it might seem that we should unique-ify the lefthand
+                * input when considering JOIN_REVERSE_IN.      But this is not
+                * so, because the join clause we've been handed has not been
+                * commuted from the way the parser originally wrote it.  We
+                * know that the unique side of the IN clause is *always* on
+                * the right.
+                *
+                * NOTE: it would be dangerous to try to be smart about JOIN_LEFT
+                * or JOIN_RIGHT here, because we do not have enough
+                * information to determine which var is really on which side
+                * of the join. Perhaps someday we should pass in more
+                * information.
+                */
+               if (jointype == JOIN_IN ||
+                       jointype == JOIN_REVERSE_IN ||
+                       jointype == JOIN_UNIQUE_INNER ||
+                       jointype == JOIN_UNIQUE_OUTER)
+               {
+                       float4          oneovern = 1.0 / nd2;
 
-                               nd1 = get_att_numdistinct(root, var1, stats1);
-                       }
+                       for (i = 0; i < nvalues2; i++)
+                               numbers2[i] = oneovern;
+                       nullfrac2 = oneovern;
                }
 
-               if (var2 != NULL)
+               /*
+                * Note we assume that each MCV will match at most one member
+                * of the other MCV list.  If the operator isn't really
+                * equality, there could be multiple matches --- but we don't
+                * look for them, both for speed and because the math wouldn't
+                * add up...
+                */
+               matchprodfreq = 0.0;
+               nmatches = 0;
+               for (i = 0; i < nvalues1; i++)
                {
-                       /* get stats for the attribute, if available */
-                       Oid                     relid2 = getrelid(var2->varno, root->rtable);
+                       int                     j;
 
-                       if (relid2 != InvalidOid)
+                       for (j = 0; j < nvalues2; j++)
                        {
-                               statsTuple2 = SearchSysCache(STATRELATT,
-                                                                                        ObjectIdGetDatum(relid2),
-                                                                                  Int16GetDatum(var2->varattno),
-                                                                                        0, 0);
-                               if (HeapTupleIsValid(statsTuple2))
+                               if (hasmatch2[j])
+                                       continue;
+                               if (DatumGetBool(FunctionCall2(&eqproc,
+                                                                                          values1[i],
+                                                                                          values2[j])))
                                {
-                                       stats2 = (Form_pg_statistic) GETSTRUCT(statsTuple2);
-                                       have_mcvs2 = get_attstatsslot(statsTuple2,
-                                                                                                 var2->vartype,
-                                                                                                 var2->vartypmod,
-                                                                                                 STATISTIC_KIND_MCV,
-                                                                                                 InvalidOid,
-                                                                                                 &values2, &nvalues2,
-                                                                                                 &numbers2, &nnumbers2);
+                                       hasmatch1[i] = hasmatch2[j] = true;
+                                       matchprodfreq += numbers1[i] * numbers2[j];
+                                       nmatches++;
+                                       break;
                                }
-
-                               nd2 = get_att_numdistinct(root, var2, stats2);
                        }
                }
-
-               if (have_mcvs1 && have_mcvs2)
+               CLAMP_PROBABILITY(matchprodfreq);
+               /* Sum up frequencies of matched and unmatched MCVs */
+               matchfreq1 = unmatchfreq1 = 0.0;
+               for (i = 0; i < nvalues1; i++)
                {
-                       /*
-                        * We have most-common-value lists for both relations.  Run
-                        * through the lists to see which MCVs actually join to each
-                        * other with the given operator.  This allows us to determine
-                        * the exact join selectivity for the portion of the relations
-                        * represented by the MCV lists.  We still have to estimate
-                        * for the remaining population, but in a skewed distribution
-                        * this gives us a big leg up in accuracy.      For motivation see
-                        * the analysis in Y. Ioannidis and S. Christodoulakis, "On
-                        * the propagation of errors in the size of join results",
-                        * Technical Report 1018, Computer Science Dept., University
-                        * of Wisconsin, Madison, March 1991 (available from
-                        * ftp.cs.wisc.edu).
-                        */
-                       FmgrInfo        eqproc;
-                       bool       *hasmatch1;
-                       bool       *hasmatch2;
-                       double          nullfrac1 = stats1->stanullfrac;
-                       double          nullfrac2 = stats2->stanullfrac;
-                       double          matchprodfreq,
-                                               matchfreq1,
-                                               matchfreq2,
-                                               unmatchfreq1,
-                                               unmatchfreq2,
-                                               otherfreq1,
-                                               otherfreq2,
-                                               totalsel1,
-                                               totalsel2;
-                       int                     i,
-                                               nmatches;
-
-                       fmgr_info(get_opcode(operator), &eqproc);
-                       hasmatch1 = (bool *) palloc0(nvalues1 * sizeof(bool));
-                       hasmatch2 = (bool *) palloc0(nvalues2 * sizeof(bool));
-
-                       /*
-                        * If we are doing any variant of JOIN_IN, pretend all the
-                        * values of the righthand relation are unique (ie, act as if
-                        * it's been DISTINCT'd).
-                        *
-                        * NOTE: it might seem that we should unique-ify the lefthand
-                        * input when considering JOIN_REVERSE_IN.      But this is not
-                        * so, because the join clause we've been handed has not been
-                        * commuted from the way the parser originally wrote it.  We
-                        * know that the unique side of the IN clause is *always* on
-                        * the right.
-                        *
-                        * NOTE: it would be dangerous to try to be smart about JOIN_LEFT
-                        * or JOIN_RIGHT here, because we do not have enough
-                        * information to determine which var is really on which side
-                        * of the join. Perhaps someday we should pass in more
-                        * information.
-                        */
-                       if (jointype == JOIN_IN ||
-                               jointype == JOIN_REVERSE_IN ||
-                               jointype == JOIN_UNIQUE_INNER ||
-                               jointype == JOIN_UNIQUE_OUTER)
-                       {
-                               float4          oneovern = 1.0 / nd2;
-
-                               for (i = 0; i < nvalues2; i++)
-                                       numbers2[i] = oneovern;
-                               nullfrac2 = oneovern;
-                       }
-
-                       /*
-                        * Note we assume that each MCV will match at most one member
-                        * of the other MCV list.  If the operator isn't really
-                        * equality, there could be multiple matches --- but we don't
-                        * look for them, both for speed and because the math wouldn't
-                        * add up...
-                        */
-                       matchprodfreq = 0.0;
-                       nmatches = 0;
-                       for (i = 0; i < nvalues1; i++)
-                       {
-                               int                     j;
+                       if (hasmatch1[i])
+                               matchfreq1 += numbers1[i];
+                       else
+                               unmatchfreq1 += numbers1[i];
+               }
+               CLAMP_PROBABILITY(matchfreq1);
+               CLAMP_PROBABILITY(unmatchfreq1);
+               matchfreq2 = unmatchfreq2 = 0.0;
+               for (i = 0; i < nvalues2; i++)
+               {
+                       if (hasmatch2[i])
+                               matchfreq2 += numbers2[i];
+                       else
+                               unmatchfreq2 += numbers2[i];
+               }
+               CLAMP_PROBABILITY(matchfreq2);
+               CLAMP_PROBABILITY(unmatchfreq2);
+               pfree(hasmatch1);
+               pfree(hasmatch2);
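
The two code paths here (new on the '+' lines, old on the '-' lines) implement the same MCV cross-matching idea: pair each outer MCV with at most one inner MCV via the join operator, then split each list's frequency mass into matched and unmatched parts. A self-contained sketch of that bookkeeping, with made-up value/frequency lists and a plain == comparison standing in for the FunctionCall2 invocation of the operator:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
    /* hypothetical MCV lists: values and their frequencies */
    int     values1[] = {1, 2, 3};
    double  numbers1[] = {0.40, 0.20, 0.10};
    int     nvalues1 = 3;
    int     values2[] = {2, 3, 5};
    double  numbers2[] = {0.50, 0.25, 0.05};
    int     nvalues2 = 3;

    bool   *hasmatch1 = (bool *) calloc(nvalues1, sizeof(bool));
    bool   *hasmatch2 = (bool *) calloc(nvalues2, sizeof(bool));
    double  matchprodfreq = 0.0;
    double  matchfreq1 = 0.0, unmatchfreq1 = 0.0;
    double  matchfreq2 = 0.0, unmatchfreq2 = 0.0;
    int     nmatches = 0;
    int     i, j;

    /* greedy pairing: each MCV is allowed at most one partner */
    for (i = 0; i < nvalues1; i++)
    {
        for (j = 0; j < nvalues2; j++)
        {
            if (hasmatch2[j])
                continue;
            if (values1[i] == values2[j])   /* stand-in for the join operator */
            {
                hasmatch1[i] = hasmatch2[j] = true;
                matchprodfreq += numbers1[i] * numbers2[j];
                nmatches++;
                break;
            }
        }
    }

    /*
     * Split each list's frequency into matched and unmatched portions;
     * the real code additionally clamps every sum into [0,1].
     */
    for (i = 0; i < nvalues1; i++)
    {
        if (hasmatch1[i])
            matchfreq1 += numbers1[i];
        else
            unmatchfreq1 += numbers1[i];
    }
    for (j = 0; j < nvalues2; j++)
    {
        if (hasmatch2[j])
            matchfreq2 += numbers2[j];
        else
            unmatchfreq2 += numbers2[j];
    }

    printf("nmatches=%d matchprodfreq=%.4f\n", nmatches, matchprodfreq);
    printf("matchfreq1=%.2f unmatchfreq1=%.2f\n", matchfreq1, unmatchfreq1);
    printf("matchfreq2=%.2f unmatchfreq2=%.2f\n", matchfreq2, unmatchfreq2);
    free(hasmatch1);
    free(hasmatch2);
    return 0;
}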
 
-                               for (j = 0; j < nvalues2; j++)
-                               {
-                                       if (hasmatch2[j])
-                                               continue;
-                                       if (DatumGetBool(FunctionCall2(&eqproc,
-                                                                                                  values1[i],
-                                                                                                  values2[j])))
-                                       {
-                                               hasmatch1[i] = hasmatch2[j] = true;
-                                               matchprodfreq += numbers1[i] * numbers2[j];
-                                               nmatches++;
-                                               break;
-                                       }
-                               }
-                       }
-                       CLAMP_PROBABILITY(matchprodfreq);
-                       /* Sum up frequencies of matched and unmatched MCVs */
-                       matchfreq1 = unmatchfreq1 = 0.0;
-                       for (i = 0; i < nvalues1; i++)
-                       {
-                               if (hasmatch1[i])
-                                       matchfreq1 += numbers1[i];
-                               else
-                                       unmatchfreq1 += numbers1[i];
-                       }
-                       CLAMP_PROBABILITY(matchfreq1);
-                       CLAMP_PROBABILITY(unmatchfreq1);
-                       matchfreq2 = unmatchfreq2 = 0.0;
-                       for (i = 0; i < nvalues2; i++)
-                       {
-                               if (hasmatch2[i])
-                                       matchfreq2 += numbers2[i];
-                               else
-                                       unmatchfreq2 += numbers2[i];
-                       }
-                       CLAMP_PROBABILITY(matchfreq2);
-                       CLAMP_PROBABILITY(unmatchfreq2);
-                       pfree(hasmatch1);
-                       pfree(hasmatch2);
+               /*
+                * Compute total frequency of non-null values that are not in
+                * the MCV lists.
+                */
+               otherfreq1 = 1.0 - nullfrac1 - matchfreq1 - unmatchfreq1;
+               otherfreq2 = 1.0 - nullfrac2 - matchfreq2 - unmatchfreq2;
+               CLAMP_PROBABILITY(otherfreq1);
+               CLAMP_PROBABILITY(otherfreq2);
 
-                       /*
-                        * Compute total frequency of non-null values that are not in
-                        * the MCV lists.
-                        */
-                       otherfreq1 = 1.0 - nullfrac1 - matchfreq1 - unmatchfreq1;
-                       otherfreq2 = 1.0 - nullfrac2 - matchfreq2 - unmatchfreq2;
-                       CLAMP_PROBABILITY(otherfreq1);
-                       CLAMP_PROBABILITY(otherfreq2);
+               /*
+                * We can estimate the total selectivity from the point of
+                * view of relation 1 as: the known selectivity for matched
+                * MCVs, plus unmatched MCVs that are assumed to match against
+                * random members of relation 2's non-MCV population, plus
+                * non-MCV values that are assumed to match against random
+                * members of relation 2's unmatched MCVs plus non-MCV values.
+                */
+               totalsel1 = matchprodfreq;
+               if (nd2 > nvalues2)
+                       totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - nvalues2);
+               if (nd2 > nmatches)
+                       totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) /
+                               (nd2 - nmatches);
+               /* Same estimate from the point of view of relation 2. */
+               totalsel2 = matchprodfreq;
+               if (nd1 > nvalues1)
+                       totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - nvalues1);
+               if (nd1 > nmatches)
+                       totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) /
+                               (nd1 - nmatches);
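
Continuing the made-up numbers from the sketch above, the totalsel formulas just added combine the matched/unmatched frequencies with the distinct-value counts nd1 and nd2. The same arithmetic as a standalone program (all inputs hypothetical):

#include <stdio.h>

int
main(void)
{
    /* hypothetical results of the MCV pass, plus distinct-value counts */
    double  matchprodfreq = 0.125;
    double  matchfreq1 = 0.30, unmatchfreq1 = 0.40;
    double  matchfreq2 = 0.75, unmatchfreq2 = 0.05;
    double  nullfrac1 = 0.0, nullfrac2 = 0.0;
    double  nd1 = 10.0, nd2 = 4.0;
    int     nvalues1 = 3, nvalues2 = 3, nmatches = 2;
    double  otherfreq1, otherfreq2, totalsel1, totalsel2, selec;

    /* frequency of non-null values outside the MCV lists */
    otherfreq1 = 1.0 - nullfrac1 - matchfreq1 - unmatchfreq1;
    otherfreq2 = 1.0 - nullfrac2 - matchfreq2 - unmatchfreq2;

    /* selectivity seen from relation 1's point of view */
    totalsel1 = matchprodfreq;
    if (nd2 > nvalues2)
        totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - nvalues2);
    if (nd2 > nmatches)
        totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) / (nd2 - nmatches);

    /* and from relation 2's point of view */
    totalsel2 = matchprodfreq;
    if (nd1 > nvalues1)
        totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - nvalues1);
    if (nd1 > nmatches)
        totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) / (nd1 - nmatches);

    /* the patch uses the smaller of the two estimates */
    selec = (totalsel1 < totalsel2) ? totalsel1 : totalsel2;
    printf("totalsel1=%.4f totalsel2=%.4f selec=%.4f\n",
           totalsel1, totalsel2, selec);
    return 0;
}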
 
-                       /*
-                        * We can estimate the total selectivity from the point of
-                        * view of relation 1 as: the known selectivity for matched
-                        * MCVs, plus unmatched MCVs that are assumed to match against
-                        * random members of relation 2's non-MCV population, plus
-                        * non-MCV values that are assumed to match against random
-                        * members of relation 2's unmatched MCVs plus non-MCV values.
-                        */
-                       totalsel1 = matchprodfreq;
-                       if (nd2 > nvalues2)
-                               totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - nvalues2);
-                       if (nd2 > nmatches)
-                               totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) /
-                                       (nd2 - nmatches);
-                       /* Same estimate from the point of view of relation 2. */
-                       totalsel2 = matchprodfreq;
-                       if (nd1 > nvalues1)
-                               totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - nvalues1);
-                       if (nd1 > nmatches)
-                               totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) /
-                                       (nd1 - nmatches);
+               /*
+                * Use the smaller of the two estimates.  This can be
+                * justified in essentially the same terms as given below for
+                * the no-stats case: to a first approximation, we are
+                * estimating from the point of view of the relation with
+                * smaller nd.
+                */
+               selec = (totalsel1 < totalsel2) ? totalsel1 : totalsel2;
+       }
+       else
+       {
+               /*
+                * We do not have MCV lists for both sides.  Estimate the join
+                * selectivity as
+                * MIN(1/nd1,1/nd2)*(1-nullfrac1)*(1-nullfrac2). This is
+                * plausible if we assume that the join operator is strict and
+                * the non-null values are about equally distributed: a given
+                * non-null tuple of rel1 will join to either zero or
+                * N2*(1-nullfrac2)/nd2 rows of rel2, so total join rows are
+                * at most N1*(1-nullfrac1)*N2*(1-nullfrac2)/nd2 giving a join
+                * selectivity of not more than
+                * (1-nullfrac1)*(1-nullfrac2)/nd2. By the same logic it is
+                * not more than (1-nullfrac1)*(1-nullfrac2)/nd1, so the
+                * expression with MIN() is an upper bound.  Using the MIN()
+                * means we estimate from the point of view of the relation
+                * with smaller nd (since the larger nd is determining the
+                * MIN).  It is reasonable to assume that most tuples in this
+                * rel will have join partners, so the bound is probably
+                * reasonably tight and should be taken as-is.
+                *
+                * XXX Can we be smarter if we have an MCV list for just one
+                * side? It seems that if we assume equal distribution for the
+                * other side, we end up with the same answer anyway.
+                */
+               double          nullfrac1 = stats1 ? stats1->stanullfrac : 0.0;
+               double          nullfrac2 = stats2 ? stats2->stanullfrac : 0.0;
 
-                       /*
-                        * Use the smaller of the two estimates.  This can be
-                        * justified in essentially the same terms as given below for
-                        * the no-stats case: to a first approximation, we are
-                        * estimating from the point of view of the relation with
-                        * smaller nd.
-                        */
-                       selec = (totalsel1 < totalsel2) ? totalsel1 : totalsel2;
-               }
+               selec = (1.0 - nullfrac1) * (1.0 - nullfrac2);
+               if (nd1 > nd2)
+                       selec /= nd1;
                else
-               {
-                       /*
-                        * We do not have MCV lists for both sides.  Estimate the join
-                        * selectivity as
-                        * MIN(1/nd1,1/nd2)*(1-nullfrac1)*(1-nullfrac2). This is
-                        * plausible if we assume that the join operator is strict and
-                        * the non-null values are about equally distributed: a given
-                        * non-null tuple of rel1 will join to either zero or
-                        * N2*(1-nullfrac2)/nd2 rows of rel2, so total join rows are
-                        * at most N1*(1-nullfrac1)*N2*(1-nullfrac2)/nd2 giving a join
-                        * selectivity of not more than
-                        * (1-nullfrac1)*(1-nullfrac2)/nd2. By the same logic it is
-                        * not more than (1-nullfrac1)*(1-nullfrac2)/nd1, so the
-                        * expression with MIN() is an upper bound.  Using the MIN()
-                        * means we estimate from the point of view of the relation
-                        * with smaller nd (since the larger nd is determining the
-                        * MIN).  It is reasonable to assume that most tuples in this
-                        * rel will have join partners, so the bound is probably
-                        * reasonably tight and should be taken as-is.
-                        *
-                        * XXX Can we be smarter if we have an MCV list for just one
-                        * side? It seems that if we assume equal distribution for the
-                        * other side, we end up with the same answer anyway.
-                        */
-                       double          nullfrac1 = stats1 ? stats1->stanullfrac : 0.0;
-                       double          nullfrac2 = stats2 ? stats2->stanullfrac : 0.0;
+                       selec /= nd2;
+       }
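
The no-MCV fallback added above is a one-line formula once the comment's derivation is accepted: selec = MIN(1/nd1, 1/nd2) * (1-nullfrac1) * (1-nullfrac2). A minimal standalone check with hypothetical stats:

#include <stdio.h>

int
main(void)
{
    /* hypothetical stats: null fractions and distinct-value counts */
    double  nullfrac1 = 0.10, nullfrac2 = 0.0;
    double  nd1 = 1000.0, nd2 = 50.0;
    double  selec;

    /* selec = MIN(1/nd1, 1/nd2) * (1 - nullfrac1) * (1 - nullfrac2) */
    selec = (1.0 - nullfrac1) * (1.0 - nullfrac2);
    if (nd1 > nd2)
        selec /= nd1;       /* dividing by the larger nd yields the MIN */
    else
        selec /= nd2;

    printf("selec = %g\n", selec);  /* 0.9 / 1000 = 0.0009 here */
    return 0;
}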
 
-                       selec = (1.0 - nullfrac1) * (1.0 - nullfrac2);
-                       if (nd1 > nd2)
-                               selec /= nd1;
-                       else
-                               selec /= nd2;
-               }
+       if (have_mcvs1)
+               free_attstatsslot(vardata1.atttype, values1, nvalues1,
+                                                 numbers1, nnumbers1);
+       if (have_mcvs2)
+               free_attstatsslot(vardata2.atttype, values2, nvalues2,
+                                                 numbers2, nnumbers2);
 
-               if (have_mcvs1)
-                       free_attstatsslot(var1->vartype, values1, nvalues1,
-                                                         numbers1, nnumbers1);
-               if (have_mcvs2)
-                       free_attstatsslot(var2->vartype, values2, nvalues2,
-                                                         numbers2, nnumbers2);
-               if (HeapTupleIsValid(statsTuple1))
-                       ReleaseSysCache(statsTuple1);
-               if (HeapTupleIsValid(statsTuple2))
-                       ReleaseSysCache(statsTuple2);
-       }
+       ReleaseVariableStats(vardata1);
+       ReleaseVariableStats(vardata2);
 
        CLAMP_PROBABILITY(selec);
 
@@ -1860,8 +1773,10 @@ mergejoinscansel(Query *root, Node *clause,
                                 Selectivity *leftscan,
                                 Selectivity *rightscan)
 {
-       Var                *left,
+       Node       *left,
                           *right;
+       VariableStatData leftvar,
+                               rightvar;
        Oid                     lefttype,
                                righttype;
        Oid                     opno,
@@ -1883,42 +1798,31 @@ mergejoinscansel(Query *root, Node *clause,
        if (!is_opclause(clause))
                return;                                 /* shouldn't happen */
        opno = ((OpExpr *) clause)->opno;
-       left = (Var *) get_leftop((Expr *) clause);
-       right = (Var *) get_rightop((Expr *) clause);
+       left = get_leftop((Expr *) clause);
+       right = get_rightop((Expr *) clause);
        if (!right)
                return;                                 /* shouldn't happen */
 
-       /* Save the direct input types of the operator */
-       lefttype = exprType((Node *) left);
-       righttype = exprType((Node *) right);
+       /* Look for stats for the inputs */
+       examine_variable(root, left, 0, &leftvar);
+       examine_variable(root, right, 0, &rightvar);
 
-       /*
-        * Now skip any binary-compatible relabeling; there can only be one
-        * level since constant-expression folder eliminates adjacent
-        * RelabelTypes.
-        */
-       if (IsA(left, RelabelType))
-               left = (Var *) ((RelabelType *) left)->arg;
-       if (IsA(right, RelabelType))
-               right = (Var *) ((RelabelType *) right)->arg;
-
-       /* Can't do anything if inputs are not Vars */
-       if (!IsA(left, Var) ||
-               !IsA(right, Var))
-               return;
+       /* Get the direct input types of the operator */
+       lefttype = exprType(left);
+       righttype = exprType(right);
 
        /* Verify mergejoinability and get left and right "<" operators */
        if (!op_mergejoinable(opno,
                                                  &lsortop,
                                                  &rsortop))
-               return;                                 /* shouldn't happen */
+               goto fail;                              /* shouldn't happen */
 
-       /* Try to get maximum values of both vars */
-       if (!get_var_maximum(root, left, lsortop, &leftmax))
-               return;                                 /* no max available from stats */
+       /* Try to get maximum values of both inputs */
+       if (!get_variable_maximum(root, &leftvar, lsortop, &leftmax))
+               goto fail;                              /* no max available from stats */
 
-       if (!get_var_maximum(root, right, rsortop, &rightmax))
-               return;                                 /* no max available from stats */
+       if (!get_variable_maximum(root, &rightvar, rsortop, &rightmax))
+               goto fail;                              /* no max available from stats */
 
        /* Look up the "left < right" and "left > right" operators */
        op_mergejoin_crossops(opno, &ltop, &gtop, NULL, NULL);
@@ -1926,30 +1830,30 @@ mergejoinscansel(Query *root, Node *clause,
        /* Look up the "left <= right" operator */
        leop = get_negator(gtop);
        if (!OidIsValid(leop))
-               return;                                 /* insufficient info in catalogs */
+               goto fail;                              /* insufficient info in catalogs */
 
        /* Look up the "right > left" operator */
        revgtop = get_commutator(ltop);
        if (!OidIsValid(revgtop))
-               return;                                 /* insufficient info in catalogs */
+               goto fail;                              /* insufficient info in catalogs */
 
        /* Look up the "right <= left" operator */
        revleop = get_negator(revgtop);
        if (!OidIsValid(revleop))
-               return;                                 /* insufficient info in catalogs */
+               goto fail;                              /* insufficient info in catalogs */
 
        /*
         * Now, the fraction of the left variable that will be scanned is the
         * fraction that's <= the right-side maximum value.  But only believe
         * non-default estimates, else stick with our 1.0.
         */
-       selec = scalarineqsel(root, leop, false, left,
+       selec = scalarineqsel(root, leop, false, &leftvar,
                                                  rightmax, righttype);
        if (selec != DEFAULT_INEQ_SEL)
                *leftscan = selec;
 
        /* And similarly for the right variable. */
-       selec = scalarineqsel(root, revleop, false, right,
+       selec = scalarineqsel(root, revleop, false, &rightvar,
                                                  leftmax, lefttype);
        if (selec != DEFAULT_INEQ_SEL)
                *rightscan = selec;
@@ -1966,6 +1870,10 @@ mergejoinscansel(Query *root, Node *clause,
                *rightscan = 1.0;
        else
                *leftscan = *rightscan = 1.0;
+
+fail:
+       ReleaseVariableStats(leftvar);
+       ReleaseVariableStats(rightvar);
 }
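
The reworked mergejoinscansel keeps the same core idea as before: each input only has to be scanned up to the other input's maximum join-key value, so its scan fraction is the fraction of its values that are <= that maximum (estimated via scalarineqsel). A rough standalone stand-in, using sorted arrays of made-up histogram bounds instead of pg_statistic data and a simple count instead of the real interpolation:

#include <stdio.h>

/*
 * Crude stand-in for scalarineqsel(..., "<=", ...): fraction of histogram
 * entries that are <= the given bound.  The real code interpolates within
 * histogram bins; this sketch just counts entries.
 */
static double
frac_le(const double *hist, int nhist, double bound)
{
    int     i, n = 0;

    for (i = 0; i < nhist; i++)
        if (hist[i] <= bound)
            n++;
    return (double) n / nhist;
}

int
main(void)
{
    /* hypothetical histogram bounds for the two join inputs */
    double  lefthist[] = {1, 5, 10, 20, 40, 80, 100};
    double  righthist[] = {2, 4, 6, 8, 10, 12, 15};
    int     nleft = 7, nright = 7;
    double  leftmax = lefthist[nleft - 1];      /* max of left input */
    double  rightmax = righthist[nright - 1];   /* max of right input */

    /* left input is scanned only up to the right input's maximum, and v.v. */
    printf("leftscan  = %.2f\n", frac_le(lefthist, nleft, rightmax));
    printf("rightscan = %.2f\n", frac_le(righthist, nright, leftmax));
    return 0;
}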
 
 /*
@@ -2076,25 +1984,14 @@ estimate_num_groups(Query *root, List *groupExprs, double input_rows)
        foreach(l, allvars)
        {
                Var                *var = (Var *) lfirst(l);
-               Oid                     relid = getrelid(var->varno, root->rtable);
-               HeapTuple       statsTuple = NULL;
-               Form_pg_statistic stats = NULL;
+               VariableStatData vardata;
                double          ndistinct;
                bool            keep = true;
                List       *l2;
 
-               if (OidIsValid(relid))
-               {
-                       statsTuple = SearchSysCache(STATRELATT,
-                                                                               ObjectIdGetDatum(relid),
-                                                                               Int16GetDatum(var->varattno),
-                                                                               0, 0);
-                       if (HeapTupleIsValid(statsTuple))
-                               stats = (Form_pg_statistic) GETSTRUCT(statsTuple);
-               }
-               ndistinct = get_att_numdistinct(root, var, stats);
-               if (HeapTupleIsValid(statsTuple))
-                       ReleaseSysCache(statsTuple);
+               examine_variable(root, (Node *) var, 0, &vardata);
+               ndistinct = get_variable_numdistinct(&vardata);
+               ReleaseVariableStats(vardata);
 
                /* cannot use foreach here because of possible lremove */
                l2 = varinfos;
@@ -2201,143 +2098,152 @@ estimate_num_groups(Query *root, List *groupExprs, double input_rows)
        return numdistinct;
 }
 
-
-/*-------------------------------------------------------------------------
+/*
+ * Estimate hash bucketsize fraction (ie, number of entries in a bucket
+ * divided by total tuples in relation) if the specified expression is used
+ * as a hash key.
  *
- * Support routines
+ * XXX This is really pretty bogus since we're effectively assuming that the
+ * distribution of hash keys will be the same after applying restriction
+ * clauses as it was in the underlying relation.  However, we are not nearly
+ * smart enough to figure out how the restrict clauses might change the
+ * distribution, so this will have to do for now.
  *
- *-------------------------------------------------------------------------
- */
-
-/*
- * get_var_maximum
- *             Estimate the maximum value of the specified variable.
- *             If successful, store value in *max and return TRUE.
- *             If no data available, return FALSE.
+ * We are passed the number of buckets the executor will use for the given
+ * input relation.     If the data were perfectly distributed, with the same
+ * number of tuples going into each available bucket, then the bucketsize
+ * fraction would be 1/nbuckets.  But this happy state of affairs will occur
+ * only if (a) there are at least nbuckets distinct data values, and (b)
+ * we have a not-too-skewed data distribution. Otherwise the buckets will
+ * be nonuniformly occupied.  If the other relation in the join has a key
+ * distribution similar to this one's, then the most-loaded buckets are
+ * exactly those that will be probed most often.  Therefore, the "average"
+ * bucket size for costing purposes should really be taken as something close
+ * to the "worst case" bucket size.  We try to estimate this by adjusting the
+ * fraction if there are too few distinct data values, and then scaling up
+ * by the ratio of the most common value's frequency to the average frequency.
  *
- * sortop is the "<" comparison operator to use.  (To extract the
- * minimum instead of the maximum, just pass the ">" operator instead.)
+ * If no statistics are available, use a default estimate of 0.1.  This will
+ * discourage use of a hash rather strongly if the inner relation is large,
+ * which is what we want.  We do not want to hash unless we know that the
+ * inner rel is well-dispersed (or the alternatives seem much worse).
  */
-static bool
-get_var_maximum(Query *root, Var *var, Oid sortop, Datum *max)
+Selectivity
+estimate_hash_bucketsize(Query *root, Node *hashkey, int nbuckets)
 {
-       Datum           tmax = 0;
-       bool            have_max = false;
-       Oid                     relid;
-       HeapTuple       statsTuple;
-       Form_pg_statistic stats;
-       int16           typLen;
-       bool            typByVal;
-       Datum      *values;
-       int                     nvalues;
-       int                     i;
+       VariableStatData vardata;
+       double          estfract,
+                               ndistinct,
+                               stanullfrac,
+                               mcvfreq,
+                               avgfreq;
+       float4     *numbers;
+       int                     nnumbers;
 
-       relid = getrelid(var->varno, root->rtable);
-       if (relid == InvalidOid)
-               return false;
+       examine_variable(root, hashkey, 0, &vardata);
 
-       /* get stats for the attribute */
-       statsTuple = SearchSysCache(STATRELATT,
-                                                               ObjectIdGetDatum(relid),
-                                                               Int16GetDatum(var->varattno),
-                                                               0, 0);
-       if (!HeapTupleIsValid(statsTuple))
+       /* Get number of distinct values and fraction that are null */
+       ndistinct = get_variable_numdistinct(&vardata);
+
+       if (HeapTupleIsValid(vardata.statsTuple))
        {
-               /* no stats available, so default result */
-               return false;
+               Form_pg_statistic stats;
+
+               stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
+               stanullfrac = stats->stanullfrac;
+       }
+       else
+       {
+               /*
+                * Believe a default ndistinct only if it came from stats.
+                * Otherwise punt and return 0.1, per comments above.
+                */
+               if (ndistinct == DEFAULT_NUM_DISTINCT)
+               {
+                       ReleaseVariableStats(vardata);
+                       return (Selectivity) 0.1;
+               }
+
+               stanullfrac = 0.0;
        }
-       stats = (Form_pg_statistic) GETSTRUCT(statsTuple);
 
-       get_typlenbyval(var->vartype, &typLen, &typByVal);
+       /* Compute avg freq of all distinct data values in raw relation */
+       avgfreq = (1.0 - stanullfrac) / ndistinct;
 
        /*
-        * If there is a histogram, grab the last or first value as
-        * appropriate.
+        * Adjust ndistinct to account for restriction clauses.  Observe we
+        * are assuming that the data distribution is affected uniformly by
+        * the restriction clauses!
         *
-        * If there is a histogram that is sorted with some other operator than
-        * the one we want, fail --- this suggests that there is data we can't
-        * use.
+        * XXX Possibly better way, but much more expensive: multiply by
+        * selectivity of rel's restriction clauses that mention the target
+        * Var.
         */
-       if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
-                                                STATISTIC_KIND_HISTOGRAM, sortop,
-                                                &values, &nvalues,
-                                                NULL, NULL))
-       {
-               if (nvalues > 0)
-               {
-                       tmax = datumCopy(values[nvalues - 1], typByVal, typLen);
-                       have_max = true;
-               }
-               free_attstatsslot(var->vartype, values, nvalues, NULL, 0);
-       }
+       if (vardata.rel)
+               ndistinct *= vardata.rel->rows / vardata.rel->tuples;
+
+       /*
+        * Initial estimate of bucketsize fraction is 1/nbuckets as long as
+        * the number of buckets is less than the expected number of distinct
+        * values; otherwise it is 1/ndistinct.
+        */
+       if (ndistinct > (double) nbuckets)
+               estfract = 1.0 / (double) nbuckets;
        else
-       {
-               Oid                     rsortop = get_commutator(sortop);
+               estfract = 1.0 / ndistinct;
 
-               if (OidIsValid(rsortop) &&
-                       get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
-                                                        STATISTIC_KIND_HISTOGRAM, rsortop,
-                                                        &values, &nvalues,
-                                                        NULL, NULL))
-               {
-                       if (nvalues > 0)
-                       {
-                               tmax = datumCopy(values[0], typByVal, typLen);
-                               have_max = true;
-                       }
-                       free_attstatsslot(var->vartype, values, nvalues, NULL, 0);
-               }
-               else if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
-                                                                 STATISTIC_KIND_HISTOGRAM, InvalidOid,
-                                                                 &values, &nvalues,
-                                                                 NULL, NULL))
+       /*
+        * Look up the frequency of the most common value, if available.
+        */
+       mcvfreq = 0.0;
+
+       if (HeapTupleIsValid(vardata.statsTuple))
+       {
+               if (get_attstatsslot(vardata.statsTuple,
+                                                        vardata.atttype, vardata.atttypmod,
+                                                        STATISTIC_KIND_MCV, InvalidOid,
+                                                        NULL, NULL, &numbers, &nnumbers))
                {
-                       free_attstatsslot(var->vartype, values, nvalues, NULL, 0);
-                       ReleaseSysCache(statsTuple);
-                       return false;
+                       /*
+                        * The first MCV stat is for the most common value.
+                        */
+                       if (nnumbers > 0)
+                               mcvfreq = numbers[0];
+                       free_attstatsslot(vardata.atttype, NULL, 0,
+                                                         numbers, nnumbers);
                }
        }
 
        /*
-        * If we have most-common-values info, look for a large MCV.  This is
-        * needed even if we also have a histogram, since the histogram
-        * excludes the MCVs.  However, usually the MCVs will not be the
-        * extreme values, so avoid unnecessary data copying.
+        * Adjust estimated bucketsize upward to account for skewed
+        * distribution.
         */
-       if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
-                                                STATISTIC_KIND_MCV, InvalidOid,
-                                                &values, &nvalues,
-                                                NULL, NULL))
-       {
-               bool            large_mcv = false;
-               FmgrInfo        opproc;
-
-               fmgr_info(get_opcode(sortop), &opproc);
+       if (avgfreq > 0.0 && mcvfreq > avgfreq)
+               estfract *= mcvfreq / avgfreq;
 
-               for (i = 0; i < nvalues; i++)
-               {
-                       if (!have_max)
-                       {
-                               tmax = values[i];
-                               large_mcv = have_max = true;
-                       }
-                       else if (DatumGetBool(FunctionCall2(&opproc, tmax, values[i])))
-                       {
-                               tmax = values[i];
-                               large_mcv = true;
-                       }
-               }
-               if (large_mcv)
-                       tmax = datumCopy(tmax, typByVal, typLen);
-               free_attstatsslot(var->vartype, values, nvalues, NULL, 0);
-       }
+       /*
+        * Clamp bucketsize to sane range (the above adjustment could easily
+        * produce an out-of-range result).  We set the lower bound a little
+        * above zero, since zero isn't a very sane result.
+        */
+       if (estfract < 1.0e-6)
+               estfract = 1.0e-6;
+       else if (estfract > 1.0)
+               estfract = 1.0;
 
-       ReleaseSysCache(statsTuple);
+       ReleaseVariableStats(vardata);
 
-       *max = tmax;
-       return have_max;
+       return (Selectivity) estfract;
 }
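
The numeric core of estimate_hash_bucketsize is small enough to trace by hand. A self-contained sketch of that arithmetic, minus all of the catalog and statistics lookups (every input below is a hypothetical value that examine_variable and friends would normally supply):

#include <stdio.h>

/*
 * Sketch of the bucketsize arithmetic only; the real function pulls
 * ndistinct, stanullfrac, mcvfreq and the row counts from the planner's
 * statistics machinery.
 */
static double
sketch_bucketsize(double ndistinct, double stanullfrac, double mcvfreq,
                  double rel_rows, double rel_tuples, int nbuckets)
{
    double  avgfreq, estfract;

    /* average frequency of a distinct non-null value in the raw relation */
    avgfreq = (1.0 - stanullfrac) / ndistinct;

    /* assume restriction clauses thin out the distinct values uniformly */
    ndistinct *= rel_rows / rel_tuples;

    /* 1/nbuckets if there are enough distinct values, else 1/ndistinct */
    if (ndistinct > (double) nbuckets)
        estfract = 1.0 / (double) nbuckets;
    else
        estfract = 1.0 / ndistinct;

    /* scale up for skew: most-common-value frequency vs. average frequency */
    if (avgfreq > 0.0 && mcvfreq > avgfreq)
        estfract *= mcvfreq / avgfreq;

    /* clamp to a sane range, as the patch does */
    if (estfract < 1.0e-6)
        estfract = 1.0e-6;
    else if (estfract > 1.0)
        estfract = 1.0;
    return estfract;
}

int
main(void)
{
    /*
     * Hypothetical: 500 distinct values, 2% nulls, a 10% most common value,
     * half the rows surviving restrictions, 1024 hash buckets.
     */
    printf("bucketsize fraction = %g\n",
           sketch_bucketsize(500.0, 0.02, 0.10, 5000.0, 10000.0, 1024));
    return 0;
}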
 
+
+/*-------------------------------------------------------------------------
+ *
+ * Support routines
+ *
+ *-------------------------------------------------------------------------
+ */
+
 /*
  * convert_to_scalar
  *       Convert non-NULL values of the indicated types to the comparison
@@ -2903,185 +2809,522 @@ convert_timevalue_to_scalar(Datum value, Oid typid)
 
 
 /*
- * get_att_numdistinct
- *       Estimate the number of distinct values of an attribute.
+ * get_restriction_variable
+ *             Examine the args of a restriction clause to see if it's of the
+ *             form (variable op pseudoconstant) or (pseudoconstant op variable),
+ *             where "variable" could be either a Var or an expression in vars of a
+ *             single relation.  If so, extract information about the variable,
+ *             and also indicate which side it was on and the other argument.
  *
- * var: identifies the attribute to examine.
- * stats: pg_statistic tuple for attribute, or NULL if not available.
+ * Inputs:
+ *     root: the Query
+ *     args: clause argument list
+ *     varRelid: see specs for restriction selectivity functions
  *
- * NB: be careful to produce an integral result, since callers may compare
- * the result to exact integer counts.
+ * Outputs: (these are valid only if TRUE is returned)
+ *     *vardata: gets information about variable (see examine_variable)
+ *     *other: gets other clause argument, stripped of binary relabeling
+ *     *varonleft: set TRUE if variable is on the left, FALSE if on the right
+ *
+ * Returns TRUE if a variable is identified, otherwise FALSE.
+ *
+ * Note: if there are Vars on both sides of the clause, we must fail, because
+ * callers are expecting that the other side will act like a pseudoconstant.
  */
-static double
-get_att_numdistinct(Query *root, Var *var, Form_pg_statistic stats)
+static bool
+get_restriction_variable(Query *root, List *args, int varRelid,
+                                                VariableStatData *vardata, Node **other,
+                                                bool *varonleft)
 {
-       RelOptInfo *rel;
-       double          ntuples;
-
-       /*
-        * Special-case boolean columns: presumably, two distinct values.
-        *
-        * Are there any other cases we should wire in special estimates for?
-        */
-       if (var->vartype == BOOLOID)
-               return 2.0;
+       Node       *left,
+                          *right;
+       VariableStatData rdata;
 
-       /*
-        * Otherwise we need to get the relation size.
-        */
-       rel = find_base_rel(root, var->varno);
-       ntuples = rel->tuples;
+       /* Fail if not a binary opclause (probably shouldn't happen) */
+       if (length(args) != 2)
+               return false;
 
-       if (ntuples <= 0.0)
-               return DEFAULT_NUM_DISTINCT;    /* no data available; return a
-                                                                                * default */
+       left = (Node *) lfirst(args);
+       right = (Node *) lsecond(args);
 
        /*
-        * Look to see if there is a unique index on the attribute. If so, we
-        * assume it's distinct, ignoring pg_statistic info which could be out
-        * of date.
+        * Examine both sides.  Note that when varRelid is nonzero, Vars of
+        * other relations will be treated as pseudoconstants.
         */
-       if (has_unique_index(rel, var->varattno))
-               return ntuples;
+       examine_variable(root, left, varRelid, vardata);
+       examine_variable(root, right, varRelid, &rdata);
 
        /*
-        * If ANALYZE determined a fixed or scaled estimate, use it.
+        * If one side is a variable and the other not, we win.
         */
-       if (stats)
+       if (vardata->rel && rdata.rel == NULL)
        {
-               if (stats->stadistinct > 0.0)
-                       return stats->stadistinct;
-               if (stats->stadistinct < 0.0)
-                       return floor((-stats->stadistinct * ntuples) + 0.5);
+               *varonleft = true;
+               *other = rdata.var;
+               /* Assume we need no ReleaseVariableStats(rdata) here */
+               return true;
        }
 
-       /*
-        * ANALYZE does not compute stats for system attributes, but some of
-        * them can reasonably be assumed unique anyway.
-        */
-       switch (var->varattno)
+       if (vardata->rel == NULL && rdata.rel)
        {
-               case ObjectIdAttributeNumber:
-               case SelfItemPointerAttributeNumber:
-                       return ntuples;
-               case TableOidAttributeNumber:
-                       return 1.0;
+               *varonleft = false;
+               *other = vardata->var;
+               /* Assume we need no ReleaseVariableStats(*vardata) here */
+               *vardata = rdata;
+               return true;
        }
 
-       /*
-        * Estimate ndistinct = ntuples if the table is small, else use
-        * default.
-        */
-       if (ntuples < DEFAULT_NUM_DISTINCT)
-               return ntuples;
+       /* Ooops, clause has wrong structure (probably var op var) */
+       ReleaseVariableStats(*vardata);
+       ReleaseVariableStats(rdata);
 
-       return DEFAULT_NUM_DISTINCT;
+       return false;
 }
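
The contract of get_restriction_variable is easiest to see with a toy model (the ToyArg type and the toy_get_restriction_variable function below are invented for illustration, not part of the patch): succeed only when exactly one side of the binary clause is a variable, hand back that side plus the pseudoconstant, and say which side the variable was on.

#include <stdbool.h>
#include <stdio.h>

/*
 * Toy model: a clause argument is either a "variable" (contains Vars of the
 * relation of interest) or a pseudoconstant.  The real code decides this by
 * running examine_variable on each side and checking whether a relation was
 * identified.
 */
typedef struct ToyArg
{
    bool        is_variable;
    const char *descr;
} ToyArg;

static bool
toy_get_restriction_variable(ToyArg left, ToyArg right,
                             ToyArg *var, ToyArg *other, bool *varonleft)
{
    if (left.is_variable && !right.is_variable)
    {
        *var = left;
        *other = right;
        *varonleft = true;
        return true;
    }
    if (!left.is_variable && right.is_variable)
    {
        *var = right;
        *other = left;
        *varonleft = false;
        return true;
    }
    /* var op var, or const op const: can't treat as variable-vs-pseudoconstant */
    return false;
}

int
main(void)
{
    ToyArg  c = {false, "42"};
    ToyArg  v = {true, "t.x"};
    ToyArg  var, other;
    bool    varonleft;

    if (toy_get_restriction_variable(c, v, &var, &other, &varonleft))
        printf("variable %s is on the %s, other arg is %s\n",
               var.descr, varonleft ? "left" : "right", other.descr);
    return 0;
}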
 
 /*
- * get_restriction_var
- *             Examine the args of a restriction clause to see if it's of the
- *             form (var op something) or (something op var).  If so, extract
- *             and return the var and the other argument.
- *
- * Inputs:
- *     args: clause argument list
- *     varRelid: see specs for restriction selectivity functions
- *
- * Outputs: (these are set only if TRUE is returned)
- *     *var: gets Var node
- *     *other: gets other clause argument
- *     *varonleft: set TRUE if var is on the left, FALSE if on the right
- *
- * Returns TRUE if a Var is identified, otherwise FALSE.
+ * get_join_variables
+ *             Apply examine_variable() to each side of a join clause.
  */
-static bool
-get_restriction_var(List *args,
-                                       int varRelid,
-                                       Var **var,
-                                       Node **other,
-                                       bool *varonleft)
+static void
+get_join_variables(Query *root, List *args,
+                                  VariableStatData *vardata1, VariableStatData *vardata2)
 {
        Node       *left,
                           *right;
 
        if (length(args) != 2)
-               return false;
+               elog(ERROR, "join operator should take two arguments");
 
        left = (Node *) lfirst(args);
        right = (Node *) lsecond(args);
 
+       examine_variable(root, left, 0, vardata1);
+       examine_variable(root, right, 0, vardata2);
+}
+
+/*
+ * examine_variable
+ *             Try to look up statistical data about an expression.
+ *             Fill in a VariableStatData struct to describe the expression.
+ *
+ * Inputs:
+ *     root: the Query
+ *     node: the expression tree to examine
+ *     varRelid: see specs for restriction selectivity functions
+ *
+ * Outputs: *vardata is filled as follows:
+ *     var: the input expression (with any binary relabeling stripped)
+ *     rel: RelOptInfo for relation containing variable; NULL if expression
+ *             contains no Vars (NOTE this could point to a RelOptInfo of a
+ *             subquery, not one in the current query).
+ *     statsTuple: the pg_statistic entry for the variable, if one exists;
+ *             otherwise NULL.
+ *     atttype, atttypmod: type data to pass to get_attstatsslot().  This is
+ *             commonly the same as the exposed type of the variable argument,
+ *             but can be different in binary-compatible-type cases.
+ *
+ * Caller is responsible for doing ReleaseVariableStats() before exiting.
+ */
+static void
+examine_variable(Query *root, Node *node, int varRelid,
+                                VariableStatData *vardata)
+{
+       Relids          varnos;
+       RelOptInfo *onerel;
+
+       /* Make sure we don't return dangling pointers in vardata */
+       MemSet(vardata, 0, sizeof(VariableStatData));
+
        /* Ignore any binary-compatible relabeling */
 
-       if (IsA(left, RelabelType))
-               left = (Node *) ((RelabelType *) left)->arg;
-       if (IsA(right, RelabelType))
-               right = (Node *) ((RelabelType *) right)->arg;
+       if (IsA(node, RelabelType))
+               node = (Node *) ((RelabelType *) node)->arg;
 
-       /* Look for the var */
+       vardata->var = node;
 
-       if (IsA(left, Var) &&
-               (varRelid == 0 || varRelid == ((Var *) left)->varno))
+       /* Fast path for a simple Var */
+
+       if (IsA(node, Var) &&
+               (varRelid == 0 || varRelid == ((Var *) node)->varno))
        {
-               *var = (Var *) left;
-               *other = right;
-               *varonleft = true;
+               Var                *var = (Var *) node;
+               Oid                     relid;
+
+               vardata->rel = find_base_rel(root, var->varno);
+               vardata->atttype = var->vartype;
+               vardata->atttypmod = var->vartypmod;
+
+               relid = getrelid(var->varno, root->rtable);
+
+               if (OidIsValid(relid))
+               {
+                       vardata->statsTuple = SearchSysCache(STATRELATT,
+                                                                                                ObjectIdGetDatum(relid),
+                                                                                                Int16GetDatum(var->varattno),
+                                                                                                0, 0);
+               }
+               else
+               {
+                       /*
+                        * XXX This means the Var comes from a JOIN or sub-SELECT.  Later
+                        * add code to dig down into the join etc and see if we can trace
+                        * the variable to something with stats.  (But beware of
+                        * sub-SELECTs with DISTINCT/GROUP BY/etc.  Perhaps there are
+                        * no cases where this would really be useful, because we'd have
+                        * flattened the subselect if it is??)
+                        */
+               }
+
+               return;
        }
-       else if (IsA(right, Var) &&
-                        (varRelid == 0 || varRelid == ((Var *) right)->varno))
+
+       /*
+        * Okay, it's a more complicated expression.  Determine variable
+        * membership.  Note that when varRelid isn't zero, only vars of
+        * that relation are considered "real" vars.
+        */
+       varnos = pull_varnos(node);
+
+       onerel = NULL;
+
+       switch (bms_membership(varnos))
        {
-               *var = (Var *) right;
-               *other = left;
-               *varonleft = false;
+               case BMS_EMPTY_SET:
+                       /* No Vars at all ... must be pseudo-constant clause */
+                       break;
+               case BMS_SINGLETON:
+                       if (varRelid == 0 || bms_is_member(varRelid, varnos))
+                       {
+                               onerel = find_base_rel(root,
+                                                (varRelid ? varRelid : bms_singleton_member(varnos)));
+                               vardata->rel = onerel;
+                       }
+                       /* else treat it as a constant */
+                       break;
+               case BMS_MULTIPLE:
+                       if (varRelid == 0)
+                       {
+                               /* treat it as a variable of a join relation */
+                               vardata->rel = find_join_rel(root, varnos);
+                       }
+                       else if (bms_is_member(varRelid, varnos))
+                       {
+                               /* ignore the vars belonging to other relations */
+                               vardata->rel = find_base_rel(root, varRelid);
+                               /* note: no point in expressional-index search here */
+                       }
+                       /* else treat it as a constant */
+                       break;
+       }
+
+       bms_free(varnos);
+
+       vardata->atttype = exprType(node);
+       vardata->atttypmod = exprTypmod(node);
+
+       if (onerel)
+       {
+               /*
+                * We have an expression in vars of a single relation.  Try to
+                * match it to expressional index columns, in hopes of finding
+                * some statistics.
+                *
+                * XXX it's conceivable that there are multiple matches with
+                * different index opclasses; if so, we need to pick one that
+                * matches the operator we are estimating for.  FIXME later.
+                */
+               List       *ilist;
+
+               foreach(ilist, onerel->indexlist)
+               {
+                       IndexOptInfo *index = (IndexOptInfo *) lfirst(ilist);
+                       List       *indexprs;
+                       int                     pos;
+
+                       indexprs = index->indexprs;
+                       if (indexprs == NIL)
+                               continue;               /* no expressions here... */
+
+                       /*
+                        * Ignore partial indexes since they probably don't reflect
+                        * whole-relation statistics.  Possibly reconsider this later.
+                        */
+                       if (index->indpred)
+                               continue;
+
+                       for (pos = 0; pos < index->ncolumns; pos++)
+                       {
+                               if (index->indexkeys[pos] == 0)
+                               {
+                                       Node       *indexkey;
+
+                                       if (indexprs == NIL)
+                                               elog(ERROR, "too few entries in indexprs list");
+                                       indexkey = (Node *) lfirst(indexprs);
+                                       if (indexkey && IsA(indexkey, RelabelType))
+                                               indexkey = (Node *) ((RelabelType *) indexkey)->arg;
+                                       if (equal(node, indexkey))
+                                       {
+                                               /*
+                                                * Found a match ... is it a unique index?
+                                                * Tests here should match has_unique_index().
+                                                */
+                                               if (index->unique &&
+                                                       index->ncolumns == 1 &&
+                                                       index->indpred == NIL)
+                                                       vardata->isunique = true;
+                                               /* Has it got stats? */
+                                               vardata->statsTuple = SearchSysCache(STATRELATT,
+                                                                                                                        ObjectIdGetDatum(index->indexoid),
+                                                                                                                        Int16GetDatum(pos + 1),
+                                                                                                                        0, 0);
+                                               if (vardata->statsTuple)
+                                                       break;
+                                       }
+                                       indexprs = lnext(indexprs);
+                               }
+                       }
+                       if (vardata->statsTuple)
+                               break;
+               }
+       }
+}
+
+/*
+ * get_variable_numdistinct
+ *       Estimate the number of distinct values of a variable.
+ *
+ * vardata: results of examine_variable
+ *
+ * NB: be careful to produce an integral result, since callers may compare
+ * the result to exact integer counts.
+ */
+static double
+get_variable_numdistinct(VariableStatData *vardata)
+{
+       double          stadistinct;
+       double          ntuples;
+
+       /*
+        * Determine the stadistinct value to use.  There are cases where
+        * we can get an estimate even without a pg_statistic entry, or
+        * can get a better value than is in pg_statistic.
+        */
+       if (HeapTupleIsValid(vardata->statsTuple))
+       {
+               /* Use the pg_statistic entry */
+               Form_pg_statistic stats;
+
+               stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
+               stadistinct = stats->stadistinct;
+       }
+       else if (vardata->atttype == BOOLOID)
+       {
+               /*
+                * Special-case boolean columns: presumably, two distinct values.
+                *
+                * Are there any other datatypes we should wire in special
+                * estimates for?
+                */
+               stadistinct = 2.0;
        }
        else
        {
-               /* Duh, it's too complicated for me... */
-               return false;
+               /*
+                * We don't keep statistics for system columns, but in some
+                * cases we can infer distinctness anyway.
+                */
+               if (vardata->var && IsA(vardata->var, Var))
+               {
+                       switch (((Var *) vardata->var)->varattno)
+                       {
+                               case ObjectIdAttributeNumber:
+                               case SelfItemPointerAttributeNumber:
+                                       stadistinct = -1.0;                     /* unique */
+                                       break;
+                               case TableOidAttributeNumber:
+                                       stadistinct = 1.0;                      /* only 1 value */
+                                       break;
+                               default:
+                                       stadistinct = 0.0;                      /* means "unknown" */
+                                       break;
+                       }
+               }
+               else
+                       stadistinct = 0.0;                                      /* means "unknown" */
+               /*
+                * XXX consider using estimate_num_groups on expressions?
+                */
+       }
+
+       /*
+        * If there is a unique index for the variable, assume it is unique
+        * no matter what pg_statistic says (the statistics could be out
+        * of date).  Can skip search if we already think it's unique.
+        */
+       if (stadistinct != -1.0)
+       {
+               if (vardata->isunique)
+                       stadistinct = -1.0;
+               else if (vardata->var && IsA(vardata->var, Var) &&
+                                vardata->rel &&
+                                has_unique_index(vardata->rel, 
+                                                                 ((Var *) vardata->var)->varattno))
+                       stadistinct = -1.0;
        }
 
-       return true;
+       /*
+        * If we had an absolute estimate, use that.
+        */
+       if (stadistinct > 0.0)
+               return stadistinct;
+
+       /*
+        * Otherwise we need to get the relation size; punt if not available.
+        */
+       if (vardata->rel == NULL)
+               return DEFAULT_NUM_DISTINCT;
+       ntuples = vardata->rel->tuples;
+       if (ntuples <= 0.0)
+               return DEFAULT_NUM_DISTINCT;
+
+       /*
+        * If we had a relative estimate, use that.
+        */
+       if (stadistinct < 0.0)
+               return floor((-stadistinct * ntuples) + 0.5);
+
+       /*
+        * With no data, estimate ndistinct = ntuples if the table is small,
+        * else use default.
+        */
+       if (ntuples < DEFAULT_NUM_DISTINCT)
+               return ntuples;
+
+       return DEFAULT_NUM_DISTINCT;
 }
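The conversion from stadistinct to a concrete estimate follows the pg_statistic convention: a positive value is an absolute count of distinct values, a negative value is minus a fraction of the table's rows, and zero means unknown. A minimal standalone restatement of that scaling rule, with a hypothetical helper name and DEFAULT_NUM_DISTINCT assumed to be the 200 defined in selfuncs.h:

	#include <math.h>

	#define DEFAULT_NUM_DISTINCT 200		/* value assumed from selfuncs.h */

	/* Sketch of the scaling rule applied by get_variable_numdistinct above. */
	static double
	scale_stadistinct(double stadistinct, double ntuples)
	{
		if (stadistinct > 0.0)
			return stadistinct;				/* absolute estimate */
		if (ntuples <= 0.0)
			return DEFAULT_NUM_DISTINCT;	/* relation size unknown: punt */
		if (stadistinct < 0.0)				/* minus a fraction of the rows */
			return floor((-stadistinct * ntuples) + 0.5);
		/* no information at all: small tables are assumed all-distinct */
		return (ntuples < DEFAULT_NUM_DISTINCT) ? ntuples : DEFAULT_NUM_DISTINCT;
	}
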
 
 /*
- * get_join_vars
+ * get_variable_maximum
+ *             Estimate the maximum value of the specified variable.
+ *             If successful, store value in *max and return TRUE.
+ *             If no data available, return FALSE.
  *
- * Extract the two Vars from a join clause's argument list.  Returns
- * NULL for arguments that are not simple vars.
+ * sortop is the "<" comparison operator to use.  (To extract the
+ * minimum instead of the maximum, just pass the ">" operator instead.)
  */
-static void
-get_join_vars(List *args, Var **var1, Var **var2)
+static bool
+get_variable_maximum(Query *root, VariableStatData *vardata,
+                                        Oid sortop, Datum *max)
 {
-       Node       *left,
-                          *right;
+       Datum           tmax = 0;
+       bool            have_max = false;
+       Form_pg_statistic stats;
+       int16           typLen;
+       bool            typByVal;
+       Datum      *values;
+       int                     nvalues;
+       int                     i;
 
-       if (length(args) != 2)
+       if (!HeapTupleIsValid(vardata->statsTuple))
        {
-               *var1 = NULL;
-               *var2 = NULL;
-               return;
+               /* no stats available, so default result */
+               return false;
        }
+       stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
 
-       left = (Node *) lfirst(args);
-       right = (Node *) lsecond(args);
+       get_typlenbyval(vardata->atttype, &typLen, &typByVal);
 
-       /* Ignore any binary-compatible relabeling */
-       if (IsA(left, RelabelType))
-               left = (Node *) ((RelabelType *) left)->arg;
-       if (IsA(right, RelabelType))
-               right = (Node *) ((RelabelType *) right)->arg;
-
-       if (IsA(left, Var))
-               *var1 = (Var *) left;
+       /*
+        * If there is a histogram, grab the last or first value as
+        * appropriate.
+        *
+        * If there is a histogram that is sorted with some other operator than
+        * the one we want, fail --- this suggests that there is data we can't
+        * use.
+        */
+       if (get_attstatsslot(vardata->statsTuple,
+                                                vardata->atttype, vardata->atttypmod,
+                                                STATISTIC_KIND_HISTOGRAM, sortop,
+                                                &values, &nvalues,
+                                                NULL, NULL))
+       {
+               if (nvalues > 0)
+               {
+                       tmax = datumCopy(values[nvalues - 1], typByVal, typLen);
+                       have_max = true;
+               }
+               free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
+       }
        else
-               *var1 = NULL;
+       {
+               Oid                     rsortop = get_commutator(sortop);
 
-       if (IsA(right, Var))
-               *var2 = (Var *) right;
-       else
-               *var2 = NULL;
+               if (OidIsValid(rsortop) &&
+                       get_attstatsslot(vardata->statsTuple,
+                                                        vardata->atttype, vardata->atttypmod,
+                                                        STATISTIC_KIND_HISTOGRAM, rsortop,
+                                                        &values, &nvalues,
+                                                        NULL, NULL))
+               {
+                       if (nvalues > 0)
+                       {
+                               tmax = datumCopy(values[0], typByVal, typLen);
+                               have_max = true;
+                       }
+                       free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
+               }
+               else if (get_attstatsslot(vardata->statsTuple,
+                                                                 vardata->atttype, vardata->atttypmod,
+                                                                 STATISTIC_KIND_HISTOGRAM, InvalidOid,
+                                                                 &values, &nvalues,
+                                                                 NULL, NULL))
+               {
+                       free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
+                       return false;
+               }
+       }
+
+       /*
+        * If we have most-common-values info, look for a large MCV.  This is
+        * needed even if we also have a histogram, since the histogram
+        * excludes the MCVs.  However, usually the MCVs will not be the
+        * extreme values, so avoid unnecessary data copying.
+        */
+       if (get_attstatsslot(vardata->statsTuple,
+                                                vardata->atttype, vardata->atttypmod,
+                                                STATISTIC_KIND_MCV, InvalidOid,
+                                                &values, &nvalues,
+                                                NULL, NULL))
+       {
+               bool            large_mcv = false;
+               FmgrInfo        opproc;
+
+               fmgr_info(get_opcode(sortop), &opproc);
+
+               for (i = 0; i < nvalues; i++)
+               {
+                       if (!have_max)
+                       {
+                               tmax = values[i];
+                               large_mcv = have_max = true;
+                       }
+                       else if (DatumGetBool(FunctionCall2(&opproc, tmax, values[i])))
+                       {
+                               tmax = values[i];
+                               large_mcv = true;
+                       }
+               }
+               if (large_mcv)
+                       tmax = datumCopy(tmax, typByVal, typLen);
+               free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
+       }
+
+       *max = tmax;
+       return have_max;
 }
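get_variable_maximum first takes an endpoint of the histogram (the last entry for the requested sort order, or the first entry if only the commutated order is available), then widens it with any most-common value that compares larger, since MCVs are excluded from the histogram. The loop below restates that widening step as a hypothetical standalone helper; opproc is assumed to be the fmgr lookup of the "<" operator's function, as in the code above.

	#include "postgres.h"
	#include "fmgr.h"

	/*
	 * Hypothetical helper (not in the patch): widen a candidate maximum
	 * with any larger most-common value, using the "<" operator's function.
	 */
	static Datum
	widen_max_with_mcvs(FmgrInfo *opproc, Datum curmax, bool *have_max,
						Datum *mcvs, int nmcvs)
	{
		int			i;

		for (i = 0; i < nmcvs; i++)
		{
			/* take mcvs[i] if there is no candidate yet, or curmax < mcvs[i] */
			if (!*have_max ||
				DatumGetBool(FunctionCall2(opproc, curmax, mcvs[i])))
			{
				curmax = mcvs[i];
				*have_max = true;
			}
		}
		return curmax;
	}
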
 
+
 /*-------------------------------------------------------------------------
  *
  * Pattern analysis functions
@@ -3387,10 +3630,11 @@ pattern_fixed_prefix(Const *patt, Pattern_Type ptype,
  * Estimate the selectivity of a fixed prefix for a pattern match.
  *
  * A fixed prefix "foo" is estimated as the selectivity of the expression
- * "var >= 'foo' AND var < 'fop'" (see also indxqual.c).
+ * "variable >= 'foo' AND variable < 'fop'" (see also indxqual.c).
  *
  * We use the >= and < operators from the specified btree opclass to do the
- * estimation. The given Var and Const must be of the associated datatype.
+ * estimation. The given variable and Const must be of the associated
+ * datatype.
  *
  * XXX Note: we make use of the upper bound to estimate operator selectivity
  * even if the locale is such that we cannot rely on the upper-bound string.
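Concretely, the two one-sided selectivities computed in prefix_selectivity below are merged with the usual range rule, and because each one-sided estimate already excludes NULLs, the NULL fraction must be added back once. A pure-arithmetic sketch of that combination (hypothetical helper, not in the patch):

	/*
	 * Sketch of the range combination for a fixed prefix:
	 *   P(var >= 'foo' AND var < 'fop')
	 *     = P(var >= 'foo') + P(var < 'fop') - 1 + nullfrac
	 * The two one-sided estimates together cover exactly the non-NULL rows,
	 * whose total probability is 1 - nullfrac, so subtracting 1 removes the
	 * NULL fraction twice and adding nullfrac restores the balance.
	 */
	static double
	combine_prefix_range(double ge_sel, double lt_sel, double nullfrac)
	{
		double		sel = ge_sel + lt_sel - 1.0;

		sel += nullfrac;			/* undo the double exclusion of NULLs */
		return sel;
	}
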
@@ -3398,7 +3642,8 @@ pattern_fixed_prefix(Const *patt, Pattern_Type ptype,
  * more useful to use the upper-bound code than not.
  */
 static Selectivity
-prefix_selectivity(Query *root, Var *var, Oid opclass, Const *prefixcon)
+prefix_selectivity(Query *root, VariableStatData *vardata,
+                                  Oid opclass, Const *prefixcon)
 {
        Selectivity prefixsel;
        Oid                     cmpopr;
@@ -3409,7 +3654,7 @@ prefix_selectivity(Query *root, Var *var, Oid opclass, Const *prefixcon)
                                                                BTGreaterEqualStrategyNumber);
        if (cmpopr == InvalidOid)
                elog(ERROR, "no >= operator for opclass %u", opclass);
-       cmpargs = makeList2(var, prefixcon);
+       cmpargs = makeList2(vardata->var, prefixcon);
        /* Assume scalargtsel is appropriate for all supported types */
        prefixsel = DatumGetFloat8(DirectFunctionCall4(scalargtsel,
                                                                                                   PointerGetDatum(root),
@@ -3431,7 +3676,7 @@ prefix_selectivity(Query *root, Var *var, Oid opclass, Const *prefixcon)
                                                                        BTLessStrategyNumber);
                if (cmpopr == InvalidOid)
                        elog(ERROR, "no < operator for opclass %u", opclass);
-               cmpargs = makeList2(var, greaterstrcon);
+               cmpargs = makeList2(vardata->var, greaterstrcon);
                /* Assume scalarltsel is appropriate for all supported types */
                topsel = DatumGetFloat8(DirectFunctionCall4(scalarltsel,
                                                                                                        PointerGetDatum(root),
@@ -3446,7 +3691,7 @@ prefix_selectivity(Query *root, Var *var, Oid opclass, Const *prefixcon)
                prefixsel = topsel + prefixsel - 1.0;
 
                /* Adjust for double-exclusion of NULLs */
-               prefixsel += nulltestsel(root, IS_NULL, (Node *) var, var->varno);
+               prefixsel += nulltestsel(root, IS_NULL, vardata->var, 0);
 
                /*
                 * A zero or slightly negative prefixsel should be converted into
@@ -4034,56 +4279,69 @@ btcostestimate(PG_FUNCTION_ARGS)
        Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(5);
        Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6);
        double     *indexCorrelation = (double *) PG_GETARG_POINTER(7);
+       Oid                     relid;
+       AttrNumber      colnum;
+       HeapTuple       tuple;
 
        genericcostestimate(root, rel, index, indexQuals,
                                                indexStartupCost, indexTotalCost,
                                                indexSelectivity, indexCorrelation);
 
        /*
-        * If the first column is a simple variable, and we can get an
-        * estimate for its ordering correlation C from pg_statistic, estimate
-        * the index correlation as C / number-of-columns. (The idea here is
+        * If we can get an estimate of the first column's ordering correlation C
+        * from pg_statistic, estimate the index correlation as C for a single-
+        * column index, or C * 0.75 for multiple columns.  (The idea here is
         * that multiple columns dilute the importance of the first column's
-        * ordering, but don't negate it entirely.)
+        * ordering, but don't negate it entirely.  Before 7.5 we divided the
+        * correlation by the number of columns, but that seems too strong.)
         */
        if (index->indexkeys[0] != 0)
        {
-               Oid                     relid;
-               HeapTuple       tuple;
-
+               /* Simple variable --- look to stats for the underlying table */
                relid = getrelid(rel->relid, root->rtable);
                Assert(relid != InvalidOid);
-               tuple = SearchSysCache(STATRELATT,
-                                                          ObjectIdGetDatum(relid),
-                                                          Int16GetDatum(index->indexkeys[0]),
-                                                          0, 0);
-               if (HeapTupleIsValid(tuple))
+               colnum = index->indexkeys[0];
+       }
+       else
+       {
+               /* Expression --- maybe there are stats for the index itself */
+               relid = index->indexoid;
+               colnum = 1;
+       }
+
+       tuple = SearchSysCache(STATRELATT,
+                                                  ObjectIdGetDatum(relid),
+                                                  Int16GetDatum(colnum),
+                                                  0, 0);
+
+       if (HeapTupleIsValid(tuple))
+       {
+               Oid                     typid;
+               int32           typmod;
+               float4     *numbers;
+               int                     nnumbers;
+
+               /* XXX this code would break with different storage type */
+               get_atttypetypmod(relid, colnum, &typid, &typmod);
+
+               if (get_attstatsslot(tuple, typid, typmod,
+                                                        STATISTIC_KIND_CORRELATION,
+                                                        index->ordering[0],
+                                                        NULL, NULL, &numbers, &nnumbers))
                {
-                       Oid                     typid;
-                       int32           typmod;
-                       float4     *numbers;
-                       int                     nnumbers;
-
-                       get_atttypetypmod(relid, index->indexkeys[0],
-                                                         &typid, &typmod);
-                       if (get_attstatsslot(tuple, typid, typmod,
-                                                                STATISTIC_KIND_CORRELATION,
-                                                                index->ordering[0],
-                                                                NULL, NULL, &numbers, &nnumbers))
-                       {
-                               double          varCorrelation;
-                               int                     nKeys;
+                       double          varCorrelation;
 
-                               Assert(nnumbers == 1);
-                               varCorrelation = numbers[0];
-                               nKeys = index->ncolumns;
+                       Assert(nnumbers == 1);
+                       varCorrelation = numbers[0];
 
-                               *indexCorrelation = varCorrelation / nKeys;
+                       if (index->ncolumns > 1)
+                               *indexCorrelation = varCorrelation * 0.75;
+                       else
+                               *indexCorrelation = varCorrelation;
 
-                               free_attstatsslot(typid, NULL, 0, numbers, nnumbers);
-                       }
-                       ReleaseSysCache(tuple);
+                       free_attstatsslot(typid, NULL, 0, numbers, nnumbers);
                }
+               ReleaseSysCache(tuple);
        }
 
        PG_RETURN_VOID();
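The net effect of the rewritten correlation logic is twofold: the statistics row can now come either from the underlying table column or from the index itself (for expression indexes), and the multi-column damping changes from dividing by the number of key columns to a fixed factor of 0.75. The second point in isolation, restated as a hypothetical helper:

	/*
	 * Hypothetical restatement of the new rule (not part of the patch):
	 * a single-column btree keeps the first key's correlation C unchanged,
	 * while a multi-column btree uses C * 0.75 instead of the old C / ncolumns.
	 */
	static double
	btree_index_correlation(double first_key_correlation, int ncolumns)
	{
		return (ncolumns > 1) ? first_key_correlation * 0.75
							  : first_key_correlation;
	}
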
index 379e2ba7a5e614e3fb38352e7523d4f99a9a71db..3186b8d1c1f6e7aca0f43d265279cb4b1eb65e43 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/optimizer/pathnode.h,v 1.53 2003/11/29 22:41:07 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/optimizer/pathnode.h,v 1.54 2004/02/17 00:52:53 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -77,6 +77,7 @@ extern HashPath *create_hashjoin_path(Query *root,
 extern void build_base_rel(Query *root, int relid);
 extern RelOptInfo *build_other_rel(Query *root, int relid);
 extern RelOptInfo *find_base_rel(Query *root, int relid);
+extern RelOptInfo *find_join_rel(Query *root, Relids relids);
 extern RelOptInfo *build_join_rel(Query *root,
                           Relids joinrelids,
                           RelOptInfo *outer_rel,
index 873af8b9876bc15b642ebd96a5b231e0a1f9a15d..797e0a4c700cc735f5a9d39c3d9df47b56aea5a0 100644 (file)
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.16 2003/11/29 22:41:16 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.17 2004/02/17 00:52:53 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -77,6 +77,9 @@ extern void mergejoinscansel(Query *root, Node *clause,
 extern double estimate_num_groups(Query *root, List *groupExprs,
                                        double input_rows);
 
+extern Selectivity estimate_hash_bucketsize(Query *root, Node *hashkey,
+                                                                                       int nbuckets);
+
 extern Datum btcostestimate(PG_FUNCTION_ARGS);
 extern Datum rtcostestimate(PG_FUNCTION_ARGS);
 extern Datum hashcostestimate(PG_FUNCTION_ARGS);