Add support for EUI-64 MAC addresses as macaddr8

[postgresql] / src / backend / utils / adt / selfuncs.c
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c

index f70bdaa328f8f81f6c6cc9f88895eb0670416b6f..bb9a5446861bde72e1caee147ef7e0c382b8de16 100644 (file)
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -7,10 +7,10 @@
   *       Selectivity routines are registered in the pg_operator catalog
   *       in the "oprrest" and "oprjoin" attributes.
   *
- *       Index cost functions are registered in the pg_am catalog
- *       in the "amcostestimate" attribute.
+ *       Index cost functions are located via the index AM's API struct,
+ *       which is obtained from the handler function registered in pg_am.
   *
- * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
@@ -72,7 +72,7 @@
   *             float8 oprjoin (internal, oid, internal, int2, internal);
   *
   * (Before Postgres 8.4, join estimators had only the first four of these
- * parameters. That signature is still allowed, but deprecated.)  The
+ * parameters.  That signature is still allowed, but deprecated.)  The
   * relationship between jointype and sjinfo is explained in the comments for
   * clause_selectivity() --- the short version is that jointype is usually
   * best ignored in favor of examining sjinfo.
@@ -98,13 +98,16 @@
  #include "postgres.h"
  
  #include <ctype.h>
+#include <float.h>
  #include <math.h>
  
  #include "access/gin.h"
  #include "access/htup_details.h"
  #include "access/sysattr.h"
  #include "catalog/index.h"
+#include "catalog/pg_am.h"
  #include "catalog/pg_collation.h"
+#include "catalog/pg_operator.h"
  #include "catalog/pg_opfamily.h"
  #include "catalog/pg_statistic.h"
  #include "catalog/pg_type.h"
@@ -128,17 +131,18 @@
  #include "utils/date.h"
  #include "utils/datum.h"
  #include "utils/fmgroids.h"
+#include "utils/index_selfuncs.h"
  #include "utils/lsyscache.h"
  #include "utils/nabstime.h"
  #include "utils/pg_locale.h"
  #include "utils/rel.h"
  #include "utils/selfuncs.h"
-#include "utils/snapmgr.h"
  #include "utils/spccache.h"
  #include "utils/syscache.h"
  #include "utils/timestamp.h"
  #include "utils/tqual.h"
  #include "utils/typcache.h"
+#include "utils/varlena.h"
  
  
  /* Hooks for plugins to get control when we ask for stats */
@@ -210,7 +214,7 @@ static List *add_predicate_to_quals(IndexOptInfo *index, List *indexQuals);
   *
   * Note: this routine is also used to estimate selectivity for some
   * operators that are not "=" but have comparable selectivity behavior,
- * such as "~=" (geometric approximate-match). Even for "=", we must
+ * such as "~=" (geometric approximate-match).  Even for "=", we must
   * keep in mind that the left and right datatypes may differ.
   */
  Datum
@@ -274,7 +278,7 @@ var_eq_const(VariableStatData *vardata, Oid operator,
  
         /*
          * If we matched the var to a unique index or DISTINCT clause, assume
-        * there is exactly one match regardless of anything else.      (This is
+        * there is exactly one match regardless of anything else.  (This is
          * slightly bogus, since the index or clause's equality operator might be
          * different from ours, but it's much more likely to be right than
          * ignoring the information.)
@@ -297,7 +301,7 @@ var_eq_const(VariableStatData *vardata, Oid operator,
                 /*
                  * Is the constant "=" to any of the column's most common values?
                  * (Although the given operator may not really be "=", we will assume
-                * that seeing whether it returns TRUE is an appropriate test.  If you
+                * that seeing whether it returns TRUE is an appropriate test.  If you
                  * don't like this, maybe you shouldn't be using eqsel for your
                  * operator...)
                  */
@@ -409,7 +413,7 @@ var_eq_non_const(VariableStatData *vardata, Oid operator,
  
         /*
          * If we matched the var to a unique index or DISTINCT clause, assume
-        * there is exactly one match regardless of anything else.      (This is
+        * there is exactly one match regardless of anything else.  (This is
          * slightly bogus, since the index or clause's equality operator might be
          * different from ours, but it's much more likely to be right than
          * ignoring the information.)
@@ -433,7 +437,7 @@ var_eq_non_const(VariableStatData *vardata, Oid operator,
                  * result averaged over all possible values whether common or
                  * uncommon.  (Essentially, we are assuming that the not-yet-known
                  * comparison value is equally likely to be any of the possible
-                * values, regardless of their frequency in the table.  Is that a good
+                * values, regardless of their frequency in the table.  Is that a good
                  * idea?)
                  */
                 selec = 1.0 - stats->stanullfrac;
@@ -656,7 +660,7 @@ mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
   * essentially using the histogram just as a representative sample.  However,
   * small histograms are unlikely to be all that representative, so the caller
   * should be prepared to fall back on some other estimation approach when the
- * histogram is missing or very small. It may also be prudent to combine this
+ * histogram is missing or very small.  It may also be prudent to combine this
   * approach with another one when the histogram is small.
   *
   * If the actual histogram size is not at least min_hist_size, we won't bother
@@ -674,7 +678,7 @@ mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
   *
   * Note that the result disregards both the most-common-values (if any) and
   * null entries.  The caller is expected to combine this result with
- * statistics for those portions of the column population.     It may also be
+ * statistics for those portions of the column population.  It may also be
   * prudent to clamp the result range, ie, disbelieve exact 0 or 1 outputs.
   */
  double
@@ -787,7 +791,7 @@ ineq_histogram_selectivity(PlannerInfo *root,
                          *
                          * If the binary search accesses the first or last histogram
                          * entry, we try to replace that endpoint with the true column min
-                        * or max as found by get_actual_variable_range().      This
+                        * or max as found by get_actual_variable_range().  This
                          * ameliorates misestimates when the min or max is moving as a
                          * result of changes since the last ANALYZE.  Note that this could
                          * result in effectively including MCVs into the histogram that
@@ -891,7 +895,7 @@ ineq_histogram_selectivity(PlannerInfo *root,
  
                                                 /*
                                                  * Watch out for the possibility that we got a NaN or
-                                                * Infinity from the division.  This can happen
+                                                * Infinity from the division.  This can happen
                                                  * despite the previous checks, if for example "low"
                                                  * is -Infinity.
                                                  */
@@ -906,7 +910,7 @@ ineq_histogram_selectivity(PlannerInfo *root,
                                          * Ideally we'd produce an error here, on the grounds that
                                          * the given operator shouldn't have scalarXXsel
                                          * registered as its selectivity func unless we can deal
-                                        * with its operand types.      But currently, all manner of
+                                        * with its operand types.  But currently, all manner of
                                          * stuff is invoking scalarXXsel, so give a default
                                          * estimate until that can be fixed.
                                          */
@@ -932,7 +936,7 @@ ineq_histogram_selectivity(PlannerInfo *root,
  
                         /*
                          * The histogram boundaries are only approximate to begin with,
-                        * and may well be out of date anyway.  Therefore, don't believe
+                        * and may well be out of date anyway.  Therefore, don't believe
                          * extremely small or large selectivity estimates --- unless we
                          * got actual current endpoint values from the table.
                          */
@@ -1129,7 +1133,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
  
         /*
          * If this is for a NOT LIKE or similar operator, get the corresponding
-        * positive-match operator and work with that.  Set result to the correct
+        * positive-match operator and work with that.  Set result to the correct
          * default estimate, too.
          */
         if (negate)
@@ -1215,7 +1219,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
  
         /*
          * Pull out any fixed prefix implied by the pattern, and estimate the
-        * fractional selectivity of the remainder of the pattern.      Unlike many of
+        * fractional selectivity of the remainder of the pattern.  Unlike many of
          * the other functions in this file, we use the pattern operator's actual
          * collation for this step.  This is not because we expect the collation
          * to make a big difference in the selectivity estimate (it seldom would),
@@ -1333,7 +1337,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
                 /*
                  * If we have most-common-values info, add up the fractions of the MCV
                  * entries that satisfy MCV OP PATTERN.  These fractions contribute
-                * directly to the result selectivity.  Also add up the total fraction
+                * directly to the result selectivity.  Also add up the total fraction
                  * represented by MCV entries.
                  */
                 mcv_selec = mcv_selectivity(&vardata, &opproc, constval, true,
@@ -1440,6 +1444,50 @@ icnlikesel(PG_FUNCTION_ARGS)
         PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC, true));
  }
  
+/*
+ *             boolvarsel              - Selectivity of Boolean variable.
+ *
+ * This can actually be called on any boolean-valued expression.  If it
+ * involves only Vars of the specified relation, and if there are statistics
+ * about the Var or expression (the latter is possible if it's indexed) then
+ * we'll produce a real estimate; otherwise it's just a default.
+ */
+Selectivity
+boolvarsel(PlannerInfo *root, Node *arg, int varRelid)
+{
+       VariableStatData vardata;
+       double          selec;
+
+       examine_variable(root, arg, varRelid, &vardata);
+       if (HeapTupleIsValid(vardata.statsTuple))
+       {
+               /*
+                * A boolean variable V is equivalent to the clause V = 't', so we
+                * compute the selectivity as if that is what we have.
+                */
+               selec = var_eq_const(&vardata, BooleanEqualOperator,
+                                                        BoolGetDatum(true), false, true);
+       }
+       else if (is_funcclause(arg))
+       {
+               /*
+                * If we have no stats and it's a function call, estimate 0.3333333.
+                * This seems a pretty unprincipled choice, but Postgres has been
+                * using that estimate for function calls since 1992.  The hoariness
+                * of this behavior suggests that we should not be in too much hurry
+                * to use another value.
+                */
+               selec = 0.3333333;
+       }
+       else
+       {
+               /* Otherwise, the default estimate is 0.5 */
+               selec = 0.5;
+       }
+       ReleaseVariableStats(vardata);
+       return selec;
+}
+
  /*
   *             booltestsel             - Selectivity of BooleanTest Node.
   */
@@ -1839,7 +1887,7 @@ scalararraysel(PlannerInfo *root,
  
                 /*
                  * For generic operators, we assume the probability of success is
-                * independent for each array element.  But for "= ANY" or "<> ALL",
+                * independent for each array element.  But for "= ANY" or "<> ALL",
                  * if the array elements are distinct (which'd typically be the case)
                  * then the probabilities are disjoint, and we should just sum them.
                  *
@@ -2254,9 +2302,9 @@ eqjoinsel_inner(Oid operator,
         if (have_mcvs1 && have_mcvs2)
         {
                 /*
-                * We have most-common-value lists for both relations.  Run through
+                * We have most-common-value lists for both relations.  Run through
                  * the lists to see which MCVs actually join to each other with the
-                * given operator.      This allows us to determine the exact join
+                * given operator.  This allows us to determine the exact join
                  * selectivity for the portion of the relations represented by the MCV
                  * lists.  We still have to estimate for the remaining population, but
                  * in a skewed distribution this gives us a big leg up in accuracy.
@@ -2288,7 +2336,7 @@ eqjoinsel_inner(Oid operator,
  
                 /*
                  * Note we assume that each MCV will match at most one member of the
-                * other MCV list.      If the operator isn't really equality, there could
+                * other MCV list.  If the operator isn't really equality, there could
                  * be multiple matches --- but we don't look for them, both for speed
                  * and because the math wouldn't add up...
                  */
@@ -2453,7 +2501,7 @@ eqjoinsel_semi(Oid operator,
  
         /*
          * We clamp nd2 to be not more than what we estimate the inner relation's
-        * size to be.  This is intuitively somewhat reasonable since obviously
+        * size to be.  This is intuitively somewhat reasonable since obviously
          * there can't be more than that many distinct values coming from the
          * inner rel.  The reason for the asymmetry (ie, that we don't clamp nd1
          * likewise) is that this is the only pathway by which restriction clauses
@@ -2465,10 +2513,24 @@ eqjoinsel_semi(Oid operator,
          * We can apply this clamping both with respect to the base relation from
          * which the join variable comes (if there is just one), and to the
          * immediate inner input relation of the current join.
+        *
+        * If we clamp, we can treat nd2 as being a non-default estimate; it's not
+        * great, maybe, but it didn't come out of nowhere either.  This is most
+        * helpful when the inner relation is empty and consequently has no stats.
          */
         if (vardata2->rel)
-               nd2 = Min(nd2, vardata2->rel->rows);
-       nd2 = Min(nd2, inner_rel->rows);
+       {
+               if (nd2 >= vardata2->rel->rows)
+               {
+                       nd2 = vardata2->rel->rows;
+                       isdefault2 = false;
+               }
+       }
+       if (nd2 >= inner_rel->rows)
+       {
+               nd2 = inner_rel->rows;
+               isdefault2 = false;
+       }
  
         if (HeapTupleIsValid(vardata1->statsTuple))
         {
@@ -2498,9 +2560,9 @@ eqjoinsel_semi(Oid operator,
         if (have_mcvs1 && have_mcvs2 && OidIsValid(operator))
         {
                 /*
-                * We have most-common-value lists for both relations.  Run through
+                * We have most-common-value lists for both relations.  Run through
                  * the lists to see which MCVs actually join to each other with the
-                * given operator.      This allows us to determine the exact join
+                * given operator.  This allows us to determine the exact join
                  * selectivity for the portion of the relations represented by the MCV
                  * lists.  We still have to estimate for the remaining population, but
                  * in a skewed distribution this gives us a big leg up in accuracy.
@@ -2531,7 +2593,7 @@ eqjoinsel_semi(Oid operator,
  
                 /*
                  * Note we assume that each MCV will match at most one member of the
-                * other MCV list.      If the operator isn't really equality, there could
+                * other MCV list.  If the operator isn't really equality, there could
                  * be multiple matches --- but we don't look for them, both for speed
                  * and because the math wouldn't add up...
                  */
@@ -2568,7 +2630,7 @@ eqjoinsel_semi(Oid operator,
  
                 /*
                  * Now we need to estimate the fraction of relation 1 that has at
-                * least one join partner.      We know for certain that the matched MCVs
+                * least one join partner.  We know for certain that the matched MCVs
                  * do, so that gives us a lower bound, but we're really in the dark
                  * about everything else.  Our crude approach is: if nd1 <= nd2 then
                  * assume all non-null rel1 rows have join partners, else assume for
@@ -3159,6 +3221,8 @@ add_unique_group_var(PlannerInfo *root, List *varinfos,
   *     groupExprs - list of expressions being grouped by
   *     input_rows - number of rows estimated to arrive at the group/unique
   *             filter step
+ *     pgset - NULL, or a List** pointing to a grouping set to filter the
+ *             groupExprs against
   *
   * Given the lack of any cross-correlation statistics in the system, it's
   * impossible to do anything really trustworthy with GROUP BY conditions
@@ -3166,11 +3230,11 @@ add_unique_group_var(PlannerInfo *root, List *varinfos,
   * case (all possible cross-product terms actually appear as groups) since
   * very often the grouped-by Vars are highly correlated.  Our current approach
   * is as follows:
- *     1.      Expressions yielding boolean are assumed to contribute two groups,
+ *     1.  Expressions yielding boolean are assumed to contribute two groups,
   *             independently of their content, and are ignored in the subsequent
- *             steps.  This is mainly because tests like "col IS NULL" break the
+ *             steps.  This is mainly because tests like "col IS NULL" break the
   *             heuristic used in step 2 especially badly.
- *     2.      Reduce the given expressions to a list of unique Vars used.  For
+ *     2.  Reduce the given expressions to a list of unique Vars used.  For
   *             example, GROUP BY a, a + b is treated the same as GROUP BY a, b.
   *             It is clearly correct not to count the same Var more than once.
   *             It is also reasonable to treat f(x) the same as x: f() cannot
@@ -3180,25 +3244,25 @@ add_unique_group_var(PlannerInfo *root, List *varinfos,
   *             As a special case, if a GROUP BY expression can be matched to an
   *             expressional index for which we have statistics, then we treat the
   *             whole expression as though it were just a Var.
- *     3.      If the list contains Vars of different relations that are known equal
+ *     3.  If the list contains Vars of different relations that are known equal
   *             due to equivalence classes, then drop all but one of the Vars from each
   *             known-equal set, keeping the one with smallest estimated # of values
   *             (since the extra values of the others can't appear in joined rows).
   *             Note the reason we only consider Vars of different relations is that
   *             if we considered ones of the same rel, we'd be double-counting the
   *             restriction selectivity of the equality in the next step.
- *     4.      For Vars within a single source rel, we multiply together the numbers
+ *     4.  For Vars within a single source rel, we multiply together the numbers
   *             of values, clamp to the number of rows in the rel (divided by 10 if
- *             more than one Var), and then multiply by the selectivity of the
- *             restriction clauses for that rel.  When there's more than one Var,
- *             the initial product is probably too high (it's the worst case) but
- *             clamping to a fraction of the rel's rows seems to be a helpful
- *             heuristic for not letting the estimate get out of hand.  (The factor
- *             of 10 is derived from pre-Postgres-7.4 practice.)  Multiplying
- *             by the restriction selectivity is effectively assuming that the
- *             restriction clauses are independent of the grouping, which is a crummy
- *             assumption, but it's hard to do better.
- *     5.      If there are Vars from multiple rels, we repeat step 4 for each such
+ *             more than one Var), and then multiply by a factor based on the
+ *             selectivity of the restriction clauses for that rel.  When there's
+ *             more than one Var, the initial product is probably too high (it's the
+ *             worst case) but clamping to a fraction of the rel's rows seems to be a
+ *             helpful heuristic for not letting the estimate get out of hand.  (The
+ *             factor of 10 is derived from pre-Postgres-7.4 practice.)  The factor
+ *             we multiply by to adjust for the restriction selectivity assumes that
+ *             the restriction clauses are independent of the grouping, which may not
+ *             be a valid assumption, but it's hard to do better.
+ *     5.  If there are Vars from multiple rels, we repeat step 4 for each such
   *             rel, and multiply the results together.
   * Note that rels not containing grouped Vars are ignored completely, as are
   * join clauses.  Such rels cannot increase the number of groups, and we
@@ -3206,11 +3270,13 @@ add_unique_group_var(PlannerInfo *root, List *varinfos,
   * but we don't have the info to do better).
   */
  double
-estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows)
+estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
+                                       List **pgset)
  {
         List       *varinfos = NIL;
         double          numdistinct;
         ListCell   *l;
+       int                     i;
  
         /*
          * We don't ever want to return an estimate of zero groups, as that tends
@@ -3225,11 +3291,11 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows)
          * for normal cases with GROUP BY or DISTINCT, but it is possible for
          * corner cases with set operations.)
          */
-       if (groupExprs == NIL)
+       if (groupExprs == NIL || (pgset && list_length(*pgset) < 1))
                 return 1.0;
  
         /*
-        * Count groups derived from boolean grouping expressions.      For other
+        * Count groups derived from boolean grouping expressions.  For other
          * expressions, find the unique Vars used, treating an expression as a Var
          * if we can find stats for it.  For each one, record the statistical
          * estimate of number of distinct values (total in its table, without
@@ -3237,6 +3303,7 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows)
          */
         numdistinct = 1.0;
  
+       i = 0;
         foreach(l, groupExprs)
         {
                 Node       *groupexpr = (Node *) lfirst(l);
@@ -3244,6 +3311,10 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows)
                 List       *varshere;
                 ListCell   *l2;
  
+               /* is expression in this grouping set? */
+               if (pgset && !list_member_int(*pgset, i++))
+                       continue;
+
                 /* Short-circuit for expressions returning boolean */
                 if (exprType(groupexpr) == BOOLOID)
                 {
@@ -3273,7 +3344,8 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows)
                  * down to ignoring the possible addition of nulls to the result set).
                  */
                 varshere = pull_var_clause(groupexpr,
-                                                                  PVC_RECURSE_AGGREGATES,
+                                                                  PVC_RECURSE_AGGREGATES |
+                                                                  PVC_RECURSE_WINDOWFUNCS |
                                                                    PVC_RECURSE_PLACEHOLDERS);
  
                 /*
@@ -3318,7 +3390,7 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows)
          * Group Vars by relation and estimate total numdistinct.
          *
          * For each iteration of the outer loop, we process the frontmost Var in
-        * varinfos, plus all other Vars in the same relation.  We remove these
+        * varinfos, plus all other Vars in the same relation.  We remove these
          * Vars from the newvarinfos list for the next iteration. This is the
          * easiest way to group Vars of same rel together.
          */
@@ -3383,9 +3455,51 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows)
                                 reldistinct = clamp;
  
                         /*
-                        * Multiply by restriction selectivity.
+                        * Update the estimate based on the restriction selectivity,
+                        * guarding against division by zero when reldistinct is zero.
+                        * Also skip this if we know that we are returning all rows.
                          */
-                       reldistinct *= rel->rows / rel->tuples;
+                       if (reldistinct > 0 && rel->rows < rel->tuples)
+                       {
+                               /*
+                                * Given a table containing N rows with n distinct values in a
+                                * uniform distribution, if we select p rows at random then
+                                * the expected number of distinct values selected is
+                                *
+                                * n * (1 - product((N-N/n-i)/(N-i), i=0..p-1))
+                                *
+                                * = n * (1 - (N-N/n)! / (N-N/n-p)! * (N-p)! / N!)
+                                *
+                                * See "Approximating block accesses in database
+                                * organizations", S. B. Yao, Communications of the ACM,
+                                * Volume 20 Issue 4, April 1977 Pages 260-261.
+                                *
+                                * Alternatively, re-arranging the terms from the factorials,
+                                * this may be written as
+                                *
+                                * n * (1 - product((N-p-i)/(N-i), i=0..N/n-1))
+                                *
+                                * This form of the formula is more efficient to compute in
+                                * the common case where p is larger than N/n.  Additionally,
+                                * as pointed out by Dell'Era, if i << N for all terms in the
+                                * product, it can be approximated by
+                                *
+                                * n * (1 - ((N-p)/N)^(N/n))
+                                *
+                                * See "Expected distinct values when selecting from a bag
+                                * without replacement", Alberto Dell'Era,
+                                * http://www.adellera.it/investigations/distinct_balls/.
+                                *
+                                * The condition i << N is equivalent to n >> 1, so this is a
+                                * good approximation when the number of distinct values in
+                                * the table is large.  It turns out that this formula also
+                                * works well even when n is small.
+                                */
+                               reldistinct *=
+                                       (1 - pow((rel->tuples - rel->rows) / rel->tuples,
+                                                        rel->tuples / reldistinct));
+                       }
+                       reldistinct = clamp_row_est(reldistinct);
  
                         /*
                          * Update estimate of total distinct groups.
@@ -3419,11 +3533,11 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows)
   * distribution, so this will have to do for now.
   *
   * We are passed the number of buckets the executor will use for the given
- * input relation.     If the data were perfectly distributed, with the same
+ * input relation.  If the data were perfectly distributed, with the same
   * number of tuples going into each available bucket, then the bucketsize
   * fraction would be 1/nbuckets.  But this happy state of affairs will occur
   * only if (a) there are at least nbuckets distinct data values, and (b)
- * we have a not-too-skewed data distribution. Otherwise the buckets will
+ * we have a not-too-skewed data distribution.  Otherwise the buckets will
   * be nonuniformly occupied.  If the other relation in the join has a key
   * distribution similar to this one's, then the most-loaded buckets are
   * exactly those that will be probed most often.  Therefore, the "average"
@@ -3484,8 +3598,11 @@ estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey, double nbuckets)
          * XXX Possibly better way, but much more expensive: multiply by
          * selectivity of rel's restriction clauses that mention the target Var.
          */
-       if (vardata.rel)
+       if (vardata.rel && vardata.rel->tuples > 0)
+       {
                 ndistinct *= vardata.rel->rows / vardata.rel->tuples;
+               ndistinct = clamp_row_est(ndistinct);
+       }
  
         /*
          * Initial estimate of bucketsize fraction is 1/nbuckets as long as the
@@ -3596,7 +3713,7 @@ convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
          * operators to estimate selectivity for the other's.  This is outright
          * wrong in some cases --- in particular signed versus unsigned
          * interpretation could trip us up.  But it's useful enough in the
-        * majority of cases that we do it anyway.      Should think about more
+        * majority of cases that we do it anyway.  Should think about more
          * rigorous ways to do it.
          */
         switch (valuetypid)
@@ -3620,6 +3737,8 @@ convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
                 case REGTYPEOID:
                 case REGCONFIGOID:
                 case REGDICTIONARYOID:
+               case REGROLEOID:
+               case REGNAMESPACEOID:
                         *scaledvalue = convert_numeric_to_scalar(value, valuetypid);
                         *scaledlobound = convert_numeric_to_scalar(lobound, boundstypid);
                         *scaledhibound = convert_numeric_to_scalar(hibound, boundstypid);
@@ -3681,6 +3800,7 @@ convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
                 case INETOID:
                 case CIDROID:
                 case MACADDROID:
+               case MACADDR8OID:
                         *scaledvalue = convert_network_to_scalar(value, valuetypid);
                         *scaledlobound = convert_network_to_scalar(lobound, boundstypid);
                         *scaledhibound = convert_network_to_scalar(hibound, boundstypid);
@@ -3725,6 +3845,8 @@ convert_numeric_to_scalar(Datum value, Oid typid)
                 case REGTYPEOID:
                 case REGCONFIGOID:
                 case REGDICTIONARYOID:
+               case REGROLEOID:
+               case REGNAMESPACEOID:
                         /* we can treat OIDs as integers... */
                         return (double) DatumGetObjectId(value);
         }
@@ -3849,10 +3971,16 @@ convert_one_string_to_scalar(char *value, int rangelo, int rangehi)
                 return 0.0;                             /* empty string has scalar value 0 */
  
         /*
-        * Since base is at least 10, need not consider more than about 20 chars
+        * There seems little point in considering more than a dozen bytes from
+        * the string.  Since base is at least 10, that will give us nominal
+        * resolution of at least 12 decimal digits, which is surely far more
+        * precision than this estimation technique has got anyway (especially in
+        * non-C locales).  Also, even with the maximum possible base of 256, this
+        * ensures denom cannot grow larger than 256^13 = 2.03e31, which will not
+        * overflow on any known machine.
          */
-       if (slen > 20)
-               slen = 20;
+       if (slen > 12)
+               slen = 12;
  
         /* Convert initial characters to fraction */
         base = rangehi - rangelo + 1;
@@ -3920,16 +4048,8 @@ convert_string_datum(Datum value, Oid typid)
                 size_t xfrmlen2 PG_USED_FOR_ASSERTS_ONLY;
  
                 /*
-                * Note: originally we guessed at a suitable output buffer size, and
-                * only needed to call strxfrm twice if our guess was too small.
-                * However, it seems that some versions of Solaris have buggy strxfrm
-                * that can write past the specified buffer length in that scenario.
-                * So, do it the dumb way for portability.
-                *
-                * Yet other systems (e.g., glibc) sometimes return a smaller value
-                * from the second call than the first; thus the Assert must be <= not
-                * == as you'd expect.  Can't any of these people program their way
-                * out of a paper bag?
+                * XXX: We could guess at a suitable output buffer size and only call
+                * strxfrm twice if our guess is too small.
                  *
                  * XXX: strxfrm doesn't support UTF-8 encoding on Win32, it can return
                  * bogus data or set an error. This is not really a problem unless it
@@ -3962,6 +4082,11 @@ convert_string_datum(Datum value, Oid typid)
  #endif
                 xfrmstr = (char *) palloc(xfrmlen + 1);
                 xfrmlen2 = strxfrm(xfrmstr, val, xfrmlen + 1);
+
+               /*
+                * Some systems (e.g., glibc) can return a smaller value from the
+                * second call than the first; thus the Assert must be <= not ==.
+                */
                 Assert(xfrmlen2 <= xfrmlen);
                 pfree(val);
                 val = xfrmstr;
@@ -4089,31 +4214,17 @@ convert_timevalue_to_scalar(Datum value, Oid typid)
                                  * average month length of 365.25/12.0 days.  Not too
                                  * accurate, but plenty good enough for our purposes.
                                  */
-#ifdef HAVE_INT64_TIMESTAMP
                                 return interval->time + interval->day * (double) USECS_PER_DAY +
                                         interval->month * ((DAYS_PER_YEAR / (double) MONTHS_PER_YEAR) * USECS_PER_DAY);
-#else
-                               return interval->time + interval->day * SECS_PER_DAY +
-                                       interval->month * ((DAYS_PER_YEAR / (double) MONTHS_PER_YEAR) * (double) SECS_PER_DAY);
-#endif
                         }
                 case RELTIMEOID:
-#ifdef HAVE_INT64_TIMESTAMP
                         return (DatumGetRelativeTime(value) * 1000000.0);
-#else
-                       return DatumGetRelativeTime(value);
-#endif
                 case TINTERVALOID:
                         {
                                 TimeInterval tinterval = DatumGetTimeInterval(value);
  
-#ifdef HAVE_INT64_TIMESTAMP
                                 if (tinterval->status != 0)
                                         return ((tinterval->data[1] - tinterval->data[0]) * 1000000.0);
-#else
-                               if (tinterval->status != 0)
-                                       return tinterval->data[1] - tinterval->data[0];
-#endif
                                 return 0;               /* for lack of a better idea */
                         }
                 case TIMEOID:
@@ -4123,11 +4234,7 @@ convert_timevalue_to_scalar(Datum value, Oid typid)
                                 TimeTzADT  *timetz = DatumGetTimeTzADTP(value);
  
                                 /* use GMT-equivalent time */
-#ifdef HAVE_INT64_TIMESTAMP
                                 return (double) (timetz->time + (timetz->zone * 1000000.0));
-#else
-                               return (double) (timetz->time + timetz->zone);
-#endif
                         }
         }
  
@@ -4180,7 +4287,7 @@ get_restriction_variable(PlannerInfo *root, List *args, int varRelid,
         right = (Node *) lsecond(args);
  
         /*
-        * Examine both sides.  Note that when varRelid is nonzero, Vars of other
+        * Examine both sides.  Note that when varRelid is nonzero, Vars of other
          * relations will be treated as pseudoconstants.
          */
         examine_variable(root, left, varRelid, vardata);
@@ -4206,7 +4313,7 @@ get_restriction_variable(PlannerInfo *root, List *args, int varRelid,
                 return true;
         }
  
-       /* Ooops, clause has wrong structure (probably var op var) */
+       /* Oops, clause has wrong structure (probably var op var) */
         ReleaseVariableStats(*vardata);
         ReleaseVariableStats(rdata);
  
@@ -4325,7 +4432,7 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
  
         /*
          * Okay, it's a more complicated expression.  Determine variable
-        * membership.  Note that when varRelid isn't zero, only vars of that
+        * membership.  Note that when varRelid isn't zero, only vars of that
          * relation are considered "real" vars.
          */
         varnos = pull_varnos(basenode);
@@ -4374,13 +4481,13 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
         if (onerel)
         {
                 /*
-                * We have an expression in vars of a single relation.  Try to match
+                * We have an expression in vars of a single relation.  Try to match
                  * it to expressional index columns, in hopes of finding some
                  * statistics.
                  *
                  * XXX it's conceivable that there are multiple matches with different
                  * index opfamilies; if so, we need to pick one that matches the
-                * operator we are estimating for.      FIXME later.
+                * operator we are estimating for.  FIXME later.
                  */
                 ListCell   *ilist;
  
@@ -4582,7 +4689,7 @@ examine_simple_variable(PlannerInfo *root, Var *var,
                  *
                  * This is probably a harsher restriction than necessary; it's
                  * certainly OK for the selectivity estimator (which is a C function,
-                * and therefore omnipotent anyway) to look at the statistics.  But
+                * and therefore omnipotent anyway) to look at the statistics.  But
                  * many selectivity estimators will happily *invoke the operator
                  * function* to try to work out a good estimate - and that's not OK.
                  * So for now, don't dig down for stats.
@@ -4623,21 +4730,23 @@ examine_simple_variable(PlannerInfo *root, Var *var,
   * *isdefault: set to TRUE if the result is a default rather than based on
   * anything meaningful.
   *
- * NB: be careful to produce an integral result, since callers may compare
- * the result to exact integer counts.
+ * NB: be careful to produce a positive integral result, since callers may
+ * compare the result to exact integer counts, or might divide by it.
   */
  double
  get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
  {
         double          stadistinct;
+       double          stanullfrac = 0.0;
         double          ntuples;
  
         *isdefault = false;
  
         /*
-        * Determine the stadistinct value to use.      There are cases where we can
+        * Determine the stadistinct value to use.  There are cases where we can
          * get an estimate even without a pg_statistic entry, or can get a better
-        * value than is in pg_statistic.
+        * value than is in pg_statistic.  Grab stanullfrac too if we can find it
+        * (otherwise, assume no nulls, for lack of any better idea).
          */
         if (HeapTupleIsValid(vardata->statsTuple))
         {
@@ -4646,6 +4755,7 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
  
                 stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
                 stadistinct = stats->stadistinct;
+               stanullfrac = stats->stanullfrac;
         }
         else if (vardata->vartype == BOOLOID)
         {
@@ -4669,7 +4779,7 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
                         {
                                 case ObjectIdAttributeNumber:
                                 case SelfItemPointerAttributeNumber:
-                                       stadistinct = -1.0; /* unique */
+                                       stadistinct = -1.0; /* unique (and all non null) */
                                         break;
                                 case TableOidAttributeNumber:
                                         stadistinct = 1.0;      /* only 1 value */
@@ -4691,16 +4801,17 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
          * If there is a unique index or DISTINCT clause for the variable, assume
          * it is unique no matter what pg_statistic says; the statistics could be
          * out of date, or we might have found a partial unique index that proves
-        * the var is unique for this query.
+        * the var is unique for this query.  However, we'd better still believe
+        * the null-fraction statistic.
          */
         if (vardata->isunique)
-               stadistinct = -1.0;
+               stadistinct = -1.0 * (1.0 - stanullfrac);
  
         /*
          * If we had an absolute estimate, use that.
          */
         if (stadistinct > 0.0)
-               return stadistinct;
+               return clamp_row_est(stadistinct);
  
         /*
          * Otherwise we need to get the relation size; punt if not available.
@@ -4721,7 +4832,7 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
          * If we had a relative estimate, use that.
          */
         if (stadistinct < 0.0)
-               return floor((-stadistinct * ntuples) + 0.5);
+               return clamp_row_est(-stadistinct * ntuples);
  
         /*
          * With no data, estimate ndistinct = ntuples if the table is small, else
@@ -4729,7 +4840,7 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
          * that the behavior isn't discontinuous.
          */
         if (ntuples < DEFAULT_NUM_DISTINCT)
-               return ntuples;
+               return clamp_row_est(ntuples);
  
         *isdefault = true;
         return DEFAULT_NUM_DISTINCT;
@@ -4759,7 +4870,7 @@ get_variable_range(PlannerInfo *root, VariableStatData *vardata, Oid sortop,
  
         /*
          * XXX It's very tempting to try to use the actual column min and max, if
-        * we can get them relatively-cheaply with an index probe.      However, since
+        * we can get them relatively-cheaply with an index probe.  However, since
          * this function is called many times during join planning, that could
          * have unpleasant effects on planning speed.  Need more investigation
          * before enabling this.
@@ -4962,6 +5073,7 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
                         HeapTuple       tup;
                         Datum           values[INDEX_MAX_KEYS];
                         bool            isnull[INDEX_MAX_KEYS];
+                       SnapshotData SnapshotDirty;
  
                         estate = CreateExecutorState();
                         econtext = GetPerTupleExprContext(estate);
@@ -4984,6 +5096,7 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
                         slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRel));
                         econtext->ecxt_scantuple = slot;
                         get_typlenbyval(vardata->atttype, &typLen, &typByVal);
+                       InitDirtySnapshot(SnapshotDirty);
  
                         /* set up an IS NOT NULL scan key so that we ignore nulls */
                         ScanKeyEntryInitialize(&scankeys[0],
@@ -5000,8 +5113,23 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
                         /* If min is requested ... */
                         if (min)
                         {
-                               index_scan = index_beginscan(heapRel, indexRel,
-                                                                                        GetActiveSnapshot(), 1, 0);
+                               /*
+                                * In principle, we should scan the index with our current
+                                * active snapshot, which is the best approximation we've got
+                                * to what the query will see when executed.  But that won't
+                                * be exact if a new snap is taken before running the query,
+                                * and it can be very expensive if a lot of uncommitted rows
+                                * exist at the end of the index (because we'll laboriously
+                                * fetch each one and reject it).  What seems like a good
+                                * compromise is to use SnapshotDirty.  That will accept
+                                * uncommitted rows, and thus avoid fetching multiple heap
+                                * tuples in this scenario.  On the other hand, it will reject
+                                * known-dead rows, and thus not give a bogus answer when the
+                                * extreme value has been deleted; that case motivates not
+                                * using SnapshotAny here.
+                                */
+                               index_scan = index_beginscan(heapRel, indexRel, &SnapshotDirty,
+                                                                                        1, 0);
                                 index_rescan(index_scan, scankeys, 1, NULL, 0);
  
                                 /* Fetch first tuple in sortop's direction */
@@ -5032,8 +5160,8 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
                         /* If max is requested, and we didn't find the index is empty */
                         if (max && have_data)
                         {
-                               index_scan = index_beginscan(heapRel, indexRel,
-                                                                                        GetActiveSnapshot(), 1, 0);
+                               index_scan = index_beginscan(heapRel, indexRel, &SnapshotDirty,
+                                                                                        1, 0);
                                 index_rescan(index_scan, scankeys, 1, NULL, 0);
  
                                 /* Fetch first tuple in reverse direction */
@@ -5132,7 +5260,7 @@ find_join_input_rel(PlannerInfo *root, Relids relids)
   * Check whether char is a letter (and, hence, subject to case-folding)
   *
   * In multibyte character sets, we can't use isalpha, and it does not seem
- * worth trying to convert to wchar_t to use iswalpha. Instead, just assume
+ * worth trying to convert to wchar_t to use iswalpha.  Instead, just assume
   * any multibyte char is potentially case-varying.
   */
  static int
@@ -5215,13 +5343,12 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
         }
         else
         {
-               bytea      *bstr = DatumGetByteaP(patt_const->constvalue);
+               bytea      *bstr = DatumGetByteaPP(patt_const->constvalue);
  
-               pattlen = VARSIZE(bstr) - VARHDRSZ;
+               pattlen = VARSIZE_ANY_EXHDR(bstr);
                 patt = (char *) palloc(pattlen);
-               memcpy(patt, VARDATA(bstr), pattlen);
-               if ((Pointer) bstr != DatumGetPointer(patt_const->constvalue))
-                       pfree(bstr);
+               memcpy(patt, VARDATA_ANY(bstr), pattlen);
+               Assert((Pointer) bstr == DatumGetPointer(patt_const->constvalue));
         }
  
         match = palloc(pattlen + 1);
@@ -5384,7 +5511,7 @@ pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation,
   * together with info about MCVs and NULLs.
   *
   * We use the >= and < operators from the specified btree opfamily to do the
- * estimation. The given variable and Const must be of the associated
+ * estimation.  The given variable and Const must be of the associated
   * datatype.
   *
   * XXX Note: we make use of the upper bound to estimate operator selectivity
@@ -5443,7 +5570,7 @@ prefix_selectivity(PlannerInfo *root, VariableStatData *vardata,
  
                 /*
                  * Merge the two selectivities in the same way as for a range query
-                * (see clauselist_selectivity()).      Note that we don't need to worry
+                * (see clauselist_selectivity()).  Note that we don't need to worry
                  * about double-exclusion of nulls, since ineq_histogram_selectivity
                  * doesn't count those anyway.
                  */
@@ -5680,7 +5807,7 @@ byte_increment(unsigned char *ptr, int len)
   * that is not a bulletproof guarantee that an extension of the string might
   * not sort after it; an example is that "foo " is less than "foo!", but it
   * is not clear that a "dictionary" sort ordering will consider "foo!" less
- * than "foo bar".     CAUTION: Therefore, this function should be used only for
+ * than "foo bar".  CAUTION: Therefore, this function should be used only for
   * estimation purposes when working in a non-C collation.
   *
   * To try to catch most cases where an extended string might otherwise sort
@@ -5731,13 +5858,12 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
         }
         else if (datatype == BYTEAOID)
         {
-               bytea      *bstr = DatumGetByteaP(str_const->constvalue);
+               bytea      *bstr = DatumGetByteaPP(str_const->constvalue);
  
-               len = VARSIZE(bstr) - VARHDRSZ;
+               len = VARSIZE_ANY_EXHDR(bstr);
                 workstr = (char *) palloc(len);
-               memcpy(workstr, VARDATA(bstr), len);
-               if ((Pointer) bstr != DatumGetPointer(str_const->constvalue))
-                       pfree(bstr);
+               memcpy(workstr, VARDATA_ANY(bstr), len);
+               Assert((Pointer) bstr == DatumGetPointer(str_const->constvalue));
                 cmpstr = str_const->constvalue;
         }
         else
@@ -5933,38 +6059,160 @@ string_to_bytea_const(const char *str, size_t str_len)
   *-------------------------------------------------------------------------
   */
  
+List *
+deconstruct_indexquals(IndexPath *path)
+{
+       List       *result = NIL;
+       IndexOptInfo *index = path->indexinfo;
+       ListCell   *lcc,
+                          *lci;
+
+       forboth(lcc, path->indexquals, lci, path->indexqualcols)
+       {
+               RestrictInfo *rinfo = castNode(RestrictInfo, lfirst(lcc));
+               int                     indexcol = lfirst_int(lci);
+               Expr       *clause;
+               Node       *leftop,
+                                  *rightop;
+               IndexQualInfo *qinfo;
+
+               clause = rinfo->clause;
+
+               qinfo = (IndexQualInfo *) palloc(sizeof(IndexQualInfo));
+               qinfo->rinfo = rinfo;
+               qinfo->indexcol = indexcol;
+
+               if (IsA(clause, OpExpr))
+               {
+                       qinfo->clause_op = ((OpExpr *) clause)->opno;
+                       leftop = get_leftop(clause);
+                       rightop = get_rightop(clause);
+                       if (match_index_to_operand(leftop, indexcol, index))
+                       {
+                               qinfo->varonleft = true;
+                               qinfo->other_operand = rightop;
+                       }
+                       else
+                       {
+                               Assert(match_index_to_operand(rightop, indexcol, index));
+                               qinfo->varonleft = false;
+                               qinfo->other_operand = leftop;
+                       }
+               }
+               else if (IsA(clause, RowCompareExpr))
+               {
+                       RowCompareExpr *rc = (RowCompareExpr *) clause;
+
+                       qinfo->clause_op = linitial_oid(rc->opnos);
+                       /* Examine only first columns to determine left/right sides */
+                       if (match_index_to_operand((Node *) linitial(rc->largs),
+                                                                          indexcol, index))
+                       {
+                               qinfo->varonleft = true;
+                               qinfo->other_operand = (Node *) rc->rargs;
+                       }
+                       else
+                       {
+                               Assert(match_index_to_operand((Node *) linitial(rc->rargs),
+                                                                                         indexcol, index));
+                               qinfo->varonleft = false;
+                               qinfo->other_operand = (Node *) rc->largs;
+                       }
+               }
+               else if (IsA(clause, ScalarArrayOpExpr))
+               {
+                       ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause;
+
+                       qinfo->clause_op = saop->opno;
+                       /* index column is always on the left in this case */
+                       Assert(match_index_to_operand((Node *) linitial(saop->args),
+                                                                                 indexcol, index));
+                       qinfo->varonleft = true;
+                       qinfo->other_operand = (Node *) lsecond(saop->args);
+               }
+               else if (IsA(clause, NullTest))
+               {
+                       qinfo->clause_op = InvalidOid;
+                       Assert(match_index_to_operand((Node *) ((NullTest *) clause)->arg,
+                                                                                 indexcol, index));
+                       qinfo->varonleft = true;
+                       qinfo->other_operand = NULL;
+               }
+               else
+               {
+                       elog(ERROR, "unsupported indexqual type: %d",
+                                (int) nodeTag(clause));
+               }
+
+               result = lappend(result, qinfo);
+       }
+       return result;
+}
+
+/*
+ * Simple function to compute the total eval cost of the "other operands"
+ * in an IndexQualInfo list.  Since we know these will be evaluated just
+ * once per scan, there's no need to distinguish startup from per-row cost.
+ */
+static Cost
+other_operands_eval_cost(PlannerInfo *root, List *qinfos)
+{
+       Cost            qual_arg_cost = 0;
+       ListCell   *lc;
+
+       foreach(lc, qinfos)
+       {
+               IndexQualInfo *qinfo = (IndexQualInfo *) lfirst(lc);
+               QualCost        index_qual_cost;
+
+               cost_qual_eval_node(&index_qual_cost, qinfo->other_operand, root);
+               qual_arg_cost += index_qual_cost.startup + index_qual_cost.per_tuple;
+       }
+       return qual_arg_cost;
+}
+
  /*
- * genericcostestimate is a general-purpose estimator that can be used for
- * most index types.  In some cases we use genericcostestimate as the base
- * code and then incorporate additional index-type-specific knowledge in
- * the type-specific calling function. To avoid code duplication, we make
- * genericcostestimate return a number of intermediate values as well as
- * its preliminary estimates of the output cost values.  The GenericCosts
- * struct includes all these values.
+ * Get other-operand eval cost for an index orderby list.
   *
- * Callers should initialize all fields of GenericCosts to zero.  In addition,
- * they can set numIndexTuples to some positive value if they have a better
- * than default way of estimating the number of leaf index tuples visited.
+ * Index orderby expressions aren't represented as RestrictInfos (since they
+ * aren't boolean, usually).  So we can't apply deconstruct_indexquals to
+ * them.  However, they are much simpler to deal with since they are always
+ * OpExprs and the index column is always on the left.
   */
-typedef struct
+static Cost
+orderby_operands_eval_cost(PlannerInfo *root, IndexPath *path)
  {
-       /* These are the values the cost estimator must return to the planner */
-       Cost            indexStartupCost;               /* index-related startup cost */
-       Cost            indexTotalCost; /* total index-related scan cost */
-       Selectivity indexSelectivity;           /* selectivity of index */
-       double          indexCorrelation;               /* order correlation of index */
-
-       /* Intermediate values we obtain along the way */
-       double          numIndexPages;  /* number of leaf pages visited */
-       double          numIndexTuples; /* number of leaf tuples visited */
-       double          spc_random_page_cost;   /* relevant random_page_cost value */
-       double          num_sa_scans;   /* # indexscans from ScalarArrayOps */
-} GenericCosts;
+       Cost            qual_arg_cost = 0;
+       ListCell   *lc;
  
-static void
+       foreach(lc, path->indexorderbys)
+       {
+               Expr       *clause = (Expr *) lfirst(lc);
+               Node       *other_operand;
+               QualCost        index_qual_cost;
+
+               if (IsA(clause, OpExpr))
+               {
+                       other_operand = get_rightop(clause);
+               }
+               else
+               {
+                       elog(ERROR, "unsupported indexorderby type: %d",
+                                (int) nodeTag(clause));
+                       other_operand = NULL;           /* keep compiler quiet */
+               }
+
+               cost_qual_eval_node(&index_qual_cost, other_operand, root);
+               qual_arg_cost += index_qual_cost.startup + index_qual_cost.per_tuple;
+       }
+       return qual_arg_cost;
+}
+
+void
  genericcostestimate(PlannerInfo *root,
                                         IndexPath *path,
                                         double loop_count,
+                                       List *qinfos,
                                         GenericCosts *costs)
  {
         IndexOptInfo *index = path->indexinfo;
@@ -5980,7 +6228,6 @@ genericcostestimate(PlannerInfo *root,
         double          num_sa_scans;
         double          num_outer_scans;
         double          num_scans;
-       QualCost        index_qual_cost;
         double          qual_op_cost;
         double          qual_arg_cost;
         List       *selectivityQuals;
@@ -6057,7 +6304,7 @@ genericcostestimate(PlannerInfo *root,
          *
          * In practice access to upper index levels is often nearly free because
          * those tend to stay in cache under load; moreover, the cost involved is
-        * highly dependent on index type.      We therefore ignore such costs here
+        * highly dependent on index type.  We therefore ignore such costs here
          * and leave it to the caller to add a suitable charge if needed.
          */
         if (index->pages > 1 && index->tuples > 1)
@@ -6065,7 +6312,7 @@ genericcostestimate(PlannerInfo *root,
         else
                 numIndexPages = 1.0;
  
-       /* fetch estimated page cost for schema containing index */
+       /* fetch estimated page cost for tablespace containing index */
         get_tablespace_page_costs(index->reltablespace,
                                                           &spc_random_page_cost,
                                                           NULL);
@@ -6076,9 +6323,9 @@ genericcostestimate(PlannerInfo *root,
          * The above calculations are all per-index-scan.  However, if we are in a
          * nestloop inner scan, we can expect the scan to be repeated (with
          * different search keys) for each row of the outer relation.  Likewise,
-        * ScalarArrayOpExpr quals result in multiple index scans.      This creates
+        * ScalarArrayOpExpr quals result in multiple index scans.  This creates
          * the potential for cache effects to reduce the number of disk page
-        * fetches needed.      We want to estimate the average per-scan I/O cost in
+        * fetches needed.  We want to estimate the average per-scan I/O cost in
          * the presence of caching.
          *
          * We use the Mackert-Lohman formula (see costsize.c for details) to
@@ -6125,7 +6372,7 @@ genericcostestimate(PlannerInfo *root,
          * evaluated once at the start of the scan to reduce them to runtime keys
          * to pass to the index AM (see nodeIndexscan.c).  We model the per-tuple
          * CPU costs as cpu_index_tuple_cost plus one cpu_operator_cost per
-        * indexqual operator.  Because we have numIndexTuples as a per-scan
+        * indexqual operator.  Because we have numIndexTuples as a per-scan
          * number, we have to multiply by num_sa_scans to get the correct result
          * for ScalarArrayOpExpr cases.  Similarly add in costs for any index
          * ORDER BY expressions.
@@ -6134,15 +6381,10 @@ genericcostestimate(PlannerInfo *root,
          * Detecting that that might be needed seems more expensive than it's
          * worth, though, considering all the other inaccuracies here ...
          */
-       cost_qual_eval(&index_qual_cost, indexQuals, root);
-       qual_arg_cost = index_qual_cost.startup + index_qual_cost.per_tuple;
-       cost_qual_eval(&index_qual_cost, indexOrderBys, root);
-       qual_arg_cost += index_qual_cost.startup + index_qual_cost.per_tuple;
+       qual_arg_cost = other_operands_eval_cost(root, qinfos) +
+               orderby_operands_eval_cost(root, path);
         qual_op_cost = cpu_operator_cost *
                 (list_length(indexQuals) + list_length(indexOrderBys));
-       qual_arg_cost -= qual_op_cost;
-       if (qual_arg_cost < 0)          /* just in case... */
-               qual_arg_cost = 0;
  
         indexStartupCost = qual_arg_cost;
         indexTotalCost += qual_arg_cost;
@@ -6172,16 +6414,16 @@ genericcostestimate(PlannerInfo *root,
   * ANDing the index predicate with the explicitly given indexquals produces
   * a more accurate idea of the index's selectivity.  However, we need to be
   * careful not to insert redundant clauses, because clauselist_selectivity()
- * is easily fooled into computing a too-low selectivity estimate.     Our
+ * is easily fooled into computing a too-low selectivity estimate.  Our
   * approach is to add only the predicate clause(s) that cannot be proven to
- * be implied by the given indexquals. This successfully handles cases such
+ * be implied by the given indexquals.  This successfully handles cases such
   * as a qual "x = 42" used with a partial index "WHERE x >= 40 AND x < 50".
   * There are many other cases where we won't detect redundancy, leading to a
   * too-low selectivity estimate, which will bias the system in favor of using
- * partial indexes where possible.     That is not necessarily bad though.
+ * partial indexes where possible.  That is not necessarily bad though.
   *
   * Note that indexQuals contains RestrictInfo nodes while the indpred
- * does not, so the output list will be mixed. This is OK for both
+ * does not, so the output list will be mixed.  This is OK for both
   * predicate_implied_by() and clauselist_selectivity(), but might be
   * problematic if the result were passed to other things.
   */
@@ -6207,17 +6449,14 @@ add_predicate_to_quals(IndexOptInfo *index, List *indexQuals)
  }
  
  
-Datum
-btcostestimate(PG_FUNCTION_ARGS)
+void
+btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
+                          Cost *indexStartupCost, Cost *indexTotalCost,
+                          Selectivity *indexSelectivity, double *indexCorrelation,
+                          double *indexPages)
  {
-       PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
-       IndexPath  *path = (IndexPath *) PG_GETARG_POINTER(1);
-       double          loop_count = PG_GETARG_FLOAT8(2);
-       Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
-       Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
-       Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
-       double     *indexCorrelation = (double *) PG_GETARG_POINTER(6);
         IndexOptInfo *index = path->indexinfo;
+       List       *qinfos;
         GenericCosts costs;
         Oid                     relid;
         AttrNumber      colnum;
@@ -6230,8 +6469,10 @@ btcostestimate(PG_FUNCTION_ARGS)
         bool            found_saop;
         bool            found_is_null_op;
         double          num_sa_scans;
-       ListCell   *lcc,
-                          *lci;
+       ListCell   *lc;
+
+       /* Do preliminary analysis of indexquals */
+       qinfos = deconstruct_indexquals(path);
  
         /*
          * For a btree scan, only leading '=' quals plus inequality quals for the
@@ -6240,7 +6481,7 @@ btcostestimate(PG_FUNCTION_ARGS)
          * the index scan).  Additional quals can suppress visits to the heap, so
          * it's OK to count them in indexSelectivity, but they should not count
          * for estimating numIndexTuples.  So we must examine the given indexquals
-        * to find out which ones count as boundary quals.      We rely on the
+        * to find out which ones count as boundary quals.  We rely on the
          * knowledge that they are given in index column order.
          *
          * For a RowCompareExpr, we consider only the first column, just as
@@ -6256,83 +6497,52 @@ btcostestimate(PG_FUNCTION_ARGS)
         found_saop = false;
         found_is_null_op = false;
         num_sa_scans = 1;
-       forboth(lcc, path->indexquals, lci, path->indexqualcols)
+       foreach(lc, qinfos)
         {
-               RestrictInfo *rinfo = (RestrictInfo *) lfirst(lcc);
-               Expr       *clause;
-               Node       *leftop,
-                                  *rightop PG_USED_FOR_ASSERTS_ONLY;
+               IndexQualInfo *qinfo = (IndexQualInfo *) lfirst(lc);
+               RestrictInfo *rinfo = qinfo->rinfo;
+               Expr       *clause = rinfo->clause;
                 Oid                     clause_op;
                 int                     op_strategy;
-               bool            is_null_op = false;
  
-               if (indexcol != lfirst_int(lci))
+               if (indexcol != qinfo->indexcol)
                 {
                         /* Beginning of a new column's quals */
                         if (!eqQualHere)
                                 break;                  /* done if no '=' qual for indexcol */
                         eqQualHere = false;
                         indexcol++;
-                       if (indexcol != lfirst_int(lci))
+                       if (indexcol != qinfo->indexcol)
                                 break;                  /* no quals at all for indexcol */
                 }
  
-               Assert(IsA(rinfo, RestrictInfo));
-               clause = rinfo->clause;
-
-               if (IsA(clause, OpExpr))
-               {
-                       leftop = get_leftop(clause);
-                       rightop = get_rightop(clause);
-                       clause_op = ((OpExpr *) clause)->opno;
-               }
-               else if (IsA(clause, RowCompareExpr))
-               {
-                       RowCompareExpr *rc = (RowCompareExpr *) clause;
-
-                       leftop = (Node *) linitial(rc->largs);
-                       rightop = (Node *) linitial(rc->rargs);
-                       clause_op = linitial_oid(rc->opnos);
-               }
-               else if (IsA(clause, ScalarArrayOpExpr))
+               if (IsA(clause, ScalarArrayOpExpr))
                 {
-                       ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause;
+                       int                     alength = estimate_array_length(qinfo->other_operand);
  
-                       leftop = (Node *) linitial(saop->args);
-                       rightop = (Node *) lsecond(saop->args);
-                       clause_op = saop->opno;
                         found_saop = true;
+                       /* count up number of SA scans induced by indexBoundQuals only */
+                       if (alength > 1)
+                               num_sa_scans *= alength;
                 }
                 else if (IsA(clause, NullTest))
                 {
                         NullTest   *nt = (NullTest *) clause;
  
-                       leftop = (Node *) nt->arg;
-                       rightop = NULL;
-                       clause_op = InvalidOid;
                         if (nt->nulltesttype == IS_NULL)
                         {
                                 found_is_null_op = true;
-                               is_null_op = true;
+                               /* IS NULL is like = for selectivity determination purposes */
+                               eqQualHere = true;
                         }
                 }
-               else
-               {
-                       elog(ERROR, "unsupported indexqual type: %d",
-                                (int) nodeTag(clause));
-                       continue;                       /* keep compiler quiet */
-               }
  
-               if (match_index_to_operand(leftop, indexcol, index))
-               {
-                       /* clause_op is correct */
-               }
-               else
-               {
-                       Assert(match_index_to_operand(rightop, indexcol, index));
-                       /* Must flip operator to get the opfamily member */
-                       clause_op = get_commutator(clause_op);
-               }
+               /*
+                * We would need to commute the clause_op if not varonleft, except
+                * that we only care if it's equality or not, so that refinement is
+                * unnecessary.
+                */
+               clause_op = qinfo->clause_op;
  
                 /* check for equality operator */
                 if (OidIsValid(clause_op))
@@ -6343,20 +6553,7 @@ btcostestimate(PG_FUNCTION_ARGS)
                         if (op_strategy == BTEqualStrategyNumber)
                                 eqQualHere = true;
                 }
-               else if (is_null_op)
-               {
-                       /* IS NULL is like = for purposes of selectivity determination */
-                       eqQualHere = true;
-               }
-               /* count up number of SA scans induced by indexBoundQuals only */
-               if (IsA(clause, ScalarArrayOpExpr))
-               {
-                       ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause;
-                       int                     alength = estimate_array_length(lsecond(saop->args));
  
-                       if (alength > 1)
-                               num_sa_scans *= alength;
-               }
                 indexBoundQuals = lappend(indexBoundQuals, rinfo);
         }
  
@@ -6404,7 +6601,7 @@ btcostestimate(PG_FUNCTION_ARGS)
         MemSet(&costs, 0, sizeof(costs));
         costs.numIndexTuples = numIndexTuples;
  
-       genericcostestimate(root, path, loop_count, &costs);
+       genericcostestimate(root, path, loop_count, qinfos, &costs);
  
         /*
          * Add a CPU-cost component to represent the costs of initial btree
@@ -6546,25 +6743,24 @@ btcostestimate(PG_FUNCTION_ARGS)
         *indexTotalCost = costs.indexTotalCost;
         *indexSelectivity = costs.indexSelectivity;
         *indexCorrelation = costs.indexCorrelation;
-
-       PG_RETURN_VOID();
+       *indexPages = costs.numIndexPages;
  }
  
-Datum
-hashcostestimate(PG_FUNCTION_ARGS)
+void
+hashcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
+                                Cost *indexStartupCost, Cost *indexTotalCost,
+                                Selectivity *indexSelectivity, double *indexCorrelation,
+                                double *indexPages)
  {
-       PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
-       IndexPath  *path = (IndexPath *) PG_GETARG_POINTER(1);
-       double          loop_count = PG_GETARG_FLOAT8(2);
-       Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
-       Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
-       Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
-       double     *indexCorrelation = (double *) PG_GETARG_POINTER(6);
+       List       *qinfos;
         GenericCosts costs;
  
+       /* Do preliminary analysis of indexquals */
+       qinfos = deconstruct_indexquals(path);
+
         MemSet(&costs, 0, sizeof(costs));
  
-       genericcostestimate(root, path, loop_count, &costs);
+       genericcostestimate(root, path, loop_count, qinfos, &costs);
  
         /*
          * A hash index has no descent costs as such, since the index AM can go
@@ -6579,7 +6775,7 @@ hashcostestimate(PG_FUNCTION_ARGS)
          * because the hash AM makes sure that's always one page.
          *
          * Likewise, we could consider charging some CPU for each index tuple in
-        * the bucket, if we knew how many there were.  But the per-tuple cost is
+        * the bucket, if we knew how many there were.  But the per-tuple cost is
          * just a hash value comparison, not a general datatype-dependent
          * comparison, so any such charge ought to be quite a bit less than
          * cpu_operator_cost; which makes it probably not worth worrying about.
@@ -6595,27 +6791,26 @@ hashcostestimate(PG_FUNCTION_ARGS)
         *indexTotalCost = costs.indexTotalCost;
         *indexSelectivity = costs.indexSelectivity;
         *indexCorrelation = costs.indexCorrelation;
-
-       PG_RETURN_VOID();
+       *indexPages = costs.numIndexPages;
  }
  
-Datum
-gistcostestimate(PG_FUNCTION_ARGS)
+void
+gistcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
+                                Cost *indexStartupCost, Cost *indexTotalCost,
+                                Selectivity *indexSelectivity, double *indexCorrelation,
+                                double *indexPages)
  {
-       PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
-       IndexPath  *path = (IndexPath *) PG_GETARG_POINTER(1);
-       double          loop_count = PG_GETARG_FLOAT8(2);
-       Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
-       Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
-       Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
-       double     *indexCorrelation = (double *) PG_GETARG_POINTER(6);
         IndexOptInfo *index = path->indexinfo;
+       List       *qinfos;
         GenericCosts costs;
         Cost            descentCost;
  
+       /* Do preliminary analysis of indexquals */
+       qinfos = deconstruct_indexquals(path);
+
         MemSet(&costs, 0, sizeof(costs));
  
-       genericcostestimate(root, path, loop_count, &costs);
+       genericcostestimate(root, path, loop_count, qinfos, &costs);
  
         /*
          * We model index descent costs similarly to those for btree, but to do
@@ -6637,7 +6832,7 @@ gistcostestimate(PG_FUNCTION_ARGS)
         /*
          * Add a CPU-cost component to represent the costs of initial descent. We
          * just use log(N) here not log2(N) since the branching factor isn't
-        * necessarily two anyway.      As for btree, charge once per SA scan.
+        * necessarily two anyway.  As for btree, charge once per SA scan.
          */
         if (index->tuples > 1)          /* avoid computing log(0) */
         {
@@ -6657,27 +6852,26 @@ gistcostestimate(PG_FUNCTION_ARGS)
         *indexTotalCost = costs.indexTotalCost;
         *indexSelectivity = costs.indexSelectivity;
         *indexCorrelation = costs.indexCorrelation;
-
-       PG_RETURN_VOID();
+       *indexPages = costs.numIndexPages;
  }
  
-Datum
-spgcostestimate(PG_FUNCTION_ARGS)
+void
+spgcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
+                               Cost *indexStartupCost, Cost *indexTotalCost,
+                               Selectivity *indexSelectivity, double *indexCorrelation,
+                               double *indexPages)
  {
-       PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
-       IndexPath  *path = (IndexPath *) PG_GETARG_POINTER(1);
-       double          loop_count = PG_GETARG_FLOAT8(2);
-       Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
-       Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
-       Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
-       double     *indexCorrelation = (double *) PG_GETARG_POINTER(6);
         IndexOptInfo *index = path->indexinfo;
+       List       *qinfos;
         GenericCosts costs;
         Cost            descentCost;
  
+       /* Do preliminary analysis of indexquals */
+       qinfos = deconstruct_indexquals(path);
+
         MemSet(&costs, 0, sizeof(costs));
  
-       genericcostestimate(root, path, loop_count, &costs);
+       genericcostestimate(root, path, loop_count, qinfos, &costs);
  
         /*
          * We model index descent costs similarly to those for btree, but to do
@@ -6699,7 +6893,7 @@ spgcostestimate(PG_FUNCTION_ARGS)
         /*
          * Add a CPU-cost component to represent the costs of initial descent. We
          * just use log(N) here not log2(N) since the branching factor isn't
-        * necessarily two anyway.      As for btree, charge once per SA scan.
+        * necessarily two anyway.  As for btree, charge once per SA scan.
          */
         if (index->tuples > 1)          /* avoid computing log(0) */
         {
@@ -6719,8 +6913,7 @@ spgcostestimate(PG_FUNCTION_ARGS)
         *indexTotalCost = costs.indexTotalCost;
         *indexSelectivity = costs.indexSelectivity;
         *indexCorrelation = costs.indexCorrelation;
-
-       PG_RETURN_VOID();
+       *indexPages = costs.numIndexPages;
  }
  
  
@@ -6737,21 +6930,6 @@ typedef struct
         double          arrayScans;
  } GinQualCounts;
  
-/* Find the index column matching "op"; return its index, or -1 if no match */
-static int
-find_index_column(Node *op, IndexOptInfo *index)
-{
-       int                     i;
-
-       for (i = 0; i < index->ncolumns; i++)
-       {
-               if (match_index_to_operand(op, i, index))
-                       return i;
-       }
-
-       return -1;
-}
-
  /*
   * Estimate the number of index terms that need to be searched for while
   * testing the given GIN query, and increment the counts in *counts
@@ -6776,7 +6954,7 @@ gincost_pattern(IndexOptInfo *index, int indexcol,
  
         /*
          * Get the operator's strategy number and declared input data types within
-        * the index opfamily.  (We don't need the latter, but we use
+        * the index opfamily.  (We don't need the latter, but we use
          * get_op_opfamily_properties because it will throw error if it fails to
          * find a matching pg_amop entry.)
          */
@@ -6860,30 +7038,20 @@ gincost_pattern(IndexOptInfo *index, int indexcol,
   * appropriately.  If the query is unsatisfiable, return false.
   */
  static bool
-gincost_opexpr(PlannerInfo *root, IndexOptInfo *index, OpExpr *clause,
+gincost_opexpr(PlannerInfo *root,
+                          IndexOptInfo *index,
+                          IndexQualInfo *qinfo,
                            GinQualCounts *counts)
  {
-       Node       *leftop = get_leftop((Expr *) clause);
-       Node       *rightop = get_rightop((Expr *) clause);
-       Oid                     clause_op = clause->opno;
-       int                     indexcol;
-       Node       *operand;
+       int                     indexcol = qinfo->indexcol;
+       Oid                     clause_op = qinfo->clause_op;
+       Node       *operand = qinfo->other_operand;
  
-       /* Locate the operand being compared to the index column */
-       if ((indexcol = find_index_column(leftop, index)) >= 0)
-       {
-               operand = rightop;
-       }
-       else if ((indexcol = find_index_column(rightop, index)) >= 0)
+       if (!qinfo->varonleft)
         {
-               operand = leftop;
+               /* must commute the operator */
                 clause_op = get_commutator(clause_op);
         }
-       else
-       {
-               elog(ERROR, "could not match index to operand");
-               operand = NULL;                 /* keep compiler quiet */
-       }
  
         /* aggressively reduce to a constant, and look through relabeling */
         operand = estimate_expression_value(root, operand);
@@ -6922,19 +7090,19 @@ gincost_opexpr(PlannerInfo *root, IndexOptInfo *index, OpExpr *clause,
   * each of which involves one value from the RHS array, plus all the
   * non-array quals (if any).  To model this, we average the counts across
   * the RHS elements, and add the averages to the counts in *counts (which
- * correspond to per-indexscan costs). We also multiply counts->arrayScans
+ * correspond to per-indexscan costs).  We also multiply counts->arrayScans
   * by N, causing gincostestimate to scale up its estimates accordingly.
   */
  static bool
  gincost_scalararrayopexpr(PlannerInfo *root,
-                                                 IndexOptInfo *index, ScalarArrayOpExpr *clause,
+                                                 IndexOptInfo *index,
+                                                 IndexQualInfo *qinfo,
                                                   double numIndexEntries,
                                                   GinQualCounts *counts)
  {
-       Node       *leftop = (Node *) linitial(clause->args);
-       Node       *rightop = (Node *) lsecond(clause->args);
-       Oid                     clause_op = clause->opno;
-       int                     indexcol;
+       int                     indexcol = qinfo->indexcol;
+       Oid                     clause_op = qinfo->clause_op;
+       Node       *rightop = qinfo->other_operand;
         ArrayType  *arrayval;
         int16           elmlen;
         bool            elmbyval;
@@ -6946,11 +7114,7 @@ gincost_scalararrayopexpr(PlannerInfo *root,
         int                     numPossible = 0;
         int                     i;
  
-       Assert(clause->useOr);
-
-       /* index column must be on the left */
-       if ((indexcol = find_index_column(leftop, index)) < 0)
-               elog(ERROR, "could not match index to operand");
+       Assert(((ScalarArrayOpExpr *) qinfo->rinfo->clause)->useOr);
  
         /* aggressively reduce to a constant, and look through relabeling */
         rightop = estimate_expression_value(root, rightop);
@@ -7044,19 +7208,16 @@ gincost_scalararrayopexpr(PlannerInfo *root,
  /*
   * GIN has search behavior completely different from other index types
   */
-Datum
-gincostestimate(PG_FUNCTION_ARGS)
+void
+gincostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
+                               Cost *indexStartupCost, Cost *indexTotalCost,
+                               Selectivity *indexSelectivity, double *indexCorrelation,
+                               double *indexPages)
  {
-       PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
-       IndexPath  *path = (IndexPath *) PG_GETARG_POINTER(1);
-       double          loop_count = PG_GETARG_FLOAT8(2);
-       Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
-       Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
-       Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
-       double     *indexCorrelation = (double *) PG_GETARG_POINTER(6);
         IndexOptInfo *index = path->indexinfo;
         List       *indexQuals = path->indexquals;
         List       *indexOrderBys = path->indexorderbys;
+       List       *qinfos;
         ListCell   *l;
         List       *selectivityQuals;
         double          numPages = index->pages,
@@ -7067,6 +7228,7 @@ gincostestimate(PG_FUNCTION_ARGS)
                                 numEntries;
         GinQualCounts counts;
         bool            matchPossible;
+       double          partialScale;
         double          entryPagesFetched,
                                 dataPagesFetched,
                                 dataPagesFetchedBySel;
@@ -7074,44 +7236,81 @@ gincostestimate(PG_FUNCTION_ARGS)
                                 qual_arg_cost,
                                 spc_random_page_cost,
                                 outer_scans;
-       QualCost        index_qual_cost;
         Relation        indexRel;
         GinStatsData ginStats;
  
-       /*
-        * Obtain statistic information from the meta page
-        */
-       indexRel = index_open(index->indexoid, AccessShareLock);
-       ginGetStats(indexRel, &ginStats);
-       index_close(indexRel, AccessShareLock);
-
-       numEntryPages = ginStats.nEntryPages;
-       numDataPages = ginStats.nDataPages;
-       numPendingPages = ginStats.nPendingPages;
-       numEntries = ginStats.nEntries;
+       /* Do preliminary analysis of indexquals */
+       qinfos = deconstruct_indexquals(path);
  
         /*
-        * nPendingPages can be trusted, but the other fields are as of the last
-        * VACUUM.      Scale them by the ratio numPages / nTotalPages to account for
-        * growth since then.  If the fields are zero (implying no VACUUM at all,
-        * and an index created pre-9.1), assume all pages are entry pages.
+        * Obtain statistical information from the meta page, if possible.  Else
+        * set ginStats to zeroes, and we'll cope below.
          */
-       if (ginStats.nTotalPages == 0 || ginStats.nEntryPages == 0)
+       if (!index->hypothetical)
         {
-               numEntryPages = numPages;
-               numDataPages = 0;
-               numEntries = numTuples; /* bogus, but no other info available */
+               indexRel = index_open(index->indexoid, AccessShareLock);
+               ginGetStats(indexRel, &ginStats);
+               index_close(indexRel, AccessShareLock);
         }
         else
         {
+               memset(&ginStats, 0, sizeof(ginStats));
+       }
+
+       /*
+        * Assuming we got valid (nonzero) stats at all, nPendingPages can be
+        * trusted, but the other fields are data as of the last VACUUM.  We can
+        * scale them up to account for growth since then, but that method only
+        * goes so far; in the worst case, the stats might be for a completely
+        * empty index, and scaling them will produce pretty bogus numbers.
+        * Somewhat arbitrarily, set the cutoff for doing scaling at 4X growth; if
+        * it's grown more than that, fall back to estimating things only from the
+        * assumed-accurate index size.  But we'll trust nPendingPages in any case
+        * so long as it's not clearly insane, ie, more than the index size.
+        */
+       if (ginStats.nPendingPages < numPages)
+               numPendingPages = ginStats.nPendingPages;
+       else
+               numPendingPages = 0;
+
+       if (numPages > 0 && ginStats.nTotalPages <= numPages &&
+               ginStats.nTotalPages > numPages / 4 &&
+               ginStats.nEntryPages > 0 && ginStats.nEntries > 0)
+       {
+               /*
+                * OK, the stats seem close enough to sane to be trusted.  But we
+                * still need to scale them by the ratio numPages / nTotalPages to
+                * account for growth since the last VACUUM.
+                */
                 double          scale = numPages / ginStats.nTotalPages;
  
-               numEntryPages = ceil(numEntryPages * scale);
-               numDataPages = ceil(numDataPages * scale);
-               numEntries = ceil(numEntries * scale);
+               numEntryPages = ceil(ginStats.nEntryPages * scale);
+               numDataPages = ceil(ginStats.nDataPages * scale);
+               numEntries = ceil(ginStats.nEntries * scale);
                 /* ensure we didn't round up too much */
-               numEntryPages = Min(numEntryPages, numPages);
-               numDataPages = Min(numDataPages, numPages - numEntryPages);
+               numEntryPages = Min(numEntryPages, numPages - numPendingPages);
+               numDataPages = Min(numDataPages,
+                                                  numPages - numPendingPages - numEntryPages);
+       }
+       else
+       {
+               /*
+                * We might get here because it's a hypothetical index, or an index
+                * created pre-9.1 and never vacuumed since upgrading (in which case
+                * its stats would read as zeroes), or just because it's grown too
+                * much since the last VACUUM for us to put our faith in scaling.
+                *
+                * Invent some plausible internal statistics based on the index page
+                * count (and clamp that to at least 10 pages, just in case).  We
+                * estimate that 90% of the index is entry pages, and the rest is data
+                * pages.  Estimate 100 entries per entry page; this is rather bogus
+                * since it'll depend on the size of the keys, but it's more robust
+                * than trying to predict the number of entries per heap tuple.
+                */
+               numPages = Max(numPages, 10);
+               numEntryPages = floor((numPages - numPendingPages) * 0.90);
+               numDataPages = numPages - numPendingPages - numEntryPages;
+               numEntries = floor(numEntryPages * 100);
         }
  
         /* In an empty index, numEntries could be zero.  Avoid divide-by-zero */
@@ -7146,7 +7345,7 @@ gincostestimate(PG_FUNCTION_ARGS)
                                                                                            JOIN_INNER,
                                                                                            NULL);
  
-       /* fetch estimated page cost for schema containing index */
+       /* fetch estimated page cost for tablespace containing index */
         get_tablespace_page_costs(index->reltablespace,
                                                           &spc_random_page_cost,
                                                           NULL);
@@ -7163,18 +7362,16 @@ gincostestimate(PG_FUNCTION_ARGS)
         counts.arrayScans = 1;
         matchPossible = true;
  
-       foreach(l, indexQuals)
+       foreach(l, qinfos)
         {
-               RestrictInfo *rinfo = (RestrictInfo *) lfirst(l);
-               Expr       *clause;
+               IndexQualInfo *qinfo = (IndexQualInfo *) lfirst(l);
+               Expr       *clause = qinfo->rinfo->clause;
  
-               Assert(IsA(rinfo, RestrictInfo));
-               clause = rinfo->clause;
                 if (IsA(clause, OpExpr))
                 {
                         matchPossible = gincost_opexpr(root,
                                                                                    index,
-                                                                                  (OpExpr *) clause,
+                                                                                  qinfo,
                                                                                    &counts);
                         if (!matchPossible)
                                 break;
@@ -7183,7 +7380,7 @@ gincostestimate(PG_FUNCTION_ARGS)
                 {
                         matchPossible = gincost_scalararrayopexpr(root,
                                                                                                           index,
-                                                                                               (ScalarArrayOpExpr *) clause,
+                                                                                                         qinfo,
                                                                                                           numEntries,
                                                                                                           &counts);
                         if (!matchPossible)
@@ -7203,7 +7400,7 @@ gincostestimate(PG_FUNCTION_ARGS)
                 *indexStartupCost = 0;
                 *indexTotalCost = 0;
                 *indexSelectivity = 0;
-               PG_RETURN_VOID();
+               return;
         }
  
         if (counts.haveFullScan || indexQuals == NIL)
@@ -7237,17 +7434,22 @@ gincostestimate(PG_FUNCTION_ARGS)
  
         /*
          * Add an estimate of entry pages read by partial match algorithm. It's a
-        * scan over leaf pages in entry tree.  We haven't any useful stats here,
-        * so estimate it as proportion.
+        * scan over leaf pages in entry tree.  We haven't any useful stats here,
+        * so estimate it as proportion.  Because counts.partialEntries is really
+        * pretty bogus (see code above), it's possible that it is more than
+        * numEntries; clamp the proportion to ensure sanity.
          */
-       entryPagesFetched += ceil(numEntryPages * counts.partialEntries / numEntries);
+       partialScale = counts.partialEntries / numEntries;
+       partialScale = Min(partialScale, 1.0);
+
+       entryPagesFetched += ceil(numEntryPages * partialScale);
  
         /*
          * Partial match algorithm reads all data pages before doing actual scan,
-        * so it's a startup cost. Again, we haven't any useful stats here, so,
-        * estimate it as proportion
+        * so it's a startup cost.  Again, we haven't any useful stats here, so
+        * estimate it as proportion.
          */
-       dataPagesFetched = ceil(numDataPages * counts.partialEntries / numEntries);
+       dataPagesFetched = ceil(numDataPages * partialScale);
  
         /*
          * Calculate cache effects if more than one scan due to nestloops or array
@@ -7275,31 +7477,30 @@ gincostestimate(PG_FUNCTION_ARGS)
         *indexStartupCost = (entryPagesFetched + dataPagesFetched) * spc_random_page_cost;
  
         /*
-        * Now we compute the number of data pages fetched while the scan
-        * proceeds.
+        * Now compute the number of data pages fetched during the scan.
+        *
+        * We assume every entry to have the same number of items, and that there
+        * is no overlap between them. (XXX: tsvector and array opclasses collect
+        * statistics on the frequency of individual keys; it would be nice to use
+        * those here.)
          */
-
-       /* data pages scanned for each exact (non-partial) matched entry */
         dataPagesFetched = ceil(numDataPages * counts.exactEntries / numEntries);
  
         /*
-        * Estimate number of data pages read, using selectivity estimation and
-        * capacity of data page.
+        * If there is a lot of overlap among the entries, in particular if one of
+        * the entries is very frequent, the above calculation can grossly
+        * under-estimate.  As a simple cross-check, calculate a lower bound based
+        * on the overall selectivity of the quals.  At a minimum, we must read
+        * one item pointer for each matching entry.
+        *
+        * The width of each item pointer varies, based on the level of
+        * compression.  We don't have statistics on that, but an average of
+        * around 3 bytes per item is fairly typical.
          */
         dataPagesFetchedBySel = ceil(*indexSelectivity *
-                                                                (numTuples / (BLCKSZ / SizeOfIptrData)));
-
+                                                                (numTuples / (BLCKSZ / 3)));
         if (dataPagesFetchedBySel > dataPagesFetched)
-       {
-               /*
-                * At least one of entries is very frequent and, unfortunately, we
-                * couldn't get statistic about entries (only tsvector has such
-                * statistics). So, we obviously have too small estimation of pages
-                * fetched from data tree. Re-estimate it from known capacity of data
-                * pages
-                */
                 dataPagesFetched = dataPagesFetchedBySel;
-       }
  
         /* Account for cache effects, the same as above */
         if (outer_scans > 1 || counts.arrayScans > 1)
@@ -7318,19 +7519,77 @@ gincostestimate(PG_FUNCTION_ARGS)
         /*
          * Add on index qual eval costs, much as in genericcostestimate
          */
-       cost_qual_eval(&index_qual_cost, indexQuals, root);
-       qual_arg_cost = index_qual_cost.startup + index_qual_cost.per_tuple;
-       cost_qual_eval(&index_qual_cost, indexOrderBys, root);
-       qual_arg_cost += index_qual_cost.startup + index_qual_cost.per_tuple;
+       qual_arg_cost = other_operands_eval_cost(root, qinfos) +
+               orderby_operands_eval_cost(root, path);
+       qual_op_cost = cpu_operator_cost *
+               (list_length(indexQuals) + list_length(indexOrderBys));
+
+       *indexStartupCost += qual_arg_cost;
+       *indexTotalCost += qual_arg_cost;
+       *indexTotalCost += (numTuples * *indexSelectivity) * (cpu_index_tuple_cost + qual_op_cost);
+       *indexPages = dataPagesFetched;
+}
+
+/*
+ * BRIN has search behavior completely different from other index types
+ */
+void
+brincostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
+                                Cost *indexStartupCost, Cost *indexTotalCost,
+                                Selectivity *indexSelectivity, double *indexCorrelation,
+                                double *indexPages)
+{
+       IndexOptInfo *index = path->indexinfo;
+       List       *indexQuals = path->indexquals;
+       List       *indexOrderBys = path->indexorderbys;
+       double          numPages = index->pages;
+       double          numTuples = index->tuples;
+       List       *qinfos;
+       Cost            spc_seq_page_cost;
+       Cost            spc_random_page_cost;
+       double          qual_op_cost;
+       double          qual_arg_cost;
+
+       /* Do preliminary analysis of indexquals */
+       qinfos = deconstruct_indexquals(path);
+
+       /* fetch estimated page cost for tablespace containing index */
+       get_tablespace_page_costs(index->reltablespace,
+                                                         &spc_random_page_cost,
+                                                         &spc_seq_page_cost);
+
+       /*
+        * BRIN indexes are always read in full; use that as startup cost.
+        *
+        * XXX maybe only include revmap pages here?
+        */
+       *indexStartupCost = spc_seq_page_cost * numPages * loop_count;
+
+       /*
+        * To read a BRIN index there might be a bit of back and forth over
+        * regular pages, as revmap might point to them out of sequential order;
+        * calculate this as reading the whole index in random order.
+        */
+       *indexTotalCost = spc_random_page_cost * numPages * loop_count;
+
+       *indexSelectivity =
+               clauselist_selectivity(root, indexQuals,
+                                                          path->indexinfo->rel->relid,
+                                                          JOIN_INNER, NULL);
+       *indexCorrelation = 1;
+
+       /*
+        * Add on index qual eval costs, much as in genericcostestimate.
+        */
+       qual_arg_cost = other_operands_eval_cost(root, qinfos) +
+               orderby_operands_eval_cost(root, path);
         qual_op_cost = cpu_operator_cost *
                 (list_length(indexQuals) + list_length(indexOrderBys));
-       qual_arg_cost -= qual_op_cost;
-       if (qual_arg_cost < 0)          /* just in case... */
-               qual_arg_cost = 0;
  
         *indexStartupCost += qual_arg_cost;
         *indexTotalCost += qual_arg_cost;
         *indexTotalCost += (numTuples * *indexSelectivity) * (cpu_index_tuple_cost + qual_op_cost);
+       *indexPages = index->pages;
  
-       PG_RETURN_VOID();
+       /* XXX what about pages_per_range? */
  }