* Selectivity routines are registered in the pg_operator catalog
* in the "oprrest" and "oprjoin" attributes.
*
- * Index cost functions are registered in the pg_am catalog
- * in the "amcostestimate" attribute.
+ * Index cost functions are located via the index AM's API struct,
+ * which is obtained from the handler function registered in pg_am.
*
- * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* float8 oprjoin (internal, oid, internal, int2, internal);
*
* (Before Postgres 8.4, join estimators had only the first four of these
- * parameters. That signature is still allowed, but deprecated.) The
+ * parameters. That signature is still allowed, but deprecated.) The
* relationship between jointype and sjinfo is explained in the comments for
* clause_selectivity() --- the short version is that jointype is usually
* best ignored in favor of examining sjinfo.
* joins, however, the selectivity is defined as the fraction of the left-hand
* side relation's rows that are expected to have a match (ie, at least one
* row with a TRUE result) in the right-hand side.
+ *
+ * For both oprrest and oprjoin functions, the operator's input collation OID
+ * (if any) is passed using the standard fmgr mechanism, so that the estimator
+ * function can fetch it with PG_GET_COLLATION(). Note, however, that all
+ * statistics in pg_statistic are currently built using the database's default
+ * collation. Thus, in most cases where we are looking at statistics, we
+ * should ignore the actual operator collation and use DEFAULT_COLLATION_OID.
+ * We expect that the error induced by doing this is usually not large enough
+ * to justify complicating matters.
*----------
*/
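
/*
 * Editor's illustration, not part of this file: a minimal restriction
 * estimator under the calling conventions described above. The function
 * name "myopsel" is hypothetical; the argument-fetching calls mirror the
 * real estimators below (compare patternsel), including the use of
 * PG_GET_COLLATION() to obtain the operator's input collation. A real
 * estimator would of course compute something smarter than a constant.
 *
 *	Datum
 *	myopsel(PG_FUNCTION_ARGS)
 *	{
 *		PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 *		Oid			operator = PG_GETARG_OID(1);
 *		List	   *args = (List *) PG_GETARG_POINTER(2);
 *		int			varRelid = PG_GETARG_INT32(3);
 *		Oid			collation = PG_GET_COLLATION();
 *
 *		PG_RETURN_FLOAT8(0.5);
 *	}
 */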
#include "postgres.h"
#include <ctype.h>
+#include <float.h>
#include <math.h>
#include "access/gin.h"
+#include "access/htup_details.h"
#include "access/sysattr.h"
#include "catalog/index.h"
+#include "catalog/pg_am.h"
#include "catalog/pg_collation.h"
+#include "catalog/pg_operator.h"
#include "catalog/pg_opfamily.h"
#include "catalog/pg_statistic.h"
#include "catalog/pg_type.h"
#include "optimizer/predtest.h"
#include "optimizer/restrictinfo.h"
#include "optimizer/var.h"
+#include "parser/parse_clause.h"
#include "parser/parse_coerce.h"
#include "parser/parsetree.h"
#include "utils/builtins.h"
#include "utils/date.h"
#include "utils/datum.h"
#include "utils/fmgroids.h"
+#include "utils/index_selfuncs.h"
#include "utils/lsyscache.h"
#include "utils/nabstime.h"
#include "utils/pg_locale.h"
#include "utils/syscache.h"
#include "utils/timestamp.h"
#include "utils/tqual.h"
+#include "utils/typcache.h"
+#include "utils/varlena.h"
/* Hooks for plugins to get control when we ask for stats */
static Selectivity prefix_selectivity(PlannerInfo *root,
VariableStatData *vardata,
Oid vartype, Oid opfamily, Const *prefixcon);
-static Selectivity pattern_selectivity(Const *patt, Pattern_Type ptype);
+static Selectivity like_selectivity(const char *patt, int pattlen,
+ bool case_insensitive);
+static Selectivity regex_selectivity(const char *patt, int pattlen,
+ bool case_insensitive,
+ int fixed_prefix_len);
static Datum string_to_datum(const char *str, Oid datatype);
static Const *string_to_const(const char *str, Oid datatype);
static Const *string_to_bytea_const(const char *str, size_t str_len);
+static List *add_predicate_to_quals(IndexOptInfo *index, List *indexQuals);
/*
*
* Note: this routine is also used to estimate selectivity for some
* operators that are not "=" but have comparable selectivity behavior,
- * such as "~=" (geometric approximate-match). Even for "=", we must
+ * such as "~=" (geometric approximate-match). Even for "=", we must
* keep in mind that the left and right datatypes may differ.
*/
Datum
return 0.0;
/*
- * If we matched the var to a unique index, assume there is exactly one
- * match regardless of anything else. (This is slightly bogus, since the
- * index's equality operator might be different from ours, but it's more
- * likely to be right than ignoring the information.)
+ * If we matched the var to a unique index or DISTINCT clause, assume
+ * there is exactly one match regardless of anything else. (This is
+ * slightly bogus, since the index or clause's equality operator might be
+ * different from ours, but it's much more likely to be right than
+ * ignoring the information.)
*/
if (vardata->isunique && vardata->rel && vardata->rel->tuples >= 1.0)
return 1.0 / vardata->rel->tuples;
/*
* Is the constant "=" to any of the column's most common values?
* (Although the given operator may not really be "=", we will assume
- * that seeing whether it returns TRUE is an appropriate test. If you
+ * that seeing whether it returns TRUE is an appropriate test. If you
* don't like this, maybe you shouldn't be using eqsel for your
* operator...)
*/
bool isdefault;
/*
- * If we matched the var to a unique index, assume there is exactly one
- * match regardless of anything else. (This is slightly bogus, since the
- * index's equality operator might be different from ours, but it's more
- * likely to be right than ignoring the information.)
+ * If we matched the var to a unique index or DISTINCT clause, assume
+ * there is exactly one match regardless of anything else. (This is
+ * slightly bogus, since the index or clause's equality operator might be
+ * different from ours, but it's much more likely to be right than
+ * ignoring the information.)
*/
if (vardata->isunique && vardata->rel && vardata->rel->tuples >= 1.0)
return 1.0 / vardata->rel->tuples;
* result averaged over all possible values whether common or
* uncommon. (Essentially, we are assuming that the not-yet-known
* comparison value is equally likely to be any of the possible
- * values, regardless of their frequency in the table. Is that a good
+ * values, regardless of their frequency in the table. Is that a good
* idea?)
*/
selec = 1.0 - stats->stanullfrac;
* essentially using the histogram just as a representative sample. However,
* small histograms are unlikely to be all that representative, so the caller
* should be prepared to fall back on some other estimation approach when the
- * histogram is missing or very small. It may also be prudent to combine this
+ * histogram is missing or very small. It may also be prudent to combine this
* approach with another one when the histogram is small.
*
* If the actual histogram size is not at least min_hist_size, we won't bother
*
* Note that the result disregards both the most-common-values (if any) and
* null entries. The caller is expected to combine this result with
- * statistics for those portions of the column population. It may also be
+ * statistics for those portions of the column population. It may also be
* prudent to clamp the result range, ie, disbelieve exact 0 or 1 outputs.
*/
double
*
* If the binary search accesses the first or last histogram
* entry, we try to replace that endpoint with the true column min
- * or max as found by get_actual_variable_range(). This
+ * or max as found by get_actual_variable_range(). This
* ameliorates misestimates when the min or max is moving as a
* result of changes since the last ANALYZE. Note that this could
* result in effectively including MCVs into the histogram that
/*
* Watch out for the possibility that we got a NaN or
- * Infinity from the division. This can happen
+ * Infinity from the division. This can happen
* despite the previous checks, if for example "low"
* is -Infinity.
*/
* Ideally we'd produce an error here, on the grounds that
* the given operator shouldn't have scalarXXsel
* registered as its selectivity func unless we can deal
- * with its operand types. But currently, all manner of
+ * with its operand types. But currently, all manner of
* stuff is invoking scalarXXsel, so give a default
* estimate until that can be fixed.
*/
/*
* The histogram boundaries are only approximate to begin with,
- * and may well be out of date anyway. Therefore, don't believe
+ * and may well be out of date anyway. Therefore, don't believe
* extremely small or large selectivity estimates --- unless we
* got actual current endpoint values from the table.
*/
Oid operator = PG_GETARG_OID(1);
List *args = (List *) PG_GETARG_POINTER(2);
int varRelid = PG_GETARG_INT32(3);
+ Oid collation = PG_GET_COLLATION();
VariableStatData vardata;
Node *other;
bool varonleft;
Oid vartype;
Oid opfamily;
Pattern_Prefix_Status pstatus;
- Const *patt = NULL;
+ Const *patt;
Const *prefix = NULL;
- Const *rest = NULL;
+ Selectivity rest_selec = 0;
double result;
/*
* If this is for a NOT LIKE or similar operator, get the corresponding
- * positive-match operator and work with that. Set result to the correct
+ * positive-match operator and work with that. Set result to the correct
* default estimate, too.
*/
if (negate)
}
/*
- * Divide pattern into fixed prefix and remainder. XXX we have to assume
- * default collation here, because we don't have access to the actual
- * input collation for the operator. FIXME ...
+ * Pull out any fixed prefix implied by the pattern, and estimate the
+ * fractional selectivity of the remainder of the pattern. Unlike many of
+ * the other functions in this file, we use the pattern operator's actual
+ * collation for this step. This is not because we expect the collation
+ * to make a big difference in the selectivity estimate (it seldom would),
+ * but because we want to be sure we cache compiled regexps under the
+ * right cache key, so that they can be re-used at runtime.
*/
patt = (Const *) other;
- pstatus = pattern_fixed_prefix(patt, ptype, DEFAULT_COLLATION_OID,
- &prefix, &rest);
+ pstatus = pattern_fixed_prefix(patt, ptype, collation,
+ &prefix, &rest_selec);
/*
- * If necessary, coerce the prefix constant to the right type. (The "rest"
- * constant need not be changed.)
+ * If necessary, coerce the prefix constant to the right type.
*/
if (prefix && prefix->consttype != vartype)
{
{
Selectivity heursel;
Selectivity prefixsel;
- Selectivity restsel;
if (pstatus == Pattern_Prefix_Partial)
prefixsel = prefix_selectivity(root, &vardata, vartype,
opfamily, prefix);
else
prefixsel = 1.0;
- restsel = pattern_selectivity(rest, ptype);
- heursel = prefixsel * restsel;
+ heursel = prefixsel * rest_selec;
if (selec < 0) /* fewer than 10 histogram entries? */
selec = heursel;
/*
* If we have most-common-values info, add up the fractions of the MCV
* entries that satisfy MCV OP PATTERN. These fractions contribute
- * directly to the result selectivity. Also add up the total fraction
+ * directly to the result selectivity. Also add up the total fraction
* represented by MCV entries.
*/
mcv_selec = mcv_selectivity(&vardata, &opproc, constval, true,
PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC, true));
}
+/*
+ * boolvarsel - Selectivity of Boolean variable.
+ *
+ * This can actually be called on any boolean-valued expression. If it
+ * involves only Vars of the specified relation, and if there are statistics
+ * about the Var or expression (the latter is possible if it's indexed) then
+ * we'll produce a real estimate; otherwise it's just a default.
+ */
+Selectivity
+boolvarsel(PlannerInfo *root, Node *arg, int varRelid)
+{
+ VariableStatData vardata;
+ double selec;
+
+ examine_variable(root, arg, varRelid, &vardata);
+ if (HeapTupleIsValid(vardata.statsTuple))
+ {
+ /*
+ * A boolean variable V is equivalent to the clause V = 't', so we
+ * compute the selectivity as if that is what we have.
+ */
+ selec = var_eq_const(&vardata, BooleanEqualOperator,
+ BoolGetDatum(true), false, true);
+ }
+ else if (is_funcclause(arg))
+ {
+ /*
+ * If we have no stats and it's a function call, estimate 0.3333333.
+ * This seems a pretty unprincipled choice, but Postgres has been
+ * using that estimate for function calls since 1992. The hoariness
+ * of this behavior suggests that we should not be in too much of a
+ * hurry to use another value.
+ */
+ selec = 0.3333333;
+ }
+ else
+ {
+ /* Otherwise, the default estimate is 0.5 */
+ selec = 0.5;
+ }
+ ReleaseVariableStats(vardata);
+ return selec;
+}
+
/*
* booltestsel - Selectivity of BooleanTest Node.
*/
/*
* No most-common-value info available. Still have null fraction
* information, so use it for IS [NOT] UNKNOWN. Otherwise adjust
- * for null fraction and assume an even split for boolean tests.
+ * for null fraction and assume a 50-50 split of TRUE and FALSE.
*/
switch (booltesttype)
{
case IS_UNKNOWN:
-
- /*
- * Use freq_null directly.
- */
+ /* select only NULL values */
selec = freq_null;
break;
case IS_NOT_UNKNOWN:
-
- /*
- * Select not unknown (not null) values. Calculate from
- * freq_null.
- */
+ /* select non-NULL values */
selec = 1.0 - freq_null;
break;
case IS_TRUE:
- case IS_NOT_TRUE:
case IS_FALSE:
- case IS_NOT_FALSE:
+ /* Assume we select half of the non-NULL values */
selec = (1.0 - freq_null) / 2.0;
break;
+ case IS_NOT_TRUE:
+ case IS_NOT_FALSE:
+ /* Assume we select NULLs plus half of the non-NULLs */
+ /* equiv. to freq_null + (1.0 - freq_null) / 2.0 */
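+ /* e.g., with freq_null = 0.2: 0.2 + (1.0 - 0.2) / 2.0 = 0.6 */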
+ selec = (freq_null + 1.0) / 2.0;
+ break;
default:
elog(ERROR, "unrecognized booltesttype: %d",
(int) booltesttype);
{
Oid operator = clause->opno;
bool useOr = clause->useOr;
+ bool isEquality = false;
+ bool isInequality = false;
Node *leftop;
Node *rightop;
Oid nominal_element_type;
Oid nominal_element_collation;
+ TypeCacheEntry *typentry;
RegProcedure oprsel;
FmgrInfo oprselproc;
Selectivity s1;
+ Selectivity s1disjoint;
- /*
- * First, look up the underlying operator's selectivity estimator. Punt if
- * it hasn't got one.
- */
- if (is_join_clause)
- oprsel = get_oprjoin(operator);
- else
- oprsel = get_oprrest(operator);
- if (!oprsel)
- return (Selectivity) 0.5;
- fmgr_info(oprsel, &oprselproc);
-
- /* deconstruct the expression */
+ /* First, deconstruct the expression */
Assert(list_length(clause->args) == 2);
leftop = (Node *) linitial(clause->args);
rightop = (Node *) lsecond(clause->args);
+ /* aggressively reduce both sides to constants */
+ leftop = estimate_expression_value(root, leftop);
+ rightop = estimate_expression_value(root, rightop);
+
/* get nominal (after relabeling) element type of rightop */
nominal_element_type = get_base_element_type(exprType(rightop));
if (!OidIsValid(nominal_element_type))
/* look through any binary-compatible relabeling of rightop */
rightop = strip_array_coercion(rightop);
+ /*
+ * Detect whether the operator is the default equality or inequality
+ * operator of the array element type.
+ */
+ typentry = lookup_type_cache(nominal_element_type, TYPECACHE_EQ_OPR);
+ if (OidIsValid(typentry->eq_opr))
+ {
+ if (operator == typentry->eq_opr)
+ isEquality = true;
+ else if (get_negator(operator) == typentry->eq_opr)
+ isInequality = true;
+ }
+
+ /*
+ * If it is equality or inequality, we might be able to estimate this as a
+ * form of array containment; for instance "const = ANY(column)" can be
+ * treated as "ARRAY[const] <@ column". scalararraysel_containment tries
+ * that, and returns the selectivity estimate if successful, or -1 if not.
+ */
+ if ((isEquality || isInequality) && !is_join_clause)
+ {
+ s1 = scalararraysel_containment(root, leftop, rightop,
+ nominal_element_type,
+ isEquality, useOr, varRelid);
+ if (s1 >= 0.0)
+ return s1;
+ }
+
+ /*
+ * Look up the underlying operator's selectivity estimator. Punt if it
+ * hasn't got one.
+ */
+ if (is_join_clause)
+ oprsel = get_oprjoin(operator);
+ else
+ oprsel = get_oprrest(operator);
+ if (!oprsel)
+ return (Selectivity) 0.5;
+ fmgr_info(oprsel, &oprselproc);
+
+ /*
+ * In the array-containment check above, we must only believe that an
+ * operator is equality or inequality if it is the default btree equality
+ * operator (or its negator) for the element type, since those are the
+ * operators that array containment will use. But in what follows, we can
+ * be a little laxer, and also believe that any operators using eqsel() or
+ * neqsel() as selectivity estimator act like equality or inequality.
+ */
+ if (oprsel == F_EQSEL || oprsel == F_EQJOINSEL)
+ isEquality = true;
+ else if (oprsel == F_NEQSEL || oprsel == F_NEQJOINSEL)
+ isInequality = true;
+
/*
* We consider three cases:
*
ARR_ELEMTYPE(arrayval),
elmlen, elmbyval, elmalign,
&elem_values, &elem_nulls, &num_elems);
- s1 = useOr ? 0.0 : 1.0;
+
+ /*
+ * For generic operators, we assume the probability of success is
+ * independent for each array element. But for "= ANY" or "<> ALL",
+ * if the array elements are distinct (which'd typically be the case)
+ * then the probabilities are disjoint, and we should just sum them.
+ *
+ * If we were being really tense we would try to confirm that the
+ * elements are all distinct, but that would be expensive and it
+ * doesn't seem to be worth the cycles; it would amount to penalizing
+ * well-written queries in favor of poorly-written ones. However, we
+ * do protect ourselves a little bit by checking whether the
+ * disjointness assumption leads to an impossible (out of range)
+ * probability; if so, we fall back to the normal calculation.
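+ *
+ * As a concrete check: three "= ANY" probes of selectivity 0.01 each
+ * combine to 1 - (1 - 0.01)^3 = 0.029701 under the independence rule,
+ * but simply sum to 0.03 under the disjointness assumption.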
+ */
+ s1 = s1disjoint = (useOr ? 0.0 : 1.0);
+
for (i = 0; i < num_elems; i++)
{
List *args;
elem_nulls[i],
elmbyval));
if (is_join_clause)
- s2 = DatumGetFloat8(FunctionCall5(&oprselproc,
- PointerGetDatum(root),
+ s2 = DatumGetFloat8(FunctionCall5Coll(&oprselproc,
+ clause->inputcollid,
+ PointerGetDatum(root),
ObjectIdGetDatum(operator),
- PointerGetDatum(args),
- Int16GetDatum(jointype),
- PointerGetDatum(sjinfo)));
+ PointerGetDatum(args),
+ Int16GetDatum(jointype),
+ PointerGetDatum(sjinfo)));
else
- s2 = DatumGetFloat8(FunctionCall4(&oprselproc,
- PointerGetDatum(root),
+ s2 = DatumGetFloat8(FunctionCall4Coll(&oprselproc,
+ clause->inputcollid,
+ PointerGetDatum(root),
ObjectIdGetDatum(operator),
- PointerGetDatum(args),
- Int32GetDatum(varRelid)));
+ PointerGetDatum(args),
+ Int32GetDatum(varRelid)));
+
if (useOr)
+ {
s1 = s1 + s2 - s1 * s2;
+ if (isEquality)
+ s1disjoint += s2;
+ }
else
+ {
s1 = s1 * s2;
+ if (isInequality)
+ s1disjoint += s2 - 1.0;
+ }
}
+
+ /* accept disjoint-probability estimate if in range */
+ if ((useOr ? isEquality : isInequality) &&
+ s1disjoint >= 0.0 && s1disjoint <= 1.0)
+ s1 = s1disjoint;
}
else if (rightop && IsA(rightop, ArrayExpr) &&
!((ArrayExpr *) rightop)->multidims)
get_typlenbyval(arrayexpr->element_typeid,
&elmlen, &elmbyval);
- s1 = useOr ? 0.0 : 1.0;
+
+ /*
+ * We use the assumption of disjoint probabilities here too, although
+ * the odds of equal array elements are rather higher if the elements
+ * are not all constants (which they won't be, else constant folding
+ * would have reduced the ArrayExpr to a Const). In this path it's
+ * critical to have the sanity check on the s1disjoint estimate.
+ */
+ s1 = s1disjoint = (useOr ? 0.0 : 1.0);
+
foreach(l, arrayexpr->elements)
{
Node *elem = (Node *) lfirst(l);
*/
args = list_make2(leftop, elem);
if (is_join_clause)
- s2 = DatumGetFloat8(FunctionCall5(&oprselproc,
- PointerGetDatum(root),
+ s2 = DatumGetFloat8(FunctionCall5Coll(&oprselproc,
+ clause->inputcollid,
+ PointerGetDatum(root),
ObjectIdGetDatum(operator),
- PointerGetDatum(args),
- Int16GetDatum(jointype),
- PointerGetDatum(sjinfo)));
+ PointerGetDatum(args),
+ Int16GetDatum(jointype),
+ PointerGetDatum(sjinfo)));
else
- s2 = DatumGetFloat8(FunctionCall4(&oprselproc,
- PointerGetDatum(root),
+ s2 = DatumGetFloat8(FunctionCall4Coll(&oprselproc,
+ clause->inputcollid,
+ PointerGetDatum(root),
ObjectIdGetDatum(operator),
- PointerGetDatum(args),
- Int32GetDatum(varRelid)));
+ PointerGetDatum(args),
+ Int32GetDatum(varRelid)));
+
if (useOr)
+ {
s1 = s1 + s2 - s1 * s2;
+ if (isEquality)
+ s1disjoint += s2;
+ }
else
+ {
s1 = s1 * s2;
+ if (isInequality)
+ s1disjoint += s2 - 1.0;
+ }
}
+
+ /* accept disjoint-probability estimate if in range */
+ if ((useOr ? isEquality : isInequality) &&
+ s1disjoint >= 0.0 && s1disjoint <= 1.0)
+ s1 = s1disjoint;
}
else
{
dummyexpr->collation = clause->inputcollid;
args = list_make2(leftop, dummyexpr);
if (is_join_clause)
- s2 = DatumGetFloat8(FunctionCall5(&oprselproc,
- PointerGetDatum(root),
- ObjectIdGetDatum(operator),
- PointerGetDatum(args),
- Int16GetDatum(jointype),
- PointerGetDatum(sjinfo)));
+ s2 = DatumGetFloat8(FunctionCall5Coll(&oprselproc,
+ clause->inputcollid,
+ PointerGetDatum(root),
+ ObjectIdGetDatum(operator),
+ PointerGetDatum(args),
+ Int16GetDatum(jointype),
+ PointerGetDatum(sjinfo)));
else
- s2 = DatumGetFloat8(FunctionCall4(&oprselproc,
- PointerGetDatum(root),
- ObjectIdGetDatum(operator),
- PointerGetDatum(args),
- Int32GetDatum(varRelid)));
+ s2 = DatumGetFloat8(FunctionCall4Coll(&oprselproc,
+ clause->inputcollid,
+ PointerGetDatum(root),
+ ObjectIdGetDatum(operator),
+ PointerGetDatum(args),
+ Int32GetDatum(varRelid)));
s1 = useOr ? 0.0 : 1.0;
/*
* Arbitrarily assume 10 elements in the eventual array value (see
- * also estimate_array_length)
+ * also estimate_array_length). We don't risk an assumption of
+ * disjoint probabilities here.
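+ *
+ * (Under the independence rule used here, ten probes of equal
+ * selectivity s combine to 1 - (1 - s)^10 in the OR case and to s^10
+ * in the AND case.)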
*/
for (i = 0; i < 10; i++)
{
{
Selectivity s1;
Oid opno = linitial_oid(clause->opnos);
+ Oid inputcollid = linitial_oid(clause->inputcollids);
List *opargs;
bool is_join_clause;
/* Estimate selectivity for a join clause. */
s1 = join_selectivity(root, opno,
opargs,
+ inputcollid,
jointype,
sjinfo);
}
/* Estimate selectivity for a restriction clause. */
s1 = restriction_selectivity(root, opno,
opargs,
+ inputcollid,
varRelid);
}
break;
case JOIN_SEMI:
case JOIN_ANTI:
+
/*
* Look up the join's inner relation. min_righthand is sufficient
* information because neither SEMI nor ANTI joins permit any
if (have_mcvs1 && have_mcvs2)
{
/*
- * We have most-common-value lists for both relations. Run through
+ * We have most-common-value lists for both relations. Run through
* the lists to see which MCVs actually join to each other with the
- * given operator. This allows us to determine the exact join
+ * given operator. This allows us to determine the exact join
* selectivity for the portion of the relations represented by the MCV
* lists. We still have to estimate for the remaining population, but
* in a skewed distribution this gives us a big leg up in accuracy.
/*
* Note we assume that each MCV will match at most one member of the
- * other MCV list. If the operator isn't really equality, there could
+ * other MCV list. If the operator isn't really equality, there could
* be multiple matches --- but we don't look for them, both for speed
* and because the math wouldn't add up...
*/
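
/*
 * Editor's sketch of the matching pass this describes, with illustrative
 * variable names (values1/numbers1 and values2/numbers2 being the MCV
 * datums and frequencies, and eqproc the equality operator's FmgrInfo):
 * each MCV of relation 1 is probed against the not-yet-matched MCVs of
 * relation 2, accumulating the joint frequency of matched pairs.
 *
 *	for (i = 0; i < nvalues1; i++)
 *	{
 *		for (j = 0; j < nvalues2; j++)
 *		{
 *			if (hasmatch2[j])
 *				continue;
 *			if (DatumGetBool(FunctionCall2Coll(&eqproc,
 *											   DEFAULT_COLLATION_OID,
 *											   values1[i],
 *											   values2[j])))
 *			{
 *				hasmatch1[i] = hasmatch2[j] = true;
 *				matchprodfreq += numbers1[i] * numbers2[j];
 *				break;
 *			}
 *		}
 *	}
 */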
* We can apply this clamping both with respect to the base relation from
* which the join variable comes (if there is just one), and to the
* immediate inner input relation of the current join.
+ *
+ * If we clamp, we can treat nd2 as being a non-default estimate; it's not
+ * great, maybe, but it didn't come out of nowhere either. This is most
+ * helpful when the inner relation is empty and consequently has no stats.
*/
if (vardata2->rel)
- nd2 = Min(nd2, vardata2->rel->rows);
- nd2 = Min(nd2, inner_rel->rows);
+ {
+ if (nd2 >= vardata2->rel->rows)
+ {
+ nd2 = vardata2->rel->rows;
+ isdefault2 = false;
+ }
+ }
+ if (nd2 >= inner_rel->rows)
+ {
+ nd2 = inner_rel->rows;
+ isdefault2 = false;
+ }
if (HeapTupleIsValid(vardata1->statsTuple))
{
if (have_mcvs1 && have_mcvs2 && OidIsValid(operator))
{
/*
- * We have most-common-value lists for both relations. Run through
+ * We have most-common-value lists for both relations. Run through
* the lists to see which MCVs actually join to each other with the
- * given operator. This allows us to determine the exact join
+ * given operator. This allows us to determine the exact join
* selectivity for the portion of the relations represented by the MCV
* lists. We still have to estimate for the remaining population, but
* in a skewed distribution this gives us a big leg up in accuracy.
/*
* Note we assume that each MCV will match at most one member of the
- * other MCV list. If the operator isn't really equality, there could
+ * other MCV list. If the operator isn't really equality, there could
* be multiple matches --- but we don't look for them, both for speed
* and because the math wouldn't add up...
*/
/*
* Now we need to estimate the fraction of relation 1 that has at
- * least one join partner. We know for certain that the matched MCVs
+ * least one join partner. We know for certain that the matched MCVs
* do, so that gives us a lower bound, but we're really in the dark
* about everything else. Our crude approach is: if nd1 <= nd2 then
* assume all non-null rel1 rows have join partners, else assume for
* groupExprs - list of expressions being grouped by
* input_rows - number of rows estimated to arrive at the group/unique
* filter step
+ * pgset - NULL, or a List** pointing to a grouping set to filter the
+ * groupExprs against
*
* Given the lack of any cross-correlation statistics in the system, it's
* impossible to do anything really trustworthy with GROUP BY conditions
* case (all possible cross-product terms actually appear as groups) since
* very often the grouped-by Vars are highly correlated. Our current approach
* is as follows:
- * 1. Expressions yielding boolean are assumed to contribute two groups,
+ * 1. Expressions yielding boolean are assumed to contribute two groups,
* independently of their content, and are ignored in the subsequent
- * steps. This is mainly because tests like "col IS NULL" break the
+ * steps. This is mainly because tests like "col IS NULL" break the
* heuristic used in step 2 especially badly.
- * 2. Reduce the given expressions to a list of unique Vars used. For
+ * 2. Reduce the given expressions to a list of unique Vars used. For
* example, GROUP BY a, a + b is treated the same as GROUP BY a, b.
* It is clearly correct not to count the same Var more than once.
* It is also reasonable to treat f(x) the same as x: f() cannot
* As a special case, if a GROUP BY expression can be matched to an
* expressional index for which we have statistics, then we treat the
* whole expression as though it were just a Var.
- * 3. If the list contains Vars of different relations that are known equal
+ * 3. If the list contains Vars of different relations that are known equal
* due to equivalence classes, then drop all but one of the Vars from each
* known-equal set, keeping the one with smallest estimated # of values
* (since the extra values of the others can't appear in joined rows).
* Note the reason we only consider Vars of different relations is that
* if we considered ones of the same rel, we'd be double-counting the
* restriction selectivity of the equality in the next step.
- * 4. For Vars within a single source rel, we multiply together the numbers
+ * 4. For Vars within a single source rel, we multiply together the numbers
* of values, clamp to the number of rows in the rel (divided by 10 if
- * more than one Var), and then multiply by the selectivity of the
- * restriction clauses for that rel. When there's more than one Var,
- * the initial product is probably too high (it's the worst case) but
- * clamping to a fraction of the rel's rows seems to be a helpful
- * heuristic for not letting the estimate get out of hand. (The factor
- * of 10 is derived from pre-Postgres-7.4 practice.) Multiplying
- * by the restriction selectivity is effectively assuming that the
- * restriction clauses are independent of the grouping, which is a crummy
- * assumption, but it's hard to do better.
- * 5. If there are Vars from multiple rels, we repeat step 4 for each such
+ * more than one Var), and then multiply by a factor based on the
+ * selectivity of the restriction clauses for that rel. When there's
+ * more than one Var, the initial product is probably too high (it's the
+ * worst case) but clamping to a fraction of the rel's rows seems to be a
+ * helpful heuristic for not letting the estimate get out of hand. (The
+ * factor of 10 is derived from pre-Postgres-7.4 practice.) The factor
+ * we multiply by to adjust for the restriction selectivity assumes that
+ * the restriction clauses are independent of the grouping, which may not
+ * be a valid assumption, but it's hard to do better.
+ * 5. If there are Vars from multiple rels, we repeat step 4 for each such
* rel, and multiply the results together.
* Note that rels not containing grouped Vars are ignored completely, as are
* join clauses. Such rels cannot increase the number of groups, and we
* but we don't have the info to do better).
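*
* For example, GROUP BY a, b with a having 10 and b having 50 distinct
* values in a 1000-row rel: step 4 takes the product 500, clamps it to
* 1000/10 = 100 since more than one Var is present, and then applies the
* restriction-selectivity adjustment.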
*/
double
-estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows)
+estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
+ List **pgset)
{
List *varinfos = NIL;
double numdistinct;
ListCell *l;
+ int i;
- /* We should not be called unless query has GROUP BY (or DISTINCT) */
- Assert(groupExprs != NIL);
+ /*
+ * We don't ever want to return an estimate of zero groups, as that tends
+ * to lead to division-by-zero and other unpleasantness. The input_rows
+ * estimate is usually already at least 1, but clamp it just in case it
+ * isn't.
+ */
+ input_rows = clamp_row_est(input_rows);
+
+ /*
+ * If no grouping columns, there's exactly one group. (This can't happen
+ * for normal cases with GROUP BY or DISTINCT, but it is possible for
+ * corner cases with set operations.)
+ */
+ if (groupExprs == NIL || (pgset && list_length(*pgset) < 1))
+ return 1.0;
/*
- * Count groups derived from boolean grouping expressions. For other
+ * Count groups derived from boolean grouping expressions. For other
* expressions, find the unique Vars used, treating an expression as a Var
* if we can find stats for it. For each one, record the statistical
* estimate of number of distinct values (total in its table, without
*/
numdistinct = 1.0;
+ i = 0;
foreach(l, groupExprs)
{
Node *groupexpr = (Node *) lfirst(l);
List *varshere;
ListCell *l2;
+ /* is expression in this grouping set? */
+ if (pgset && !list_member_int(*pgset, i++))
+ continue;
+
/* Short-circuit for expressions returning boolean */
if (exprType(groupexpr) == BOOLOID)
{
* down to ignoring the possible addition of nulls to the result set).
*/
varshere = pull_var_clause(groupexpr,
- PVC_RECURSE_AGGREGATES,
+ PVC_RECURSE_AGGREGATES |
+ PVC_RECURSE_WINDOWFUNCS |
PVC_RECURSE_PLACEHOLDERS);
/*
* Group Vars by relation and estimate total numdistinct.
*
* For each iteration of the outer loop, we process the frontmost Var in
- * varinfos, plus all other Vars in the same relation. We remove these
+ * varinfos, plus all other Vars in the same relation. We remove these
* Vars from the newvarinfos list for the next iteration. This is the
* easiest way to group Vars of same rel together.
*/
reldistinct = clamp;
/*
- * Multiply by restriction selectivity.
+ * Update the estimate based on the restriction selectivity,
+ * guarding against division by zero when reldistinct is zero.
+ * Also skip this if we know that we are returning all rows.
*/
- reldistinct *= rel->rows / rel->tuples;
+ if (reldistinct > 0 && rel->rows < rel->tuples)
+ {
+ /*
+ * Given a table containing N rows with n distinct values in a
+ * uniform distribution, if we select p rows at random then
+ * the expected number of distinct values selected is
+ *
+ * n * (1 - product((N-N/n-i)/(N-i), i=0..p-1))
+ *
+ * = n * (1 - (N-N/n)! / (N-N/n-p)! * (N-p)! / N!)
+ *
+ * See "Approximating block accesses in database
+ * organizations", S. B. Yao, Communications of the ACM,
+ * Volume 20 Issue 4, April 1977 Pages 260-261.
+ *
+ * Alternatively, re-arranging the terms from the factorials,
+ * this may be written as
+ *
+ * n * (1 - product((N-p-i)/(N-i), i=0..N/n-1))
+ *
+ * This form of the formula is more efficient to compute in
+ * the common case where p is larger than N/n. Additionally,
+ * as pointed out by Dell'Era, if i << N for all terms in the
+ * product, it can be approximated by
+ *
+ * n * (1 - ((N-p)/N)^(N/n))
+ *
+ * See "Expected distinct values when selecting from a bag
+ * without replacement", Alberto Dell'Era,
+ * http://www.adellera.it/investigations/distinct_balls/.
+ *
+ * The condition i << N is equivalent to n >> 1, so this is a
+ * good approximation when the number of distinct values in
+ * the table is large. It turns out that this formula also
+ * works well even when n is small.
+ */
+ reldistinct *=
+ (1 - pow((rel->tuples - rel->rows) / rel->tuples,
+ rel->tuples / reldistinct));
+ }
+ reldistinct = clamp_row_est(reldistinct);
/*
* Update estimate of total distinct groups.
* distribution, so this will have to do for now.
*
* We are passed the number of buckets the executor will use for the given
- * input relation. If the data were perfectly distributed, with the same
+ * input relation. If the data were perfectly distributed, with the same
* number of tuples going into each available bucket, then the bucketsize
* fraction would be 1/nbuckets. But this happy state of affairs will occur
* only if (a) there are at least nbuckets distinct data values, and (b)
- * we have a not-too-skewed data distribution. Otherwise the buckets will
+ * we have a not-too-skewed data distribution. Otherwise the buckets will
* be nonuniformly occupied. If the other relation in the join has a key
* distribution similar to this one's, then the most-loaded buckets are
* exactly those that will be probed most often. Therefore, the "average"
* XXX Possibly better way, but much more expensive: multiply by
* selectivity of rel's restriction clauses that mention the target Var.
*/
- if (vardata.rel)
+ if (vardata.rel && vardata.rel->tuples > 0)
+ {
ndistinct *= vardata.rel->rows / vardata.rel->tuples;
+ ndistinct = clamp_row_est(ndistinct);
+ }
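
/*
* For instance, 500 distinct values in a 10000-tuple rel filtered down to
* 1000 rows would be scaled to 500 * (1000/10000) = 50 here.
*/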
/*
* Initial estimate of bucketsize fraction is 1/nbuckets as long as the
* operators to estimate selectivity for the other's. This is outright
* wrong in some cases --- in particular signed versus unsigned
* interpretation could trip us up. But it's useful enough in the
- * majority of cases that we do it anyway. Should think about more
+ * majority of cases that we do it anyway. Should think about more
* rigorous ways to do it.
*/
switch (valuetypid)
case REGTYPEOID:
case REGCONFIGOID:
case REGDICTIONARYOID:
+ case REGROLEOID:
+ case REGNAMESPACEOID:
*scaledvalue = convert_numeric_to_scalar(value, valuetypid);
*scaledlobound = convert_numeric_to_scalar(lobound, boundstypid);
*scaledhibound = convert_numeric_to_scalar(hibound, boundstypid);
case INETOID:
case CIDROID:
case MACADDROID:
+ case MACADDR8OID:
*scaledvalue = convert_network_to_scalar(value, valuetypid);
*scaledlobound = convert_network_to_scalar(lobound, boundstypid);
*scaledhibound = convert_network_to_scalar(hibound, boundstypid);
case REGTYPEOID:
case REGCONFIGOID:
case REGDICTIONARYOID:
+ case REGROLEOID:
+ case REGNAMESPACEOID:
/* we can treat OIDs as integers... */
return (double) DatumGetObjectId(value);
}
return 0.0; /* empty string has scalar value 0 */
/*
- * Since base is at least 10, need not consider more than about 20 chars
+ * There seems little point in considering more than a dozen bytes from
+ * the string. Since base is at least 10, that will give us nominal
+ * resolution of at least 12 decimal digits, which is surely far more
+ * precision than this estimation technique has got anyway (especially in
+ * non-C locales). Also, even with the maximum possible base of 256, this
+ * ensures denom cannot grow larger than 256^13 = 2.03e31, which will not
+ * overflow on any known machine.
*/
- if (slen > 20)
- slen = 20;
+ if (slen > 12)
+ slen = 12;
/* Convert initial characters to fraction */
base = rangehi - rangelo + 1;
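
/*
* For instance, with rangelo = 'a' and rangehi = 'z' this gives base = 26,
* and the accumulation below maps "ab" to 0/26 + 1/(26*26), about 0.0015:
* each byte adds its offset within the range, divided by a successively
* higher power of the base.
*/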
{
char *xfrmstr;
size_t xfrmlen;
- size_t xfrmlen2;
+ size_t xfrmlen2 PG_USED_FOR_ASSERTS_ONLY;
/*
- * Note: originally we guessed at a suitable output buffer size, and
- * only needed to call strxfrm twice if our guess was too small.
- * However, it seems that some versions of Solaris have buggy strxfrm
- * that can write past the specified buffer length in that scenario.
- * So, do it the dumb way for portability.
- *
- * Yet other systems (e.g., glibc) sometimes return a smaller value
- * from the second call than the first; thus the Assert must be <= not
- * == as you'd expect. Can't any of these people program their way
- * out of a paper bag?
+ * XXX: We could guess at a suitable output buffer size and only call
+ * strxfrm twice if our guess is too small.
*
* XXX: strxfrm doesn't support UTF-8 encoding on Win32, it can return
* bogus data or set an error. This is not really a problem unless it
#endif
xfrmstr = (char *) palloc(xfrmlen + 1);
xfrmlen2 = strxfrm(xfrmstr, val, xfrmlen + 1);
+
+ /*
+ * Some systems (e.g., glibc) can return a smaller value from the
+ * second call than the first; thus the Assert must be <= not ==.
+ */
Assert(xfrmlen2 <= xfrmlen);
pfree(val);
val = xfrmstr;
* average month length of 365.25/12.0 days. Not too
* accurate, but plenty good enough for our purposes.
*/
-#ifdef HAVE_INT64_TIMESTAMP
return interval->time + interval->day * (double) USECS_PER_DAY +
interval->month * ((DAYS_PER_YEAR / (double) MONTHS_PER_YEAR) * USECS_PER_DAY);
-#else
- return interval->time + interval->day * SECS_PER_DAY +
- interval->month * ((DAYS_PER_YEAR / (double) MONTHS_PER_YEAR) * (double) SECS_PER_DAY);
-#endif
}
case RELTIMEOID:
-#ifdef HAVE_INT64_TIMESTAMP
return (DatumGetRelativeTime(value) * 1000000.0);
-#else
- return DatumGetRelativeTime(value);
-#endif
case TINTERVALOID:
{
TimeInterval tinterval = DatumGetTimeInterval(value);
-#ifdef HAVE_INT64_TIMESTAMP
if (tinterval->status != 0)
return ((tinterval->data[1] - tinterval->data[0]) * 1000000.0);
-#else
- if (tinterval->status != 0)
- return tinterval->data[1] - tinterval->data[0];
-#endif
return 0; /* for lack of a better idea */
}
case TIMEOID:
TimeTzADT *timetz = DatumGetTimeTzADTP(value);
/* use GMT-equivalent time */
-#ifdef HAVE_INT64_TIMESTAMP
return (double) (timetz->time + (timetz->zone * 1000000.0));
-#else
- return (double) (timetz->time + timetz->zone);
-#endif
}
}
right = (Node *) lsecond(args);
/*
- * Examine both sides. Note that when varRelid is nonzero, Vars of other
+ * Examine both sides. Note that when varRelid is nonzero, Vars of other
* relations will be treated as pseudoconstants.
*/
examine_variable(root, left, varRelid, vardata);
return true;
}
- /* Ooops, clause has wrong structure (probably var op var) */
+ /* Oops, clause has wrong structure (probably var op var) */
ReleaseVariableStats(*vardata);
ReleaseVariableStats(rdata);
* atttype, atttypmod: type data to pass to get_attstatsslot(). This is
* commonly the same as the exposed type of the variable argument,
* but can be different in binary-compatible-type cases.
- * isunique: TRUE if we were able to match the var to a unique index,
- * implying its values are unique for this query. (Caution: this
- * should be trusted for statistical purposes only, since we do not
- * check indimmediate.)
+ * isunique: TRUE if we were able to match the var to a unique index or a
+ * single-column DISTINCT clause, implying its values are unique for
+ * this query. (Caution: this should be trusted for statistical
+ * purposes only, since we do not check indimmediate nor verify that
+ * the exact same definition of equality applies.)
*
* Caller is responsible for doing ReleaseVariableStats() before exiting.
*/
/*
* Okay, it's a more complicated expression. Determine variable
- * membership. Note that when varRelid isn't zero, only vars of that
+ * membership. Note that when varRelid isn't zero, only vars of that
* relation are considered "real" vars.
*/
varnos = pull_varnos(basenode);
if (onerel)
{
/*
- * We have an expression in vars of a single relation. Try to match
+ * We have an expression in vars of a single relation. Try to match
* it to expressional index columns, in hopes of finding some
* statistics.
*
* XXX it's conceivable that there are multiple matches with different
* index opfamilies; if so, we need to pick one that matches the
- * operator we are estimating for. FIXME later.
+ * operator we are estimating for. FIXME later.
*/
ListCell *ilist;
{
/*
* Plain subquery (not one that was converted to an appendrel).
- *
- * Punt if subquery uses set operations, GROUP BY, or DISTINCT --- any
- * of these will mash underlying columns' stats beyond recognition.
- * (Set ops are particularly nasty; if we forged ahead, we would
- * return stats relevant to only the leftmost subselect...)
*/
Query *subquery = rte->subquery;
RelOptInfo *rel;
TargetEntry *ste;
- if (subquery->setOperations ||
- subquery->groupClause ||
- subquery->distinctClause)
+ /*
+ * Punt if it's a whole-row var rather than a plain column reference.
+ */
+ if (var->varattno == InvalidAttrNumber)
return;
/*
- * If the sub-query originated from a view with the security_barrier
- * attribute, we treat it as a black-box from outside of the view.
- * This is probably a harsher restriction than necessary; it's
- * certainly OK for the selectivity estimator (which is a C function,
- * and therefore omnipotent anyway) to look at the statistics. But
- * many selectivity estimators will happily *invoke the operator
- * function* to try to work out a good estimate - and that's not OK.
- * So for now, we do this.
+ * Punt if subquery uses set operations or GROUP BY, as these will
+ * mash underlying columns' stats beyond recognition. (Set ops are
+ * particularly nasty; if we forged ahead, we would return stats
+ * relevant to only the leftmost subselect...) DISTINCT is also
+ * problematic, but we check that later because there is a possibility
+ * of learning something even with it.
*/
- if (rte->security_barrier)
+ if (subquery->setOperations ||
+ subquery->groupClause)
return;
/*
*/
rel = find_base_rel(root, var->varno);
- /* Subquery should have been planned already */
- Assert(rel->subroot && IsA(rel->subroot, PlannerInfo));
+ /* If the subquery hasn't been planned yet, we have to punt */
+ if (rel->subroot == NULL)
+ return;
+ Assert(IsA(rel->subroot, PlannerInfo));
/*
- * Switch our attention to the subquery as mangled by the planner.
- * It was okay to look at the pre-planning version for the tests
- * above, but now we need a Var that will refer to the subroot's
- * live RelOptInfos. For instance, if any subquery pullup happened
- * during planning, Vars in the targetlist might have gotten replaced,
- * and we need to see the replacement expressions.
+ * Switch our attention to the subquery as mangled by the planner. It
+ * was okay to look at the pre-planning version for the tests above,
+ * but now we need a Var that will refer to the subroot's live
+ * RelOptInfos. For instance, if any subquery pullup happened during
+ * planning, Vars in the targetlist might have gotten replaced, and we
+ * need to see the replacement expressions.
*/
subquery = rel->subroot->parse;
Assert(IsA(subquery, Query));
rte->eref->aliasname, var->varattno);
var = (Var *) ste->expr;
+ /*
+ * If subquery uses DISTINCT, we can't make use of any stats for the
+ * variable ... but, if it's the only DISTINCT column, we are entitled
+ * to consider it unique. We do the test this way so that it works
+ * for cases involving DISTINCT ON.
+ */
+ if (subquery->distinctClause)
+ {
+ if (list_length(subquery->distinctClause) == 1 &&
+ targetIsInSortList(ste, InvalidOid, subquery->distinctClause))
+ vardata->isunique = true;
+ /* cannot go further */
+ return;
+ }
+
+ /*
+ * If the sub-query originated from a view with the security_barrier
+ * attribute, we must not look at the variable's statistics, though it
+ * seems all right to notice the existence of a DISTINCT clause. So
+ * stop here.
+ *
+ * This is probably a harsher restriction than necessary; it's
+ * certainly OK for the selectivity estimator (which is a C function,
+ * and therefore omnipotent anyway) to look at the statistics. But
+ * many selectivity estimators will happily *invoke the operator
+ * function* to try to work out a good estimate - and that's not OK.
+ * So for now, don't dig down for stats.
+ */
+ if (rte->security_barrier)
+ return;
+
/* Can only handle a simple Var of subquery's query level */
if (var && IsA(var, Var) &&
var->varlevelsup == 0)
/*
* Otherwise, the Var comes from a FUNCTION, VALUES, or CTE RTE. (We
* won't see RTE_JOIN here because join alias Vars have already been
- * flattened.) There's not much we can do with function outputs, but
+ * flattened.) There's not much we can do with function outputs, but
* maybe someday try to be smarter about VALUES and/or CTEs.
*/
}
* *isdefault: set to TRUE if the result is a default rather than based on
* anything meaningful.
*
- * NB: be careful to produce an integral result, since callers may compare
- * the result to exact integer counts.
+ * NB: be careful to produce a positive integral result, since callers may
+ * compare the result to exact integer counts, or might divide by it.
*/
double
get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
{
double stadistinct;
+ double stanullfrac = 0.0;
double ntuples;
*isdefault = false;
/*
- * Determine the stadistinct value to use. There are cases where we can
+ * Determine the stadistinct value to use. There are cases where we can
* get an estimate even without a pg_statistic entry, or can get a better
- * value than is in pg_statistic.
+ * value than is in pg_statistic. Grab stanullfrac too if we can find it
+ * (otherwise, assume no nulls, for lack of any better idea).
*/
if (HeapTupleIsValid(vardata->statsTuple))
{
stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
stadistinct = stats->stadistinct;
+ stanullfrac = stats->stanullfrac;
}
else if (vardata->vartype == BOOLOID)
{
{
case ObjectIdAttributeNumber:
case SelfItemPointerAttributeNumber:
- stadistinct = -1.0; /* unique */
+ stadistinct = -1.0; /* unique (and all non null) */
break;
case TableOidAttributeNumber:
stadistinct = 1.0; /* only 1 value */
}
/*
- * If there is a unique index for the variable, assume it is unique no
- * matter what pg_statistic says; the statistics could be out of date, or
- * we might have found a partial unique index that proves the var is
- * unique for this query.
+ * If there is a unique index or DISTINCT clause for the variable, assume
+ * it is unique no matter what pg_statistic says; the statistics could be
+ * out of date, or we might have found a partial unique index that proves
+ * the var is unique for this query. However, we'd better still believe
+ * the null-fraction statistic.
*/
if (vardata->isunique)
- stadistinct = -1.0;
+ stadistinct = -1.0 * (1.0 - stanullfrac);
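+
+ /*
+ * For example, with stanullfrac = 0.1 this yields stadistinct = -0.9;
+ * the relative-estimate branch below then turns that into 0.9 * ntuples
+ * distinct non-null values.
+ */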
/*
* If we had an absolute estimate, use that.
*/
if (stadistinct > 0.0)
- return stadistinct;
+ return clamp_row_est(stadistinct);
/*
* Otherwise we need to get the relation size; punt if not available.
* If we had a relative estimate, use that.
*/
if (stadistinct < 0.0)
- return floor((-stadistinct * ntuples) + 0.5);
+ return clamp_row_est(-stadistinct * ntuples);
/*
* With no data, estimate ndistinct = ntuples if the table is small, else
- * use default. We use DEFAULT_NUM_DISTINCT as the cutoff for "small"
- * so that the behavior isn't discontinuous.
+ * use default. We use DEFAULT_NUM_DISTINCT as the cutoff for "small" so
+ * that the behavior isn't discontinuous.
*/
if (ntuples < DEFAULT_NUM_DISTINCT)
- return ntuples;
+ return clamp_row_est(ntuples);
*isdefault = true;
return DEFAULT_NUM_DISTINCT;
/*
* XXX It's very tempting to try to use the actual column min and max, if
- * we can get them relatively-cheaply with an index probe. However, since
+ * we can get them relatively-cheaply with an index probe. However, since
* this function is called many times during join planning, that could
* have unpleasant effects on planning speed. Need more investigation
* before enabling this.
HeapTuple tup;
Datum values[INDEX_MAX_KEYS];
bool isnull[INDEX_MAX_KEYS];
+ SnapshotData SnapshotDirty;
estate = CreateExecutorState();
econtext = GetPerTupleExprContext(estate);
slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRel));
econtext->ecxt_scantuple = slot;
get_typlenbyval(vardata->atttype, &typLen, &typByVal);
+ InitDirtySnapshot(SnapshotDirty);
/* set up an IS NOT NULL scan key so that we ignore nulls */
ScanKeyEntryInitialize(&scankeys[0],
/* If min is requested ... */
if (min)
{
- index_scan = index_beginscan(heapRel, indexRel, SnapshotNow,
+ /*
+ * In principle, we should scan the index with our current
+ * active snapshot, which is the best approximation we've got
+ * to what the query will see when executed. But that won't
+ * be exact if a new snap is taken before running the query,
+ * and it can be very expensive if a lot of uncommitted rows
+ * exist at the end of the index (because we'll laboriously
+ * fetch each one and reject it). What seems like a good
+ * compromise is to use SnapshotDirty. That will accept
+ * uncommitted rows, and thus avoid fetching multiple heap
+ * tuples in this scenario. On the other hand, it will reject
+ * known-dead rows, and thus not give a bogus answer when the
+ * extreme value has been deleted; that case motivates not
+ * using SnapshotAny here.
+ */
+ index_scan = index_beginscan(heapRel, indexRel, &SnapshotDirty,
1, 0);
index_rescan(index_scan, scankeys, 1, NULL, 0);
/* If max is requested, and we didn't find the index is empty */
if (max && have_data)
{
- index_scan = index_beginscan(heapRel, indexRel, SnapshotNow,
+ index_scan = index_beginscan(heapRel, indexRel, &SnapshotDirty,
1, 0);
index_rescan(index_scan, scankeys, 1, NULL, 0);
* Check whether char is a letter (and, hence, subject to case-folding)
*
* In multibyte character sets, we can't use isalpha, and it does not seem
- * worth trying to convert to wchar_t to use iswalpha. Instead, just assume
+ * worth trying to convert to wchar_t to use iswalpha. Instead, just assume
* any multibyte char is potentially case-varying.
*/
static int
*
* *prefix is set to a palloc'd prefix string (in the form of a Const node),
* or to NULL if no fixed prefix exists for the pattern.
- * *rest is set to a palloc'd Const representing the remainder of the pattern
- * after the portion describing the fixed prefix.
- * Each of these has the same type (TEXT or BYTEA) as the given pattern Const.
+ * If rest_selec is not NULL, *rest_selec is set to an estimate of the
+ * selectivity of the remainder of the pattern (without any fixed prefix).
+ * The prefix Const has the same type (TEXT or BYTEA) as the input pattern.
*
* The return value distinguishes no fixed prefix, a partial prefix,
* or an exact-match-only pattern.
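*
* For instance, the LIKE pattern 'abc%' produces prefix "abc" with status
* Pattern_Prefix_Partial; plain 'abc' (no wildcard) produces prefix "abc"
* with Pattern_Prefix_Exact, as does the regex '^abc$'.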
static Pattern_Prefix_Status
like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
- Const **prefix_const, Const **rest_const)
+ Const **prefix_const, Selectivity *rest_selec)
{
char *match;
char *patt;
int pattlen;
- char *rest;
Oid typeid = patt_const->consttype;
int pos,
match_pos;
}
else
{
- bytea *bstr = DatumGetByteaP(patt_const->constvalue);
+ bytea *bstr = DatumGetByteaPP(patt_const->constvalue);
- pattlen = VARSIZE(bstr) - VARHDRSZ;
+ pattlen = VARSIZE_ANY_EXHDR(bstr);
patt = (char *) palloc(pattlen);
- memcpy(patt, VARDATA(bstr), pattlen);
- if ((Pointer) bstr != DatumGetPointer(patt_const->constvalue))
- pfree(bstr);
+ memcpy(patt, VARDATA_ANY(bstr), pattlen);
+ Assert((Pointer) bstr == DatumGetPointer(patt_const->constvalue));
}
match = palloc(pattlen + 1);
}
match[match_pos] = '\0';
- rest = &patt[pos];
if (typeid != BYTEAOID)
- {
*prefix_const = string_to_const(match, typeid);
- *rest_const = string_to_const(rest, typeid);
- }
else
- {
*prefix_const = string_to_bytea_const(match, match_pos);
- *rest_const = string_to_bytea_const(rest, pattlen - pos);
- }
+
+ if (rest_selec != NULL)
+ *rest_selec = like_selectivity(&patt[pos], pattlen - pos,
+ case_insensitive);
pfree(patt);
pfree(match);
static Pattern_Prefix_Status
regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
- Const **prefix_const, Const **rest_const)
+ Const **prefix_const, Selectivity *rest_selec)
{
- char *match;
- int pos,
- match_pos,
- prev_pos,
- prev_match_pos;
- bool have_leading_paren;
- char *patt;
- char *rest;
Oid typeid = patt_const->consttype;
- bool is_multibyte = (pg_database_encoding_max_length() > 1);
- pg_locale_t locale = 0;
- bool locale_is_c = false;
+ char *prefix;
+ bool exact;
/*
* Should be unnecessary, there are no bytea regex operators defined. As
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("regular-expression matching not supported on type bytea")));
- if (case_insensitive)
- {
- /* If case-insensitive, we need locale info */
- if (lc_ctype_is_c(collation))
- locale_is_c = true;
- else if (collation != DEFAULT_COLLATION_OID)
- {
- if (!OidIsValid(collation))
- {
- /*
- * This typically means that the parser could not resolve a
- * conflict of implicit collations, so report it that way.
- */
- ereport(ERROR,
- (errcode(ERRCODE_INDETERMINATE_COLLATION),
- errmsg("could not determine which collation to use for regular expression"),
- errhint("Use the COLLATE clause to set the collation explicitly.")));
- }
- locale = pg_newlocale_from_collation(collation);
- }
- }
-
- /* the right-hand const is type text for all of these */
- patt = TextDatumGetCString(patt_const->constvalue);
-
- /*
- * Check for ARE director prefix. It's worth our trouble to recognize
- * this because similar_escape() used to use it, and some other code might
- * still use it, to force ARE mode.
- */
- pos = 0;
- if (strncmp(patt, "***:", 4) == 0)
- pos = 4;
+ /* Use the regexp machinery to extract the prefix, if any */
+ prefix = regexp_fixed_prefix(DatumGetTextPP(patt_const->constvalue),
+ case_insensitive, collation,
+ &exact);
- /* Pattern must be anchored left */
- if (patt[pos] != '^')
+ if (prefix == NULL)
{
- rest = patt;
-
*prefix_const = NULL;
- *rest_const = string_to_const(rest, typeid);
-
- return Pattern_Prefix_None;
- }
- pos++;
- /*
- * If '|' is present in pattern, then there may be multiple alternatives
- * for the start of the string. (There are cases where this isn't so, for
- * instance if the '|' is inside parens, but detecting that reliably is
- * too hard.)
- */
- if (strchr(patt + pos, '|') != NULL)
- {
- rest = patt;
+ if (rest_selec != NULL)
+ {
+ char *patt = TextDatumGetCString(patt_const->constvalue);
- *prefix_const = NULL;
- *rest_const = string_to_const(rest, typeid);
+ *rest_selec = regex_selectivity(patt, strlen(patt),
+ case_insensitive,
+ 0);
+ pfree(patt);
+ }
return Pattern_Prefix_None;
}
- /* OK, allocate space for pattern */
- match = palloc(strlen(patt) + 1);
- prev_match_pos = match_pos = 0;
+ *prefix_const = string_to_const(prefix, typeid);
- /*
- * We special-case the syntax '^(...)$' because psql uses it. But beware:
- * sequences beginning "(?" are not what they seem, unless they're "(?:".
- * (We must recognize that because of similar_escape().)
- */
- have_leading_paren = false;
- if (patt[pos] == '(' &&
- (patt[pos + 1] != '?' || patt[pos + 2] == ':'))
+ if (rest_selec != NULL)
{
- have_leading_paren = true;
- pos += (patt[pos + 1] != '?' ? 1 : 3);
- }
+ if (exact)
+ {
+ /* Exact match, so there's no additional selectivity */
+ *rest_selec = 1.0;
+ }
+ else
+ {
+ char *patt = TextDatumGetCString(patt_const->constvalue);
- /* Scan remainder of pattern */
- prev_pos = pos;
- while (patt[pos])
- {
- int len;
+ *rest_selec = regex_selectivity(patt, strlen(patt),
+ case_insensitive,
+ strlen(prefix));
+ pfree(patt);
+ }
+ }
- /*
- * Check for characters that indicate multiple possible matches here.
- * Also, drop out at ')' or '$' so the termination test works right.
- */
- if (patt[pos] == '.' ||
- patt[pos] == '(' ||
- patt[pos] == ')' ||
- patt[pos] == '[' ||
- patt[pos] == '^' ||
- patt[pos] == '$')
- break;
-
- /* Stop if case-varying character (it's sort of a wildcard) */
- if (case_insensitive &&
- pattern_char_isalpha(patt[pos], is_multibyte, locale, locale_is_c))
- break;
-
- /*
- * Check for quantifiers. Except for +, this means the preceding
- * character is optional, so we must remove it from the prefix too!
- */
- if (patt[pos] == '*' ||
- patt[pos] == '?' ||
- patt[pos] == '{')
- {
- match_pos = prev_match_pos;
- pos = prev_pos;
- break;
- }
- if (patt[pos] == '+')
- {
- pos = prev_pos;
- break;
- }
-
- /*
- * Normally, backslash quotes the next character. But in AREs,
- * backslash followed by alphanumeric is an escape, not a quoted
- * character. Must treat it as having multiple possible matches.
- * Note: since only ASCII alphanumerics are escapes, we don't have to
- * be paranoid about multibyte or collations here.
- */
- if (patt[pos] == '\\')
- {
- if (isalnum((unsigned char) patt[pos + 1]))
- break;
- pos++;
- if (patt[pos] == '\0')
- break;
- }
- /* save position in case we need to back up on next loop cycle */
- prev_match_pos = match_pos;
- prev_pos = pos;
- /* must use encoding-aware processing here */
- len = pg_mblen(&patt[pos]);
- memcpy(&match[match_pos], &patt[pos], len);
- match_pos += len;
- pos += len;
- }
-
- match[match_pos] = '\0';
- rest = &patt[pos];
-
- if (have_leading_paren && patt[pos] == ')')
- pos++;
-
- if (patt[pos] == '$' && patt[pos + 1] == '\0')
- {
- rest = &patt[pos + 1];
-
- *prefix_const = string_to_const(match, typeid);
- *rest_const = string_to_const(rest, typeid);
-
- pfree(patt);
- pfree(match);
+ pfree(prefix);
+ if (exact)
return Pattern_Prefix_Exact; /* pattern specifies exact match */
- }
-
- *prefix_const = string_to_const(match, typeid);
- *rest_const = string_to_const(rest, typeid);
-
- pfree(patt);
- pfree(match);
-
- if (match_pos > 0)
+ else
return Pattern_Prefix_Partial;
-
- return Pattern_Prefix_None;
}
Pattern_Prefix_Status
pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation,
- Const **prefix, Const **rest)
+ Const **prefix, Selectivity *rest_selec)
{
Pattern_Prefix_Status result;
switch (ptype)
{
case Pattern_Type_Like:
- result = like_fixed_prefix(patt, false, collation, prefix, rest);
+ result = like_fixed_prefix(patt, false, collation,
+ prefix, rest_selec);
break;
case Pattern_Type_Like_IC:
- result = like_fixed_prefix(patt, true, collation, prefix, rest);
+ result = like_fixed_prefix(patt, true, collation,
+ prefix, rest_selec);
break;
case Pattern_Type_Regex:
- result = regex_fixed_prefix(patt, false, collation, prefix, rest);
+ result = regex_fixed_prefix(patt, false, collation,
+ prefix, rest_selec);
break;
case Pattern_Type_Regex_IC:
- result = regex_fixed_prefix(patt, true, collation, prefix, rest);
+ result = regex_fixed_prefix(patt, true, collation,
+ prefix, rest_selec);
break;
default:
elog(ERROR, "unrecognized ptype: %d", (int) ptype);
* together with info about MCVs and NULLs.
*
* We use the >= and < operators from the specified btree opfamily to do the
- * estimation. The given variable and Const must be of the associated
+ * estimation. The given variable and Const must be of the associated
* datatype.
*
* XXX Note: we make use of the upper bound to estimate operator selectivity
/*
* Merge the two selectivities in the same way as for a range query
- * (see clauselist_selectivity()). Note that we don't need to worry
+ * (see clauselist_selectivity()). Note that we don't need to worry
* about double-exclusion of nulls, since ineq_histogram_selectivity
* doesn't count those anyway.
*/
/*
* Estimate the selectivity of a pattern of the specified type.
- * Note that any fixed prefix of the pattern will have been removed already.
+ * Note that any fixed prefix of the pattern will have been removed already,
+ * so actually we may be looking at just a fragment of the pattern.
*
* For now, we use a very simplistic approach: fixed characters reduce the
* selectivity a good deal, character ranges reduce it a little,
#define PARTIAL_WILDCARD_SEL 2.0
static Selectivity
-like_selectivity(Const *patt_const, bool case_insensitive)
+like_selectivity(const char *patt, int pattlen, bool case_insensitive)
{
Selectivity sel = 1.0;
int pos;
- Oid typeid = patt_const->consttype;
- char *patt;
- int pattlen;
-
- /* the right-hand const is type text or bytea */
- Assert(typeid == BYTEAOID || typeid == TEXTOID);
-
- if (typeid == BYTEAOID && case_insensitive)
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("case insensitive matching not supported on type bytea")));
-
- if (typeid != BYTEAOID)
- {
- patt = TextDatumGetCString(patt_const->constvalue);
- pattlen = strlen(patt);
- }
- else
- {
- bytea *bstr = DatumGetByteaP(patt_const->constvalue);
-
- pattlen = VARSIZE(bstr) - VARHDRSZ;
- patt = (char *) palloc(pattlen);
- memcpy(patt, VARDATA(bstr), pattlen);
- if ((Pointer) bstr != DatumGetPointer(patt_const->constvalue))
- pfree(bstr);
- }
/* Skip any leading wildcard; it's already factored into initial sel */
for (pos = 0; pos < pattlen; pos++)
/* Could get sel > 1 if multiple wildcards */
if (sel > 1.0)
sel = 1.0;
-
- pfree(patt);
return sel;
}
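+/*
+ * To illustrate the estimate above (a sketch, assuming the usual values of
+ * the constants: FIXED_CHAR_SEL = 0.20, ANY_CHAR_SEL = 0.9, and
+ * FULL_WILDCARD_SEL = 5.0): the LIKE fragment "a_c%" yields
+ *		sel = 0.20 * 0.9 * 0.20 * 5.0 = 0.18
+ * i.e., two fixed characters, one "_", and one trailing "%".
+ */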
static Selectivity
-regex_selectivity_sub(char *patt, int pattlen, bool case_insensitive)
+regex_selectivity_sub(const char *patt, int pattlen, bool case_insensitive)
{
Selectivity sel = 1.0;
int paren_depth = 0;
}
static Selectivity
-regex_selectivity(Const *patt_const, bool case_insensitive)
+regex_selectivity(const char *patt, int pattlen, bool case_insensitive,
+ int fixed_prefix_len)
{
Selectivity sel;
- char *patt;
- int pattlen;
- Oid typeid = patt_const->consttype;
-
- /*
- * Should be unnecessary, there are no bytea regex operators defined. As
- * such, it should be noted that the rest of this function has *not* been
- * made safe for binary (possibly NULL containing) strings.
- */
- if (typeid == BYTEAOID)
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("regular-expression matching not supported on type bytea")));
-
- /* the right-hand const is type text for all of these */
- patt = TextDatumGetCString(patt_const->constvalue);
- pattlen = strlen(patt);
/* If patt doesn't end with $, consider it to have a trailing wildcard */
if (pattlen > 0 && patt[pattlen - 1] == '$' &&
/* no trailing $ */
sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
sel *= FULL_WILDCARD_SEL;
- if (sel > 1.0)
- sel = 1.0;
}
- return sel;
-}
-static Selectivity
-pattern_selectivity(Const *patt, Pattern_Type ptype)
-{
- Selectivity result;
+ /* If there's a fixed prefix, discount its selectivity */
+ if (fixed_prefix_len > 0)
+ sel /= pow(FIXED_CHAR_SEL, fixed_prefix_len);
- switch (ptype)
- {
- case Pattern_Type_Like:
- result = like_selectivity(patt, false);
- break;
- case Pattern_Type_Like_IC:
- result = like_selectivity(patt, true);
- break;
- case Pattern_Type_Regex:
- result = regex_selectivity(patt, false);
- break;
- case Pattern_Type_Regex_IC:
- result = regex_selectivity(patt, true);
- break;
- default:
- elog(ERROR, "unrecognized ptype: %d", (int) ptype);
- result = 1.0; /* keep compiler quiet */
- break;
- }
- return result;
+ /* Make sure result stays in range */
+ CLAMP_PROBABILITY(sel);
+ return sel;
}
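+/*
+ * For example, given the regex '^foo.*bar$', regexp_fixed_prefix() extracts
+ * the prefix "foo" (not exact), and the caller passes fixed_prefix_len = 3.
+ * Dividing by pow(FIXED_CHAR_SEL, 3) cancels the prefix's contribution to
+ * sel, so that the prefix, whose selectivity is estimated separately (see
+ * prefix_selectivity()), is not double-counted.
+ */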
* that is not a bulletproof guarantee that an extension of the string might
* not sort after it; an example is that "foo " is less than "foo!", but it
* is not clear that a "dictionary" sort ordering will consider "foo!" less
- * than "foo bar". CAUTION: Therefore, this function should be used only for
+ * than "foo bar". CAUTION: Therefore, this function should be used only for
* estimation purposes when working in a non-C collation.
*
* To try to catch most cases where an extended string might otherwise sort
}
else if (datatype == BYTEAOID)
{
- bytea *bstr = DatumGetByteaP(str_const->constvalue);
+ bytea *bstr = DatumGetByteaPP(str_const->constvalue);
- len = VARSIZE(bstr) - VARHDRSZ;
+ len = VARSIZE_ANY_EXHDR(bstr);
workstr = (char *) palloc(len);
- memcpy(workstr, VARDATA(bstr), len);
- if ((Pointer) bstr != DatumGetPointer(str_const->constvalue))
- pfree(bstr);
+ memcpy(workstr, VARDATA_ANY(bstr), len);
+ Assert((Pointer) bstr == DatumGetPointer(str_const->constvalue));
cmpstr = str_const->constvalue;
}
else
*
* Index cost estimation functions
*
- * genericcostestimate is a general-purpose estimator for use when we
- * don't have any better idea about how to estimate. Index-type-specific
- * knowledge can be incorporated in the type-specific routines.
- *
- * One bit of index-type-specific knowledge we can relatively easily use
- * in genericcostestimate is the estimate of the number of index tuples
- * visited. If numIndexTuples is not 0 then it is used as the estimate,
- * otherwise we compute a generic estimate.
- *
*-------------------------------------------------------------------------
*/
-static void
+List *
+deconstruct_indexquals(IndexPath *path)
+{
+ List *result = NIL;
+ IndexOptInfo *index = path->indexinfo;
+ ListCell *lcc,
+ *lci;
+
+ forboth(lcc, path->indexquals, lci, path->indexqualcols)
+ {
+ RestrictInfo *rinfo = castNode(RestrictInfo, lfirst(lcc));
+ int indexcol = lfirst_int(lci);
+ Expr *clause;
+ Node *leftop,
+ *rightop;
+ IndexQualInfo *qinfo;
+
+ clause = rinfo->clause;
+
+ qinfo = (IndexQualInfo *) palloc(sizeof(IndexQualInfo));
+ qinfo->rinfo = rinfo;
+ qinfo->indexcol = indexcol;
+
+ if (IsA(clause, OpExpr))
+ {
+ qinfo->clause_op = ((OpExpr *) clause)->opno;
+ leftop = get_leftop(clause);
+ rightop = get_rightop(clause);
+ if (match_index_to_operand(leftop, indexcol, index))
+ {
+ qinfo->varonleft = true;
+ qinfo->other_operand = rightop;
+ }
+ else
+ {
+ Assert(match_index_to_operand(rightop, indexcol, index));
+ qinfo->varonleft = false;
+ qinfo->other_operand = leftop;
+ }
+ }
+ else if (IsA(clause, RowCompareExpr))
+ {
+ RowCompareExpr *rc = (RowCompareExpr *) clause;
+
+ qinfo->clause_op = linitial_oid(rc->opnos);
+ /* Examine only first columns to determine left/right sides */
+ if (match_index_to_operand((Node *) linitial(rc->largs),
+ indexcol, index))
+ {
+ qinfo->varonleft = true;
+ qinfo->other_operand = (Node *) rc->rargs;
+ }
+ else
+ {
+ Assert(match_index_to_operand((Node *) linitial(rc->rargs),
+ indexcol, index));
+ qinfo->varonleft = false;
+ qinfo->other_operand = (Node *) rc->largs;
+ }
+ }
+ else if (IsA(clause, ScalarArrayOpExpr))
+ {
+ ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause;
+
+ qinfo->clause_op = saop->opno;
+ /* index column is always on the left in this case */
+ Assert(match_index_to_operand((Node *) linitial(saop->args),
+ indexcol, index));
+ qinfo->varonleft = true;
+ qinfo->other_operand = (Node *) lsecond(saop->args);
+ }
+ else if (IsA(clause, NullTest))
+ {
+ qinfo->clause_op = InvalidOid;
+ Assert(match_index_to_operand((Node *) ((NullTest *) clause)->arg,
+ indexcol, index));
+ qinfo->varonleft = true;
+ qinfo->other_operand = NULL;
+ }
+ else
+ {
+ elog(ERROR, "unsupported indexqual type: %d",
+ (int) nodeTag(clause));
+ }
+
+ result = lappend(result, qinfo);
+ }
+ return result;
+}
+
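+/*
+ * As an example of the decomposition above: for an indexqual "x < 42" on
+ * the first index column, the resulting IndexQualInfo has indexcol = 0,
+ * varonleft = true, clause_op = the OID of the "<" operator for x's type,
+ * and other_operand = the Const node for 42.
+ */
+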
+/*
+ * Simple function to compute the total eval cost of the "other operands"
+ * in an IndexQualInfo list. Since we know these will be evaluated just
+ * once per scan, there's no need to distinguish startup from per-row cost.
+ */
+static Cost
+other_operands_eval_cost(PlannerInfo *root, List *qinfos)
+{
+ Cost qual_arg_cost = 0;
+ ListCell *lc;
+
+ foreach(lc, qinfos)
+ {
+ IndexQualInfo *qinfo = (IndexQualInfo *) lfirst(lc);
+ QualCost index_qual_cost;
+
+ cost_qual_eval_node(&index_qual_cost, qinfo->other_operand, root);
+ qual_arg_cost += index_qual_cost.startup + index_qual_cost.per_tuple;
+ }
+ return qual_arg_cost;
+}
+
+/*
+ * Get other-operand eval cost for an index orderby list.
+ *
+ * Index orderby expressions aren't represented as RestrictInfos (since they
+ * aren't boolean, usually). So we can't apply deconstruct_indexquals to
+ * them. However, they are much simpler to deal with since they are always
+ * OpExprs and the index column is always on the left.
+ */
+static Cost
+orderby_operands_eval_cost(PlannerInfo *root, IndexPath *path)
+{
+ Cost qual_arg_cost = 0;
+ ListCell *lc;
+
+ foreach(lc, path->indexorderbys)
+ {
+ Expr *clause = (Expr *) lfirst(lc);
+ Node *other_operand;
+ QualCost index_qual_cost;
+
+ if (IsA(clause, OpExpr))
+ {
+ other_operand = get_rightop(clause);
+ }
+ else
+ {
+ elog(ERROR, "unsupported indexorderby type: %d",
+ (int) nodeTag(clause));
+ other_operand = NULL; /* keep compiler quiet */
+ }
+
+ cost_qual_eval_node(&index_qual_cost, other_operand, root);
+ qual_arg_cost += index_qual_cost.startup + index_qual_cost.per_tuple;
+ }
+ return qual_arg_cost;
+}
+
+void
genericcostestimate(PlannerInfo *root,
IndexPath *path,
- RelOptInfo *outer_rel,
- double numIndexTuples,
- Cost *indexStartupCost,
- Cost *indexTotalCost,
- Selectivity *indexSelectivity,
- double *indexCorrelation)
+ double loop_count,
+ List *qinfos,
+ GenericCosts *costs)
{
IndexOptInfo *index = path->indexinfo;
List *indexQuals = path->indexquals;
List *indexOrderBys = path->indexorderbys;
+ Cost indexStartupCost;
+ Cost indexTotalCost;
+ Selectivity indexSelectivity;
+ double indexCorrelation;
double numIndexPages;
+ double numIndexTuples;
+ double spc_random_page_cost;
double num_sa_scans;
double num_outer_scans;
double num_scans;
- QualCost index_qual_cost;
double qual_op_cost;
double qual_arg_cost;
- double spc_random_page_cost;
List *selectivityQuals;
ListCell *l;
- /*----------
+ /*
* If the index is partial, AND the index predicate with the explicitly
* given indexquals to produce a more accurate idea of the index
- * selectivity. However, we need to be careful not to insert redundant
- * clauses, because clauselist_selectivity() is easily fooled into
- * computing a too-low selectivity estimate. Our approach is to add
- * only the index predicate clause(s) that cannot be proven to be implied
- * by the given indexquals. This successfully handles cases such as a
- * qual "x = 42" used with a partial index "WHERE x >= 40 AND x < 50".
- * There are many other cases where we won't detect redundancy, leading
- * to a too-low selectivity estimate, which will bias the system in favor
- * of using partial indexes where possible. That is not necessarily bad
- * though.
- *
- * Note that indexQuals contains RestrictInfo nodes while the indpred
- * does not. This is OK for both predicate_implied_by() and
- * clauselist_selectivity().
- *----------
+ * selectivity.
*/
- if (index->indpred != NIL)
- {
- List *predExtraQuals = NIL;
-
- foreach(l, index->indpred)
- {
- Node *predQual = (Node *) lfirst(l);
- List *oneQual = list_make1(predQual);
-
- if (!predicate_implied_by(oneQual, indexQuals))
- predExtraQuals = list_concat(predExtraQuals, oneQual);
- }
- /* list_concat avoids modifying the passed-in indexQuals list */
- selectivityQuals = list_concat(predExtraQuals, indexQuals);
- }
- else
- selectivityQuals = indexQuals;
+ selectivityQuals = add_predicate_to_quals(index, indexQuals);
/*
* Check for ScalarArrayOpExpr index quals, and estimate the number of
}
/* Estimate the fraction of main-table tuples that will be visited */
- *indexSelectivity = clauselist_selectivity(root, selectivityQuals,
- index->rel->relid,
- JOIN_INNER,
- NULL);
+ indexSelectivity = clauselist_selectivity(root, selectivityQuals,
+ index->rel->relid,
+ JOIN_INNER,
+ NULL);
/*
* If caller didn't give us an estimate, estimate the number of index
* tuples that will be visited. We do it in this rather peculiar-looking
* way in order to get the right answer for partial indexes.
*/
+ numIndexTuples = costs->numIndexTuples;
if (numIndexTuples <= 0.0)
{
- numIndexTuples = *indexSelectivity * index->rel->tuples;
+ numIndexTuples = indexSelectivity * index->rel->tuples;
/*
* The above calculation counts all the tuples visited across all
*
* We use the simplistic method of taking a pro-rata fraction of the total
* number of index pages. In effect, this counts only leaf pages and not
- * any overhead such as index metapage or upper tree levels. In practice
- * this seems a better approximation than charging for access to the upper
- * levels, perhaps because those tend to stay in cache under load.
+ * any overhead such as index metapage or upper tree levels.
+ *
+ * In practice access to upper index levels is often nearly free because
+ * those tend to stay in cache under load; moreover, the cost involved is
+ * highly dependent on index type. We therefore ignore such costs here
+ * and leave it to the caller to add a suitable charge if needed.
*/
if (index->pages > 1 && index->tuples > 1)
numIndexPages = ceil(numIndexTuples * index->pages / index->tuples);
else
numIndexPages = 1.0;
- /* fetch estimated page cost for schema containing index */
+ /* fetch estimated page cost for tablespace containing index */
get_tablespace_page_costs(index->reltablespace,
&spc_random_page_cost,
NULL);
* The above calculations are all per-index-scan. However, if we are in a
* nestloop inner scan, we can expect the scan to be repeated (with
* different search keys) for each row of the outer relation. Likewise,
- * ScalarArrayOpExpr quals result in multiple index scans. This creates
+ * ScalarArrayOpExpr quals result in multiple index scans. This creates
* the potential for cache effects to reduce the number of disk page
- * fetches needed. We want to estimate the average per-scan I/O cost in
+ * fetches needed. We want to estimate the average per-scan I/O cost in
* the presence of caching.
*
* We use the Mackert-Lohman formula (see costsize.c for details) to
* Note that we are counting pages not tuples anymore, so we take N = T =
* index size, as if there were one "tuple" per page.
*/
- if (outer_rel != NULL && outer_rel->rows > 1)
- {
- num_outer_scans = outer_rel->rows;
- num_scans = num_sa_scans * num_outer_scans;
- }
- else
- {
- num_outer_scans = 1;
- num_scans = num_sa_scans;
- }
+ num_outer_scans = loop_count;
+ num_scans = num_sa_scans * num_outer_scans;
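+	/*
+	 * For instance, an inner indexscan repeated for loop_count = 10 outer
+	 * rows, with a ScalarArrayOpExpr inducing num_sa_scans = 3, is treated
+	 * as num_scans = 30 for the purposes of this caching adjustment.
+	 */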
if (num_scans > 1)
{
* share for each outer scan. (Don't pro-rate for ScalarArrayOpExpr,
* since that's internal to the indexscan.)
*/
- *indexTotalCost = (pages_fetched * spc_random_page_cost)
+ indexTotalCost = (pages_fetched * spc_random_page_cost)
/ num_outer_scans;
}
else
* For a single index scan, we just charge spc_random_page_cost per
* page touched.
*/
- *indexTotalCost = numIndexPages * spc_random_page_cost;
+ indexTotalCost = numIndexPages * spc_random_page_cost;
}
- /*
- * A difficulty with the leaf-pages-only cost approach is that for small
- * selectivities (eg, single index tuple fetched) all indexes will look
- * equally attractive because we will estimate exactly 1 leaf page to be
- * fetched. All else being equal, we should prefer physically smaller
- * indexes over larger ones. (An index might be smaller because it is
- * partial or because it contains fewer columns; presumably the other
- * columns in the larger index aren't useful to the query, or the larger
- * index would have better selectivity.)
- *
- * We can deal with this by adding a very small "fudge factor" that
- * depends on the index size. The fudge factor used here is one
- * spc_random_page_cost per 100000 index pages, which should be small
- * enough to not alter index-vs-seqscan decisions, but will prevent
- * indexes of different sizes from looking exactly equally attractive.
- */
- *indexTotalCost += index->pages * spc_random_page_cost / 100000.0;
-
/*
* CPU cost: any complex expressions in the indexquals will need to be
* evaluated once at the start of the scan to reduce them to runtime keys
* to pass to the index AM (see nodeIndexscan.c). We model the per-tuple
* CPU costs as cpu_index_tuple_cost plus one cpu_operator_cost per
- * indexqual operator. Because we have numIndexTuples as a per-scan
+ * indexqual operator. Because we have numIndexTuples as a per-scan
* number, we have to multiply by num_sa_scans to get the correct result
* for ScalarArrayOpExpr cases. Similarly add in costs for any index
* ORDER BY expressions.
*
- * Note: this neglects the possible costs of rechecking lossy operators
- * and OR-clause expressions. Detecting that that might be needed seems
- * more expensive than it's worth, though, considering all the other
- * inaccuracies here ...
+ * Note: this neglects the possible costs of rechecking lossy operators.
+ * Detecting that that might be needed seems more expensive than it's
+ * worth, though, considering all the other inaccuracies here ...
*/
- cost_qual_eval(&index_qual_cost, indexQuals, root);
- qual_arg_cost = index_qual_cost.startup + index_qual_cost.per_tuple;
- cost_qual_eval(&index_qual_cost, indexOrderBys, root);
- qual_arg_cost += index_qual_cost.startup + index_qual_cost.per_tuple;
+ qual_arg_cost = other_operands_eval_cost(root, qinfos) +
+ orderby_operands_eval_cost(root, path);
qual_op_cost = cpu_operator_cost *
(list_length(indexQuals) + list_length(indexOrderBys));
- qual_arg_cost -= qual_op_cost;
- if (qual_arg_cost < 0) /* just in case... */
- qual_arg_cost = 0;
- *indexStartupCost = qual_arg_cost;
- *indexTotalCost += qual_arg_cost;
- *indexTotalCost += numIndexTuples * num_sa_scans * (cpu_index_tuple_cost + qual_op_cost);
+ indexStartupCost = qual_arg_cost;
+ indexTotalCost += qual_arg_cost;
+ indexTotalCost += numIndexTuples * num_sa_scans * (cpu_index_tuple_cost + qual_op_cost);
/*
- * We also add a CPU-cost component to represent the general costs of
- * starting an indexscan, such as analysis of btree index keys and initial
- * tree descent. This is estimated at 100x cpu_operator_cost, which is a
- * bit arbitrary but seems the right order of magnitude. (As noted above,
- * we don't charge any I/O for touching upper tree levels, but charging
- * nothing at all has been found too optimistic.)
- *
- * Although this is startup cost with respect to any one scan, we add it
- * to the "total" cost component because it's only very interesting in the
- * many-ScalarArrayOpExpr-scan case, and there it will be paid over the
- * life of the scan node.
+ * Generic assumption about index correlation: there isn't any.
*/
- *indexTotalCost += num_sa_scans * 100.0 * cpu_operator_cost;
+ indexCorrelation = 0.0;
/*
- * Generic assumption about index correlation: there isn't any.
+ * Return everything to caller.
*/
- *indexCorrelation = 0.0;
+ costs->indexStartupCost = indexStartupCost;
+ costs->indexTotalCost = indexTotalCost;
+ costs->indexSelectivity = indexSelectivity;
+ costs->indexCorrelation = indexCorrelation;
+ costs->numIndexPages = numIndexPages;
+ costs->numIndexTuples = numIndexTuples;
+ costs->spc_random_page_cost = spc_random_page_cost;
+ costs->num_sa_scans = num_sa_scans;
}
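+/*
+ * A rough worked example of the I/O estimate above: with a single scan
+ * (num_scans = 1) of an index having pages = 1000 and tuples = 100000,
+ * a qual expected to visit numIndexTuples = 500 gives
+ *		numIndexPages = ceil(500 * 1000 / 100000) = 5
+ * so indexTotalCost starts at 5 * spc_random_page_cost before the CPU
+ * charges are added.
+ */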
+/*
+ * If the index is partial, add its predicate to the given qual list.
+ *
+ * ANDing the index predicate with the explicitly given indexquals produces
+ * a more accurate idea of the index's selectivity. However, we need to be
+ * careful not to insert redundant clauses, because clauselist_selectivity()
+ * is easily fooled into computing a too-low selectivity estimate. Our
+ * approach is to add only the predicate clause(s) that cannot be proven to
+ * be implied by the given indexquals. This successfully handles cases such
+ * as a qual "x = 42" used with a partial index "WHERE x >= 40 AND x < 50".
+ * There are many other cases where we won't detect redundancy, leading to a
+ * too-low selectivity estimate, which will bias the system in favor of using
+ * partial indexes where possible. That is not necessarily bad though.
+ *
+ * Note that indexQuals contains RestrictInfo nodes while the indpred
+ * does not, so the output list will be mixed. This is OK for both
+ * predicate_implied_by() and clauselist_selectivity(), but might be
+ * problematic if the result were passed to other things.
+ */
+static List *
+add_predicate_to_quals(IndexOptInfo *index, List *indexQuals)
+{
+ List *predExtraQuals = NIL;
+ ListCell *lc;
+	if (index->indpred == NIL)
+		return indexQuals;
+
+	foreach(lc, index->indpred)
+	{
+		Node	   *predQual = (Node *) lfirst(lc);
+		List	   *oneQual = list_make1(predQual);
+
+		if (!predicate_implied_by(oneQual, indexQuals))
+			predExtraQuals = list_concat(predExtraQuals, oneQual);
+	}
+	/* list_concat avoids modifying the passed-in indexQuals list */
+	return list_concat(predExtraQuals, indexQuals);
+}
+
+
-Datum
-btcostestimate(PG_FUNCTION_ARGS)
+void
+btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
+ Cost *indexStartupCost, Cost *indexTotalCost,
+ Selectivity *indexSelectivity, double *indexCorrelation,
+ double *indexPages)
{
- PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
- IndexPath *path = (IndexPath *) PG_GETARG_POINTER(1);
- RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(2);
- Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
- Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
- Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
- double *indexCorrelation = (double *) PG_GETARG_POINTER(6);
IndexOptInfo *index = path->indexinfo;
+ List *qinfos;
+ GenericCosts costs;
Oid relid;
AttrNumber colnum;
VariableStatData vardata;
double numIndexTuples;
+ Cost descentCost;
List *indexBoundQuals;
int indexcol;
bool eqQualHere;
bool found_saop;
bool found_is_null_op;
double num_sa_scans;
- ListCell *lcc,
- *lci;
+ ListCell *lc;
+
+ /* Do preliminary analysis of indexquals */
+ qinfos = deconstruct_indexquals(path);
/*
* For a btree scan, only leading '=' quals plus inequality quals for the
found_saop = false;
found_is_null_op = false;
num_sa_scans = 1;
- forboth(lcc, path->indexquals, lci, path->indexqualcols)
+ foreach(lc, qinfos)
{
- RestrictInfo *rinfo = (RestrictInfo *) lfirst(lcc);
- Expr *clause;
- Node *leftop,
- *rightop;
+ IndexQualInfo *qinfo = (IndexQualInfo *) lfirst(lc);
+ RestrictInfo *rinfo = qinfo->rinfo;
+ Expr *clause = rinfo->clause;
Oid clause_op;
int op_strategy;
- bool is_null_op = false;
- if (indexcol != lfirst_int(lci))
+ if (indexcol != qinfo->indexcol)
{
/* Beginning of a new column's quals */
if (!eqQualHere)
break; /* done if no '=' qual for indexcol */
eqQualHere = false;
indexcol++;
- if (indexcol != lfirst_int(lci))
+ if (indexcol != qinfo->indexcol)
break; /* no quals at all for indexcol */
}
- Assert(IsA(rinfo, RestrictInfo));
- clause = rinfo->clause;
-
- if (IsA(clause, OpExpr))
- {
- leftop = get_leftop(clause);
- rightop = get_rightop(clause);
- clause_op = ((OpExpr *) clause)->opno;
- }
- else if (IsA(clause, RowCompareExpr))
- {
- RowCompareExpr *rc = (RowCompareExpr *) clause;
-
- leftop = (Node *) linitial(rc->largs);
- rightop = (Node *) linitial(rc->rargs);
- clause_op = linitial_oid(rc->opnos);
- }
- else if (IsA(clause, ScalarArrayOpExpr))
+ if (IsA(clause, ScalarArrayOpExpr))
{
- ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause;
+ int alength = estimate_array_length(qinfo->other_operand);
- leftop = (Node *) linitial(saop->args);
- rightop = (Node *) lsecond(saop->args);
- clause_op = saop->opno;
found_saop = true;
+ /* count up number of SA scans induced by indexBoundQuals only */
+ if (alength > 1)
+ num_sa_scans *= alength;
}
else if (IsA(clause, NullTest))
{
NullTest *nt = (NullTest *) clause;
- leftop = (Node *) nt->arg;
- rightop = NULL;
- clause_op = InvalidOid;
if (nt->nulltesttype == IS_NULL)
{
found_is_null_op = true;
- is_null_op = true;
+ /* IS NULL is like = for selectivity determination purposes */
+ eqQualHere = true;
}
}
- else
- {
- elog(ERROR, "unsupported indexqual type: %d",
- (int) nodeTag(clause));
- continue; /* keep compiler quiet */
- }
- if (match_index_to_operand(leftop, indexcol, index))
- {
- /* clause_op is correct */
- }
- else
- {
- Assert(match_index_to_operand(rightop, indexcol, index));
- /* Must flip operator to get the opfamily member */
- clause_op = get_commutator(clause_op);
- }
+ /*
+ * We would need to commute the clause_op if not varonleft, except
+ * that we only care if it's equality or not, so that refinement is
+ * unnecessary.
+ */
+ clause_op = qinfo->clause_op;
/* check for equality operator */
if (OidIsValid(clause_op))
if (op_strategy == BTEqualStrategyNumber)
eqQualHere = true;
}
- else if (is_null_op)
- {
- /* IS NULL is like = for purposes of selectivity determination */
- eqQualHere = true;
- }
- /* count up number of SA scans induced by indexBoundQuals only */
- if (IsA(clause, ScalarArrayOpExpr))
- {
- ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause;
- int alength = estimate_array_length(lsecond(saop->args));
- if (alength > 1)
- num_sa_scans *= alength;
- }
indexBoundQuals = lappend(indexBoundQuals, rinfo);
}
numIndexTuples = 1.0;
else
{
+ List *selectivityQuals;
Selectivity btreeSelectivity;
- btreeSelectivity = clauselist_selectivity(root, indexBoundQuals,
+ /*
+ * If the index is partial, AND the index predicate with the
+ * index-bound quals to produce a more accurate idea of the number of
+ * rows covered by the bound conditions.
+ */
+ selectivityQuals = add_predicate_to_quals(index, indexBoundQuals);
+
+ btreeSelectivity = clauselist_selectivity(root, selectivityQuals,
index->rel->relid,
JOIN_INNER,
NULL);
numIndexTuples = rint(numIndexTuples / num_sa_scans);
}
- genericcostestimate(root, path, outer_rel,
- numIndexTuples,
- indexStartupCost, indexTotalCost,
- indexSelectivity, indexCorrelation);
+ /*
+ * Now do generic index cost estimation.
+ */
+ MemSet(&costs, 0, sizeof(costs));
+ costs.numIndexTuples = numIndexTuples;
+
+ genericcostestimate(root, path, loop_count, qinfos, &costs);
+
+ /*
+ * Add a CPU-cost component to represent the costs of initial btree
+ * descent. We don't charge any I/O cost for touching upper btree levels,
+ * since they tend to stay in cache, but we still have to do about log2(N)
+ * comparisons to descend a btree of N leaf tuples. We charge one
+ * cpu_operator_cost per comparison.
+ *
+ * If there are ScalarArrayOpExprs, charge this once per SA scan. The
+ * ones after the first one are not startup cost so far as the overall
+ * plan is concerned, so add them only to "total" cost.
+ */
+ if (index->tuples > 1) /* avoid computing log(0) */
+ {
+ descentCost = ceil(log(index->tuples) / log(2.0)) * cpu_operator_cost;
+ costs.indexStartupCost += descentCost;
+ costs.indexTotalCost += costs.num_sa_scans * descentCost;
+ }
+
+ /*
+ * Even though we're not charging I/O cost for touching upper btree pages,
+ * it's still reasonable to charge some CPU cost per page descended
+ * through. Moreover, if we had no such charge at all, bloated indexes
+ * would appear to have the same search cost as unbloated ones, at least
+ * in cases where only a single leaf page is expected to be visited. This
+ * cost is somewhat arbitrarily set at 50x cpu_operator_cost per page
+ * touched. The number of such pages is btree tree height plus one (ie,
+ * we charge for the leaf page too). As above, charge once per SA scan.
+ */
+ descentCost = (index->tree_height + 1) * 50.0 * cpu_operator_cost;
+ costs.indexStartupCost += descentCost;
+ costs.indexTotalCost += costs.num_sa_scans * descentCost;
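+	/*
+	 * To make the descent charges concrete: for a btree with 1,000,000
+	 * tuples and tree_height = 2, the comparison charge is
+	 * ceil(log2(1000000)) = 20 cpu_operator_costs and the per-page charge
+	 * is (2 + 1) * 50 = 150 cpu_operator_costs, each applied once per SA
+	 * scan.
+	 */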
/*
* If we can get an estimate of the first column's ordering correlation C
varCorrelation = -varCorrelation;
if (index->ncolumns > 1)
- *indexCorrelation = varCorrelation * 0.75;
+ costs.indexCorrelation = varCorrelation * 0.75;
else
- *indexCorrelation = varCorrelation;
+ costs.indexCorrelation = varCorrelation;
free_attstatsslot(InvalidOid, NULL, 0, numbers, nnumbers);
}
ReleaseVariableStats(vardata);
- PG_RETURN_VOID();
+ *indexStartupCost = costs.indexStartupCost;
+ *indexTotalCost = costs.indexTotalCost;
+ *indexSelectivity = costs.indexSelectivity;
+ *indexCorrelation = costs.indexCorrelation;
+ *indexPages = costs.numIndexPages;
}
-Datum
-hashcostestimate(PG_FUNCTION_ARGS)
+void
+hashcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
+ Cost *indexStartupCost, Cost *indexTotalCost,
+ Selectivity *indexSelectivity, double *indexCorrelation,
+ double *indexPages)
{
- PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
- IndexPath *path = (IndexPath *) PG_GETARG_POINTER(1);
- RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(2);
- Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
- Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
- Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
- double *indexCorrelation = (double *) PG_GETARG_POINTER(6);
-
- genericcostestimate(root, path, outer_rel, 0.0,
- indexStartupCost, indexTotalCost,
- indexSelectivity, indexCorrelation);
-
- PG_RETURN_VOID();
+ List *qinfos;
+ GenericCosts costs;
+
+ /* Do preliminary analysis of indexquals */
+ qinfos = deconstruct_indexquals(path);
+
+ MemSet(&costs, 0, sizeof(costs));
+
+ genericcostestimate(root, path, loop_count, qinfos, &costs);
+
+ /*
+ * A hash index has no descent costs as such, since the index AM can go
+ * directly to the target bucket after computing the hash value. There
+ * are a couple of other hash-specific costs that we could conceivably add
+ * here, though:
+ *
+ * Ideally we'd charge spc_random_page_cost for each page in the target
+ * bucket, not just the numIndexPages pages that genericcostestimate
+ * thought we'd visit. However in most cases we don't know which bucket
+ * that will be. There's no point in considering the average bucket size
+ * because the hash AM makes sure that's always one page.
+ *
+ * Likewise, we could consider charging some CPU for each index tuple in
+ * the bucket, if we knew how many there were. But the per-tuple cost is
+ * just a hash value comparison, not a general datatype-dependent
+ * comparison, so any such charge ought to be quite a bit less than
+ * cpu_operator_cost; which makes it probably not worth worrying about.
+ *
+ * A bigger issue is that chance hash-value collisions will result in
+ * wasted probes into the heap. We don't currently attempt to model this
+ * cost on the grounds that it's rare, but maybe it's not rare enough.
+ * (Any fix for this ought to consider the generic lossy-operator problem,
+ * though; it's not entirely hash-specific.)
+ */
+
+ *indexStartupCost = costs.indexStartupCost;
+ *indexTotalCost = costs.indexTotalCost;
+ *indexSelectivity = costs.indexSelectivity;
+ *indexCorrelation = costs.indexCorrelation;
+ *indexPages = costs.numIndexPages;
}
-Datum
-gistcostestimate(PG_FUNCTION_ARGS)
+void
+gistcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
+ Cost *indexStartupCost, Cost *indexTotalCost,
+ Selectivity *indexSelectivity, double *indexCorrelation,
+ double *indexPages)
{
- PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
- IndexPath *path = (IndexPath *) PG_GETARG_POINTER(1);
- RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(2);
- Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
- Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
- Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
- double *indexCorrelation = (double *) PG_GETARG_POINTER(6);
-
- genericcostestimate(root, path, outer_rel, 0.0,
- indexStartupCost, indexTotalCost,
- indexSelectivity, indexCorrelation);
-
- PG_RETURN_VOID();
+ IndexOptInfo *index = path->indexinfo;
+ List *qinfos;
+ GenericCosts costs;
+ Cost descentCost;
+
+ /* Do preliminary analysis of indexquals */
+ qinfos = deconstruct_indexquals(path);
+
+ MemSet(&costs, 0, sizeof(costs));
+
+ genericcostestimate(root, path, loop_count, qinfos, &costs);
+
+ /*
+ * We model index descent costs similarly to those for btree, but to do
+ * that we first need an idea of the tree height. We somewhat arbitrarily
+ * assume that the fanout is 100, meaning the tree height is at most
+ * log100(index->pages).
+ *
+ * Although this computation isn't really expensive enough to require
+ * caching, we might as well use index->tree_height to cache it.
+ */
+ if (index->tree_height < 0) /* unknown? */
+ {
+ if (index->pages > 1) /* avoid computing log(0) */
+ index->tree_height = (int) (log(index->pages) / log(100.0));
+ else
+ index->tree_height = 0;
+ }
+
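+	/*
+	 * For example, with the assumed fanout of 100, a 50000-page index gets
+	 * tree_height = (int) (log(50000) / log(100.0)) = 2.
+	 */
+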
+ /*
+ * Add a CPU-cost component to represent the costs of initial descent. We
+ * just use log(N) here not log2(N) since the branching factor isn't
+ * necessarily two anyway. As for btree, charge once per SA scan.
+ */
+ if (index->tuples > 1) /* avoid computing log(0) */
+ {
+ descentCost = ceil(log(index->tuples)) * cpu_operator_cost;
+ costs.indexStartupCost += descentCost;
+ costs.indexTotalCost += costs.num_sa_scans * descentCost;
+ }
+
+ /*
+ * Likewise add a per-page charge, calculated the same as for btrees.
+ */
+ descentCost = (index->tree_height + 1) * 50.0 * cpu_operator_cost;
+ costs.indexStartupCost += descentCost;
+ costs.indexTotalCost += costs.num_sa_scans * descentCost;
+
+ *indexStartupCost = costs.indexStartupCost;
+ *indexTotalCost = costs.indexTotalCost;
+ *indexSelectivity = costs.indexSelectivity;
+ *indexCorrelation = costs.indexCorrelation;
+ *indexPages = costs.numIndexPages;
}
-Datum
-spgcostestimate(PG_FUNCTION_ARGS)
+void
+spgcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
+ Cost *indexStartupCost, Cost *indexTotalCost,
+ Selectivity *indexSelectivity, double *indexCorrelation,
+ double *indexPages)
{
- PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
- IndexPath *path = (IndexPath *) PG_GETARG_POINTER(1);
- RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(2);
- Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
- Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
- Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
- double *indexCorrelation = (double *) PG_GETARG_POINTER(6);
-
- genericcostestimate(root, path, outer_rel, 0.0,
- indexStartupCost, indexTotalCost,
- indexSelectivity, indexCorrelation);
-
- PG_RETURN_VOID();
+ IndexOptInfo *index = path->indexinfo;
+ List *qinfos;
+ GenericCosts costs;
+ Cost descentCost;
+
+ /* Do preliminary analysis of indexquals */
+ qinfos = deconstruct_indexquals(path);
+
+ MemSet(&costs, 0, sizeof(costs));
+
+ genericcostestimate(root, path, loop_count, qinfos, &costs);
+
+ /*
+ * We model index descent costs similarly to those for btree, but to do
+ * that we first need an idea of the tree height. We somewhat arbitrarily
+ * assume that the fanout is 100, meaning the tree height is at most
+ * log100(index->pages).
+ *
+ * Although this computation isn't really expensive enough to require
+ * caching, we might as well use index->tree_height to cache it.
+ */
+ if (index->tree_height < 0) /* unknown? */
+ {
+ if (index->pages > 1) /* avoid computing log(0) */
+ index->tree_height = (int) (log(index->pages) / log(100.0));
+ else
+ index->tree_height = 0;
+ }
+
+ /*
+ * Add a CPU-cost component to represent the costs of initial descent. We
+ * just use log(N) here not log2(N) since the branching factor isn't
+ * necessarily two anyway. As for btree, charge once per SA scan.
+ */
+ if (index->tuples > 1) /* avoid computing log(0) */
+ {
+ descentCost = ceil(log(index->tuples)) * cpu_operator_cost;
+ costs.indexStartupCost += descentCost;
+ costs.indexTotalCost += costs.num_sa_scans * descentCost;
+ }
+
+ /*
+ * Likewise add a per-page charge, calculated the same as for btrees.
+ */
+ descentCost = (index->tree_height + 1) * 50.0 * cpu_operator_cost;
+ costs.indexStartupCost += descentCost;
+ costs.indexTotalCost += costs.num_sa_scans * descentCost;
+
+ *indexStartupCost = costs.indexStartupCost;
+ *indexTotalCost = costs.indexTotalCost;
+ *indexSelectivity = costs.indexSelectivity;
+ *indexCorrelation = costs.indexCorrelation;
+ *indexPages = costs.numIndexPages;
}
double arrayScans;
} GinQualCounts;
-/* Find the index column matching "op"; return its index, or -1 if no match */
-static int
-find_index_column(Node *op, IndexOptInfo *index)
-{
- int i;
-
- for (i = 0; i < index->ncolumns; i++)
- {
- if (match_index_to_operand(op, i, index))
- return i;
- }
-
- return -1;
-}
-
/*
* Estimate the number of index terms that need to be searched for while
* testing the given GIN query, and increment the counts in *counts
GinQualCounts *counts)
{
Oid extractProcOid;
+ Oid collation;
int strategy_op;
Oid lefttype,
righttype;
int32 i;
/*
- * Get the operator's strategy number and declared input data types
- * within the index opfamily. (We don't need the latter, but we use
- * get_op_opfamily_properties because it will throw error if it fails
- * to find a matching pg_amop entry.)
+ * Get the operator's strategy number and declared input data types within
+ * the index opfamily. (We don't need the latter, but we use
+ * get_op_opfamily_properties because it will throw error if it fails to
+ * find a matching pg_amop entry.)
*/
get_op_opfamily_properties(clause_op, index->opfamily[indexcol], false,
&strategy_op, &lefttype, &righttype);
/*
- * GIN always uses the "default" support functions, which are those
- * with lefttype == righttype == the opclass' opcintype (see
+ * GIN always uses the "default" support functions, which are those with
+ * lefttype == righttype == the opclass' opcintype (see
* IndexSupportInitialize in relcache.c).
*/
extractProcOid = get_opfamily_proc(index->opfamily[indexcol],
get_rel_name(index->indexoid));
}
- OidFunctionCall7(extractProcOid,
- query,
- PointerGetDatum(&nentries),
- UInt16GetDatum(strategy_op),
- PointerGetDatum(&partial_matches),
- PointerGetDatum(&extra_data),
- PointerGetDatum(&nullFlags),
- PointerGetDatum(&searchMode));
+ /*
+ * Choose collation to pass to extractProc (should match initGinState).
+ */
+ if (OidIsValid(index->indexcollations[indexcol]))
+ collation = index->indexcollations[indexcol];
+ else
+ collation = DEFAULT_COLLATION_OID;
+
+ OidFunctionCall7Coll(extractProcOid,
+ collation,
+ query,
+ PointerGetDatum(&nentries),
+ UInt16GetDatum(strategy_op),
+ PointerGetDatum(&partial_matches),
+ PointerGetDatum(&extra_data),
+ PointerGetDatum(&nullFlags),
+ PointerGetDatum(&searchMode));
if (nentries <= 0 && searchMode == GIN_SEARCH_MODE_DEFAULT)
{
* appropriately. If the query is unsatisfiable, return false.
*/
static bool
-gincost_opexpr(IndexOptInfo *index, OpExpr *clause, GinQualCounts *counts)
+gincost_opexpr(PlannerInfo *root,
+ IndexOptInfo *index,
+ IndexQualInfo *qinfo,
+ GinQualCounts *counts)
{
- Node *leftop = get_leftop((Expr *) clause);
- Node *rightop = get_rightop((Expr *) clause);
- Oid clause_op = clause->opno;
- int indexcol;
- Node *operand;
+ int indexcol = qinfo->indexcol;
+ Oid clause_op = qinfo->clause_op;
+ Node *operand = qinfo->other_operand;
- /* Locate the operand being compared to the index column */
- if ((indexcol = find_index_column(leftop, index)) >= 0)
- {
- operand = rightop;
- }
- else if ((indexcol = find_index_column(rightop, index)) >= 0)
+ if (!qinfo->varonleft)
{
- operand = leftop;
+ /* must commute the operator */
clause_op = get_commutator(clause_op);
}
- else
- {
- elog(ERROR, "could not match index to operand");
- operand = NULL; /* keep compiler quiet */
- }
+
+ /* aggressively reduce to a constant, and look through relabeling */
+ operand = estimate_expression_value(root, operand);
if (IsA(operand, RelabelType))
operand = (Node *) ((RelabelType *) operand)->arg;
/*
* It's impossible to call extractQuery method for unknown operand. So
- * unless operand is a Const we can't do much; just assume there will
- * be one ordinary search entry from the operand at runtime.
+ * unless operand is a Const we can't do much; just assume there will be
+ * one ordinary search entry from the operand at runtime.
*/
if (!IsA(operand, Const))
{
* by N, causing gincostestimate to scale up its estimates accordingly.
*/
static bool
-gincost_scalararrayopexpr(IndexOptInfo *index, ScalarArrayOpExpr *clause,
+gincost_scalararrayopexpr(PlannerInfo *root,
+ IndexOptInfo *index,
+ IndexQualInfo *qinfo,
double numIndexEntries,
GinQualCounts *counts)
{
- Node *leftop = (Node *) linitial(clause->args);
- Node *rightop = (Node *) lsecond(clause->args);
- Oid clause_op = clause->opno;
- int indexcol;
+ int indexcol = qinfo->indexcol;
+ Oid clause_op = qinfo->clause_op;
+ Node *rightop = qinfo->other_operand;
ArrayType *arrayval;
int16 elmlen;
bool elmbyval;
int numPossible = 0;
int i;
- Assert(clause->useOr);
+ Assert(((ScalarArrayOpExpr *) qinfo->rinfo->clause)->useOr);
- /* index column must be on the left */
- if ((indexcol = find_index_column(leftop, index)) < 0)
- elog(ERROR, "could not match index to operand");
+ /* aggressively reduce to a constant, and look through relabeling */
+ rightop = estimate_expression_value(root, rightop);
if (IsA(rightop, RelabelType))
rightop = (Node *) ((RelabelType *) rightop)->arg;
/*
* It's impossible to call extractQuery method for unknown operand. So
- * unless operand is a Const we can't do much; just assume there will
- * be one ordinary search entry from each array entry at runtime, and
- * fall back on a probably-bad estimate of the number of array entries.
+ * unless operand is a Const we can't do much; just assume there will be
+ * one ordinary search entry from each array entry at runtime, and fall
+ * back on a probably-bad estimate of the number of array entries.
*/
if (!IsA(rightop, Const))
{
/*
* GIN has search behavior completely different from other index types
*/
-Datum
-gincostestimate(PG_FUNCTION_ARGS)
+void
+gincostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
+ Cost *indexStartupCost, Cost *indexTotalCost,
+ Selectivity *indexSelectivity, double *indexCorrelation,
+ double *indexPages)
{
- PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
- IndexPath *path = (IndexPath *) PG_GETARG_POINTER(1);
- RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(2);
- Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
- Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
- Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
- double *indexCorrelation = (double *) PG_GETARG_POINTER(6);
IndexOptInfo *index = path->indexinfo;
List *indexQuals = path->indexquals;
List *indexOrderBys = path->indexorderbys;
+ List *qinfos;
ListCell *l;
List *selectivityQuals;
double numPages = index->pages,
numEntries;
GinQualCounts counts;
bool matchPossible;
+ double partialScale;
double entryPagesFetched,
dataPagesFetched,
dataPagesFetchedBySel;
qual_arg_cost,
spc_random_page_cost,
outer_scans;
- QualCost index_qual_cost;
Relation indexRel;
GinStatsData ginStats;
- /*
- * Obtain statistic information from the meta page
- */
- indexRel = index_open(index->indexoid, AccessShareLock);
- ginGetStats(indexRel, &ginStats);
- index_close(indexRel, AccessShareLock);
-
- numEntryPages = ginStats.nEntryPages;
- numDataPages = ginStats.nDataPages;
- numPendingPages = ginStats.nPendingPages;
- numEntries = ginStats.nEntries;
+ /* Do preliminary analysis of indexquals */
+ qinfos = deconstruct_indexquals(path);
/*
- * nPendingPages can be trusted, but the other fields are as of the last
- * VACUUM. Scale them by the ratio numPages / nTotalPages to account for
- * growth since then. If the fields are zero (implying no VACUUM at all,
- * and an index created pre-9.1), assume all pages are entry pages.
+ * Obtain statistical information from the meta page, if possible. Else
+ * set ginStats to zeroes, and we'll cope below.
*/
- if (ginStats.nTotalPages == 0 || ginStats.nEntryPages == 0)
+ if (!index->hypothetical)
{
- numEntryPages = numPages;
- numDataPages = 0;
- numEntries = numTuples; /* bogus, but no other info available */
+ indexRel = index_open(index->indexoid, AccessShareLock);
+ ginGetStats(indexRel, &ginStats);
+ index_close(indexRel, AccessShareLock);
}
else
{
+ memset(&ginStats, 0, sizeof(ginStats));
+ }
+
+ /*
+ * Assuming we got valid (nonzero) stats at all, nPendingPages can be
+ * trusted, but the other fields are data as of the last VACUUM. We can
+ * scale them up to account for growth since then, but that method only
+ * goes so far; in the worst case, the stats might be for a completely
+ * empty index, and scaling them will produce pretty bogus numbers.
+ * Somewhat arbitrarily, set the cutoff for doing scaling at 4X growth; if
+ * it's grown more than that, fall back to estimating things only from the
+ * assumed-accurate index size. But we'll trust nPendingPages in any case
+ * so long as it's not clearly insane, ie, more than the index size.
+ */
+ if (ginStats.nPendingPages < numPages)
+ numPendingPages = ginStats.nPendingPages;
+ else
+ numPendingPages = 0;
+
+ if (numPages > 0 && ginStats.nTotalPages <= numPages &&
+ ginStats.nTotalPages > numPages / 4 &&
+ ginStats.nEntryPages > 0 && ginStats.nEntries > 0)
+ {
+ /*
+ * OK, the stats seem close enough to sane to be trusted. But we
+ * still need to scale them by the ratio numPages / nTotalPages to
+ * account for growth since the last VACUUM.
+ */
double scale = numPages / ginStats.nTotalPages;
- numEntryPages = ceil(numEntryPages * scale);
- numDataPages = ceil(numDataPages * scale);
- numEntries = ceil(numEntries * scale);
+ numEntryPages = ceil(ginStats.nEntryPages * scale);
+ numDataPages = ceil(ginStats.nDataPages * scale);
+ numEntries = ceil(ginStats.nEntries * scale);
/* ensure we didn't round up too much */
- numEntryPages = Min(numEntryPages, numPages);
- numDataPages = Min(numDataPages, numPages - numEntryPages);
+ numEntryPages = Min(numEntryPages, numPages - numPendingPages);
+ numDataPages = Min(numDataPages,
+ numPages - numPendingPages - numEntryPages);
+ }
+ else
+ {
+ /*
+ * We might get here because it's a hypothetical index, or an index
+ * created pre-9.1 and never vacuumed since upgrading (in which case
+ * its stats would read as zeroes), or just because it's grown too
+ * much since the last VACUUM for us to put our faith in scaling.
+ *
+ * Invent some plausible internal statistics based on the index page
+ * count (and clamp that to at least 10 pages, just in case). We
+ * estimate that 90% of the index is entry pages, and the rest is data
+ * pages. Estimate 100 entries per entry page; this is rather bogus
+ * since it'll depend on the size of the keys, but it's more robust
+ * than trying to predict the number of entries per heap tuple.
+ */
+ numPages = Max(numPages, 10);
+ numEntryPages = floor((numPages - numPendingPages) * 0.90);
+ numDataPages = numPages - numPendingPages - numEntryPages;
+ numEntries = floor(numEntryPages * 100);
}
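+	/*
+	 * As an example of the scaling above: if the index now has numPages =
+	 * 800 while the last VACUUM recorded nTotalPages = 400, nEntryPages =
+	 * 300, and nEntries = 30000, growth is only 2X, so we scale to roughly
+	 * 600 entry pages and 60000 entries rather than falling back to the
+	 * invented statistics.
+	 */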
/* In an empty index, numEntries could be zero. Avoid divide-by-zero */
JOIN_INNER,
NULL);
- /* fetch estimated page cost for schema containing index */
+ /* fetch estimated page cost for tablespace containing index */
get_tablespace_page_costs(index->reltablespace,
&spc_random_page_cost,
NULL);
counts.arrayScans = 1;
matchPossible = true;
- foreach(l, indexQuals)
+ foreach(l, qinfos)
{
- RestrictInfo *rinfo = (RestrictInfo *) lfirst(l);
- Expr *clause;
+ IndexQualInfo *qinfo = (IndexQualInfo *) lfirst(l);
+ Expr *clause = qinfo->rinfo->clause;
- Assert(IsA(rinfo, RestrictInfo));
- clause = rinfo->clause;
if (IsA(clause, OpExpr))
{
- matchPossible = gincost_opexpr(index,
- (OpExpr *) clause,
+ matchPossible = gincost_opexpr(root,
+ index,
+ qinfo,
&counts);
if (!matchPossible)
break;
}
else if (IsA(clause, ScalarArrayOpExpr))
{
- matchPossible = gincost_scalararrayopexpr(index,
- (ScalarArrayOpExpr *) clause,
+ matchPossible = gincost_scalararrayopexpr(root,
+ index,
+ qinfo,
numEntries,
&counts);
if (!matchPossible)
*indexStartupCost = 0;
*indexTotalCost = 0;
*indexSelectivity = 0;
- PG_RETURN_VOID();
+ return;
}
if (counts.haveFullScan || indexQuals == NIL)
}
/* Will we have more than one iteration of a nestloop scan? */
- if (outer_rel != NULL && outer_rel->rows > 1)
- outer_scans = outer_rel->rows;
- else
- outer_scans = 1;
+ outer_scans = loop_count;
/*
- * Compute cost to begin scan, first of all, pay attention to pending list.
+ * Compute cost to begin scan, first of all, pay attention to pending
+	 * Compute the cost to begin the scan; first of all, pay attention to the
+	 * pending list.
entryPagesFetched = numPendingPages;
/*
* Add an estimate of entry pages read by partial match algorithm. It's a
- * scan over leaf pages in entry tree. We haven't any useful stats here,
- * so estimate it as proportion.
+	 * scan over leaf pages in entry tree.  We don't have any useful stats
+	 * here, so estimate it as a proportion.  Because counts.partialEntries
+	 * is really pretty bogus (see code above), it's possible that it is more
+	 * than numEntries; clamp the proportion to ensure sanity.
*/
- entryPagesFetched += ceil(numEntryPages * counts.partialEntries / numEntries);
+ partialScale = counts.partialEntries / numEntries;
+ partialScale = Min(partialScale, 1.0);
+
+ entryPagesFetched += ceil(numEntryPages * partialScale);
/*
* Partial match algorithm reads all data pages before doing actual scan,
- * so it's a startup cost. Again, we haven't any useful stats here, so,
- * estimate it as proportion
+	 * so it's a startup cost.  Again, we don't have any useful stats here,
+	 * so estimate it as a proportion.
*/
- dataPagesFetched = ceil(numDataPages * counts.partialEntries / numEntries);
+ dataPagesFetched = ceil(numDataPages * partialScale);
/*
* Calculate cache effects if more than one scan due to nestloops or array
*indexStartupCost = (entryPagesFetched + dataPagesFetched) * spc_random_page_cost;
/*
- * Now we compute the number of data pages fetched while the scan proceeds.
+ * Now compute the number of data pages fetched during the scan.
+ *
+	 * We assume every entry has the same number of items, and that there is
+	 * no overlap between them.  (XXX: tsvector and array opclasses collect
+ * statistics on the frequency of individual keys; it would be nice to use
+ * those here.)
*/
-
- /* data pages scanned for each exact (non-partial) matched entry */
dataPagesFetched = ceil(numDataPages * counts.exactEntries / numEntries);
/*
- * Estimate number of data pages read, using selectivity estimation and
- * capacity of data page.
+ * If there is a lot of overlap among the entries, in particular if one of
+ * the entries is very frequent, the above calculation can grossly
+ * under-estimate. As a simple cross-check, calculate a lower bound based
+ * on the overall selectivity of the quals. At a minimum, we must read
+ * one item pointer for each matching entry.
+ *
+ * The width of each item pointer varies, based on the level of
+ * compression. We don't have statistics on that, but an average of
+ * around 3 bytes per item is fairly typical.
*/
dataPagesFetchedBySel = ceil(*indexSelectivity *
- (numTuples / (BLCKSZ / SizeOfIptrData)));
-
+ (numTuples / (BLCKSZ / 3)));
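+	/*
+	 * With the default BLCKSZ of 8192 this assumes about 2730 item pointers
+	 * per data page; e.g. a selectivity of 0.01 over 10 million heap tuples
+	 * yields a lower bound of ceil(0.01 * (10000000 / 2730)) = 37 pages.
+	 */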
if (dataPagesFetchedBySel > dataPagesFetched)
- {
- /*
- * At least one of entries is very frequent and, unfortunately, we
- * couldn't get statistic about entries (only tsvector has such
- * statistics). So, we obviously have too small estimation of pages
- * fetched from data tree. Re-estimate it from known capacity of data
- * pages
- */
dataPagesFetched = dataPagesFetchedBySel;
- }
/* Account for cache effects, the same as above */
if (outer_scans > 1 || counts.arrayScans > 1)
/*
* Add on index qual eval costs, much as in genericcostestimate
*/
- cost_qual_eval(&index_qual_cost, indexQuals, root);
- qual_arg_cost = index_qual_cost.startup + index_qual_cost.per_tuple;
- cost_qual_eval(&index_qual_cost, indexOrderBys, root);
- qual_arg_cost += index_qual_cost.startup + index_qual_cost.per_tuple;
+ qual_arg_cost = other_operands_eval_cost(root, qinfos) +
+ orderby_operands_eval_cost(root, path);
+ qual_op_cost = cpu_operator_cost *
+ (list_length(indexQuals) + list_length(indexOrderBys));
+
+ *indexStartupCost += qual_arg_cost;
+ *indexTotalCost += qual_arg_cost;
+ *indexTotalCost += (numTuples * *indexSelectivity) * (cpu_index_tuple_cost + qual_op_cost);
+ *indexPages = dataPagesFetched;
+}
+
+/*
+ * BRIN has search behavior completely different from other index types
+ */
+void
+brincostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
+ Cost *indexStartupCost, Cost *indexTotalCost,
+ Selectivity *indexSelectivity, double *indexCorrelation,
+ double *indexPages)
+{
+ IndexOptInfo *index = path->indexinfo;
+ List *indexQuals = path->indexquals;
+ List *indexOrderBys = path->indexorderbys;
+ double numPages = index->pages;
+ double numTuples = index->tuples;
+ List *qinfos;
+ Cost spc_seq_page_cost;
+ Cost spc_random_page_cost;
+ double qual_op_cost;
+ double qual_arg_cost;
+
+ /* Do preliminary analysis of indexquals */
+ qinfos = deconstruct_indexquals(path);
+
+ /* fetch estimated page cost for tablespace containing index */
+ get_tablespace_page_costs(index->reltablespace,
+ &spc_random_page_cost,
+ &spc_seq_page_cost);
+
+ /*
+ * BRIN indexes are always read in full; use that as startup cost.
+ *
+ * XXX maybe only include revmap pages here?
+ */
+ *indexStartupCost = spc_seq_page_cost * numPages * loop_count;
+
+ /*
+ * To read a BRIN index there might be a bit of back and forth over
+ * regular pages, as revmap might point to them out of sequential order;
+ * calculate this as reading the whole index in random order.
+ */
+ *indexTotalCost = spc_random_page_cost * numPages * loop_count;
+
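+	/*
+	 * For instance, a 100-page BRIN index with the default cost settings
+	 * (spc_seq_page_cost = 1.0, spc_random_page_cost = 4.0) and loop_count
+	 * = 1 gets a startup cost of 100 and a total I/O cost of 400, to which
+	 * the qual-eval charges below are added.
+	 */
+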
+ *indexSelectivity =
+ clauselist_selectivity(root, indexQuals,
+ path->indexinfo->rel->relid,
+ JOIN_INNER, NULL);
+ *indexCorrelation = 1;
+
+ /*
+ * Add on index qual eval costs, much as in genericcostestimate.
+ */
+ qual_arg_cost = other_operands_eval_cost(root, qinfos) +
+ orderby_operands_eval_cost(root, path);
qual_op_cost = cpu_operator_cost *
(list_length(indexQuals) + list_length(indexOrderBys));
- qual_arg_cost -= qual_op_cost;
- if (qual_arg_cost < 0) /* just in case... */
- qual_arg_cost = 0;
*indexStartupCost += qual_arg_cost;
*indexTotalCost += qual_arg_cost;
*indexTotalCost += (numTuples * *indexSelectivity) * (cpu_index_tuple_cost + qual_op_cost);
-	PG_RETURN_VOID();
+	*indexPages = index->pages;
+	/* XXX what about pages_per_range? */
}