Add support for EUI-64 MAC addresses as macaddr8

[postgresql] / src / backend / utils / adt / selfuncs.c
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c

index d396ef142f9563ce5f620152983a873c082439c8..bb9a5446861bde72e1caee147ef7e0c382b8de16 100644 (file)
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -10,7 +10,7 @@
   *       Index cost functions are located via the index AM's API struct,
   *       which is obtained from the handler function registered in pg_am.
   *
- * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
@@ -98,6 +98,7 @@
  #include "postgres.h"
  
  #include <ctype.h>
+#include <float.h>
  #include <math.h>
  
  #include "access/gin.h"
@@ -141,6 +142,7 @@
  #include "utils/timestamp.h"
  #include "utils/tqual.h"
  #include "utils/typcache.h"
+#include "utils/varlena.h"
  
  
  /* Hooks for plugins to get control when we ask for stats */
@@ -2511,10 +2513,24 @@ eqjoinsel_semi(Oid operator,
          * We can apply this clamping both with respect to the base relation from
          * which the join variable comes (if there is just one), and to the
          * immediate inner input relation of the current join.
+        *
+        * If we clamp, we can treat nd2 as being a non-default estimate; it's not
+        * great, maybe, but it didn't come out of nowhere either.  This is most
+        * helpful when the inner relation is empty and consequently has no stats.
          */
         if (vardata2->rel)
-               nd2 = Min(nd2, vardata2->rel->rows);
-       nd2 = Min(nd2, inner_rel->rows);
+       {
+               if (nd2 >= vardata2->rel->rows)
+               {
+                       nd2 = vardata2->rel->rows;
+                       isdefault2 = false;
+               }
+       }
+       if (nd2 >= inner_rel->rows)
+       {
+               nd2 = inner_rel->rows;
+               isdefault2 = false;
+       }
  
         if (HeapTupleIsValid(vardata1->statsTuple))
         {
@@ -3237,15 +3253,15 @@ add_unique_group_var(PlannerInfo *root, List *varinfos,
   *             restriction selectivity of the equality in the next step.
   *     4.  For Vars within a single source rel, we multiply together the numbers
   *             of values, clamp to the number of rows in the rel (divided by 10 if
- *             more than one Var), and then multiply by the selectivity of the
- *             restriction clauses for that rel.  When there's more than one Var,
- *             the initial product is probably too high (it's the worst case) but
- *             clamping to a fraction of the rel's rows seems to be a helpful
- *             heuristic for not letting the estimate get out of hand.  (The factor
- *             of 10 is derived from pre-Postgres-7.4 practice.)  Multiplying
- *             by the restriction selectivity is effectively assuming that the
- *             restriction clauses are independent of the grouping, which is a crummy
- *             assumption, but it's hard to do better.
+ *             more than one Var), and then multiply by a factor based on the
+ *             selectivity of the restriction clauses for that rel.  When there's
+ *             more than one Var, the initial product is probably too high (it's the
+ *             worst case) but clamping to a fraction of the rel's rows seems to be a
+ *             helpful heuristic for not letting the estimate get out of hand.  (The
+ *             factor of 10 is derived from pre-Postgres-7.4 practice.)  The factor
+ *             we multiply by to adjust for the restriction selectivity assumes that
+ *             the restriction clauses are independent of the grouping, which may not
+ *             be a valid assumption, but it's hard to do better.
   *     5.  If there are Vars from multiple rels, we repeat step 4 for each such
   *             rel, and multiply the results together.
   * Note that rels not containing grouped Vars are ignored completely, as are
@@ -3439,9 +3455,51 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
                                 reldistinct = clamp;
  
                         /*
-                        * Multiply by restriction selectivity.
+                        * Update the estimate based on the restriction selectivity,
+                        * guarding against division by zero when reldistinct is zero.
+                        * Also skip this if we know that we are returning all rows.
                          */
-                       reldistinct *= rel->rows / rel->tuples;
+                       if (reldistinct > 0 && rel->rows < rel->tuples)
+                       {
+                               /*
+                                * Given a table containing N rows with n distinct values in a
+                                * uniform distribution, if we select p rows at random then
+                                * the expected number of distinct values selected is
+                                *
+                                * n * (1 - product((N-N/n-i)/(N-i), i=0..p-1))
+                                *
+                                * = n * (1 - (N-N/n)! / (N-N/n-p)! * (N-p)! / N!)
+                                *
+                                * See "Approximating block accesses in database
+                                * organizations", S. B. Yao, Communications of the ACM,
+                                * Volume 20 Issue 4, April 1977 Pages 260-261.
+                                *
+                                * Alternatively, re-arranging the terms from the factorials,
+                                * this may be written as
+                                *
+                                * n * (1 - product((N-p-i)/(N-i), i=0..N/n-1))
+                                *
+                                * This form of the formula is more efficient to compute in
+                                * the common case where p is larger than N/n.  Additionally,
+                                * as pointed out by Dell'Era, if i << N for all terms in the
+                                * product, it can be approximated by
+                                *
+                                * n * (1 - ((N-p)/N)^(N/n))
+                                *
+                                * See "Expected distinct values when selecting from a bag
+                                * without replacement", Alberto Dell'Era,
+                                * http://www.adellera.it/investigations/distinct_balls/.
+                                *
+                                * The condition i << N is equivalent to n >> 1, so this is a
+                                * good approximation when the number of distinct values in
+                                * the table is large.  It turns out that this formula also
+                                * works well even when n is small.
+                                */
+                               reldistinct *=
+                                       (1 - pow((rel->tuples - rel->rows) / rel->tuples,
+                                                        rel->tuples / reldistinct));
+                       }
+                       reldistinct = clamp_row_est(reldistinct);
  
                         /*
                          * Update estimate of total distinct groups.
@@ -3540,8 +3598,11 @@ estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey, double nbuckets)
          * XXX Possibly better way, but much more expensive: multiply by
          * selectivity of rel's restriction clauses that mention the target Var.
          */
-       if (vardata.rel)
+       if (vardata.rel && vardata.rel->tuples > 0)
+       {
                 ndistinct *= vardata.rel->rows / vardata.rel->tuples;
+               ndistinct = clamp_row_est(ndistinct);
+       }
  
         /*
          * Initial estimate of bucketsize fraction is 1/nbuckets as long as the
@@ -3739,6 +3800,7 @@ convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
                 case INETOID:
                 case CIDROID:
                 case MACADDROID:
+               case MACADDR8OID:
                         *scaledvalue = convert_network_to_scalar(value, valuetypid);
                         *scaledlobound = convert_network_to_scalar(lobound, boundstypid);
                         *scaledhibound = convert_network_to_scalar(hibound, boundstypid);
@@ -4152,31 +4214,17 @@ convert_timevalue_to_scalar(Datum value, Oid typid)
                                  * average month length of 365.25/12.0 days.  Not too
                                  * accurate, but plenty good enough for our purposes.
                                  */
-#ifdef HAVE_INT64_TIMESTAMP
                                 return interval->time + interval->day * (double) USECS_PER_DAY +
                                         interval->month * ((DAYS_PER_YEAR / (double) MONTHS_PER_YEAR) * USECS_PER_DAY);
-#else
-                               return interval->time + interval->day * SECS_PER_DAY +
-                                       interval->month * ((DAYS_PER_YEAR / (double) MONTHS_PER_YEAR) * (double) SECS_PER_DAY);
-#endif
                         }
                 case RELTIMEOID:
-#ifdef HAVE_INT64_TIMESTAMP
                         return (DatumGetRelativeTime(value) * 1000000.0);
-#else
-                       return DatumGetRelativeTime(value);
-#endif
                 case TINTERVALOID:
                         {
                                 TimeInterval tinterval = DatumGetTimeInterval(value);
  
-#ifdef HAVE_INT64_TIMESTAMP
                                 if (tinterval->status != 0)
                                         return ((tinterval->data[1] - tinterval->data[0]) * 1000000.0);
-#else
-                               if (tinterval->status != 0)
-                                       return tinterval->data[1] - tinterval->data[0];
-#endif
                                 return 0;               /* for lack of a better idea */
                         }
                 case TIMEOID:
@@ -4186,11 +4234,7 @@ convert_timevalue_to_scalar(Datum value, Oid typid)
                                 TimeTzADT  *timetz = DatumGetTimeTzADTP(value);
  
                                 /* use GMT-equivalent time */
-#ifdef HAVE_INT64_TIMESTAMP
                                 return (double) (timetz->time + (timetz->zone * 1000000.0));
-#else
-                               return (double) (timetz->time + timetz->zone);
-#endif
                         }
         }
  
@@ -4269,7 +4313,7 @@ get_restriction_variable(PlannerInfo *root, List *args, int varRelid,
                 return true;
         }
  
-       /* Ooops, clause has wrong structure (probably var op var) */
+       /* Oops, clause has wrong structure (probably var op var) */
         ReleaseVariableStats(*vardata);
         ReleaseVariableStats(rdata);
  
@@ -4693,6 +4737,7 @@ double
  get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
  {
         double          stadistinct;
+       double          stanullfrac = 0.0;
         double          ntuples;
  
         *isdefault = false;
@@ -4700,7 +4745,8 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
         /*
          * Determine the stadistinct value to use.  There are cases where we can
          * get an estimate even without a pg_statistic entry, or can get a better
-        * value than is in pg_statistic.
+        * value than is in pg_statistic.  Grab stanullfrac too if we can find it
+        * (otherwise, assume no nulls, for lack of any better idea).
          */
         if (HeapTupleIsValid(vardata->statsTuple))
         {
@@ -4709,6 +4755,7 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
  
                 stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
                 stadistinct = stats->stadistinct;
+               stanullfrac = stats->stanullfrac;
         }
         else if (vardata->vartype == BOOLOID)
         {
@@ -4732,7 +4779,7 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
                         {
                                 case ObjectIdAttributeNumber:
                                 case SelfItemPointerAttributeNumber:
-                                       stadistinct = -1.0; /* unique */
+                                       stadistinct = -1.0; /* unique (and all non null) */
                                         break;
                                 case TableOidAttributeNumber:
                                         stadistinct = 1.0;      /* only 1 value */
@@ -4754,10 +4801,11 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
          * If there is a unique index or DISTINCT clause for the variable, assume
          * it is unique no matter what pg_statistic says; the statistics could be
          * out of date, or we might have found a partial unique index that proves
-        * the var is unique for this query.
+        * the var is unique for this query.  However, we'd better still believe
+        * the null-fraction statistic.
          */
         if (vardata->isunique)
-               stadistinct = -1.0;
+               stadistinct = -1.0 * (1.0 - stanullfrac);
  
         /*
          * If we had an absolute estimate, use that.
@@ -5295,13 +5343,12 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
         }
         else
         {
-               bytea      *bstr = DatumGetByteaP(patt_const->constvalue);
+               bytea      *bstr = DatumGetByteaPP(patt_const->constvalue);
  
-               pattlen = VARSIZE(bstr) - VARHDRSZ;
+               pattlen = VARSIZE_ANY_EXHDR(bstr);
                 patt = (char *) palloc(pattlen);
-               memcpy(patt, VARDATA(bstr), pattlen);
-               if ((Pointer) bstr != DatumGetPointer(patt_const->constvalue))
-                       pfree(bstr);
+               memcpy(patt, VARDATA_ANY(bstr), pattlen);
+               Assert((Pointer) bstr == DatumGetPointer(patt_const->constvalue));
         }
  
         match = palloc(pattlen + 1);
@@ -5811,13 +5858,12 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
         }
         else if (datatype == BYTEAOID)
         {
-               bytea      *bstr = DatumGetByteaP(str_const->constvalue);
+               bytea      *bstr = DatumGetByteaPP(str_const->constvalue);
  
-               len = VARSIZE(bstr) - VARHDRSZ;
+               len = VARSIZE_ANY_EXHDR(bstr);
                 workstr = (char *) palloc(len);
-               memcpy(workstr, VARDATA(bstr), len);
-               if ((Pointer) bstr != DatumGetPointer(str_const->constvalue))
-                       pfree(bstr);
+               memcpy(workstr, VARDATA_ANY(bstr), len);
+               Assert((Pointer) bstr == DatumGetPointer(str_const->constvalue));
                 cmpstr = str_const->constvalue;
         }
         else
@@ -6013,21 +6059,7 @@ string_to_bytea_const(const char *str, size_t str_len)
   *-------------------------------------------------------------------------
   */
  
-/*
- * deconstruct_indexquals is a simple function to examine the indexquals
- * attached to a proposed IndexPath.  It returns a list of IndexQualInfo
- * structs, one per qual expression.
- */
-typedef struct
-{
-       RestrictInfo *rinfo;            /* the indexqual itself */
-       int                     indexcol;               /* zero-based index column number */
-       bool            varonleft;              /* true if index column is on left of qual */
-       Oid                     clause_op;              /* qual's operator OID, if relevant */
-       Node       *other_operand;      /* non-index operand of qual's operator */
-} IndexQualInfo;
-
-static List *
+List *
  deconstruct_indexquals(IndexPath *path)
  {
         List       *result = NIL;
@@ -6037,14 +6069,13 @@ deconstruct_indexquals(IndexPath *path)
  
         forboth(lcc, path->indexquals, lci, path->indexqualcols)
         {
-               RestrictInfo *rinfo = (RestrictInfo *) lfirst(lcc);
+               RestrictInfo *rinfo = castNode(RestrictInfo, lfirst(lcc));
                 int                     indexcol = lfirst_int(lci);
                 Expr       *clause;
                 Node       *leftop,
                                    *rightop;
                 IndexQualInfo *qinfo;
  
-               Assert(IsA(rinfo, RestrictInfo));
                 clause = rinfo->clause;
  
                 qinfo = (IndexQualInfo *) palloc(sizeof(IndexQualInfo));
@@ -6177,35 +6208,7 @@ orderby_operands_eval_cost(PlannerInfo *root, IndexPath *path)
         return qual_arg_cost;
  }
  
-/*
- * genericcostestimate is a general-purpose estimator that can be used for
- * most index types.  In some cases we use genericcostestimate as the base
- * code and then incorporate additional index-type-specific knowledge in
- * the type-specific calling function.  To avoid code duplication, we make
- * genericcostestimate return a number of intermediate values as well as
- * its preliminary estimates of the output cost values.  The GenericCosts
- * struct includes all these values.
- *
- * Callers should initialize all fields of GenericCosts to zero.  In addition,
- * they can set numIndexTuples to some positive value if they have a better
- * than default way of estimating the number of leaf index tuples visited.
- */
-typedef struct
-{
-       /* These are the values the cost estimator must return to the planner */
-       Cost            indexStartupCost;               /* index-related startup cost */
-       Cost            indexTotalCost; /* total index-related scan cost */
-       Selectivity indexSelectivity;           /* selectivity of index */
-       double          indexCorrelation;               /* order correlation of index */
-
-       /* Intermediate values we obtain along the way */
-       double          numIndexPages;  /* number of leaf pages visited */
-       double          numIndexTuples; /* number of leaf tuples visited */
-       double          spc_random_page_cost;   /* relevant random_page_cost value */
-       double          num_sa_scans;   /* # indexscans from ScalarArrayOps */
-} GenericCosts;
-
-static void
+void
  genericcostestimate(PlannerInfo *root,
                                         IndexPath *path,
                                         double loop_count,
@@ -6449,7 +6452,8 @@ add_predicate_to_quals(IndexOptInfo *index, List *indexQuals)
  void
  btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
                            Cost *indexStartupCost, Cost *indexTotalCost,
-                          Selectivity *indexSelectivity, double *indexCorrelation)
+                          Selectivity *indexSelectivity, double *indexCorrelation,
+                          double *indexPages)
  {
         IndexOptInfo *index = path->indexinfo;
         List       *qinfos;
@@ -6739,12 +6743,14 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
         *indexTotalCost = costs.indexTotalCost;
         *indexSelectivity = costs.indexSelectivity;
         *indexCorrelation = costs.indexCorrelation;
+       *indexPages = costs.numIndexPages;
  }
  
  void
  hashcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
                                  Cost *indexStartupCost, Cost *indexTotalCost,
-                                Selectivity *indexSelectivity, double *indexCorrelation)
+                                Selectivity *indexSelectivity, double *indexCorrelation,
+                                double *indexPages)
  {
         List       *qinfos;
         GenericCosts costs;
@@ -6785,12 +6791,14 @@ hashcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
         *indexTotalCost = costs.indexTotalCost;
         *indexSelectivity = costs.indexSelectivity;
         *indexCorrelation = costs.indexCorrelation;
+       *indexPages = costs.numIndexPages;
  }
  
  void
  gistcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
                                  Cost *indexStartupCost, Cost *indexTotalCost,
-                                Selectivity *indexSelectivity, double *indexCorrelation)
+                                Selectivity *indexSelectivity, double *indexCorrelation,
+                                double *indexPages)
  {
         IndexOptInfo *index = path->indexinfo;
         List       *qinfos;
@@ -6844,12 +6852,14 @@ gistcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
         *indexTotalCost = costs.indexTotalCost;
         *indexSelectivity = costs.indexSelectivity;
         *indexCorrelation = costs.indexCorrelation;
+       *indexPages = costs.numIndexPages;
  }
  
  void
  spgcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
                                 Cost *indexStartupCost, Cost *indexTotalCost,
-                               Selectivity *indexSelectivity, double *indexCorrelation)
+                               Selectivity *indexSelectivity, double *indexCorrelation,
+                               double *indexPages)
  {
         IndexOptInfo *index = path->indexinfo;
         List       *qinfos;
@@ -6903,6 +6913,7 @@ spgcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
         *indexTotalCost = costs.indexTotalCost;
         *indexSelectivity = costs.indexSelectivity;
         *indexCorrelation = costs.indexCorrelation;
+       *indexPages = costs.numIndexPages;
  }
  
  
@@ -7200,7 +7211,8 @@ gincost_scalararrayopexpr(PlannerInfo *root,
  void
  gincostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
                                 Cost *indexStartupCost, Cost *indexTotalCost,
-                               Selectivity *indexSelectivity, double *indexCorrelation)
+                               Selectivity *indexSelectivity, double *indexCorrelation,
+                               double *indexPages)
  {
         IndexOptInfo *index = path->indexinfo;
         List       *indexQuals = path->indexquals;
@@ -7515,6 +7527,7 @@ gincostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
         *indexStartupCost += qual_arg_cost;
         *indexTotalCost += qual_arg_cost;
         *indexTotalCost += (numTuples * *indexSelectivity) * (cpu_index_tuple_cost + qual_op_cost);
+       *indexPages = dataPagesFetched;
  }
  
  /*
@@ -7523,7 +7536,8 @@ gincostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
  void
  brincostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
                                  Cost *indexStartupCost, Cost *indexTotalCost,
-                                Selectivity *indexSelectivity, double *indexCorrelation)
+                                Selectivity *indexSelectivity, double *indexCorrelation,
+                                double *indexPages)
  {
         IndexOptInfo *index = path->indexinfo;
         List       *indexQuals = path->indexquals;
@@ -7575,6 +7589,7 @@ brincostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
         *indexStartupCost += qual_arg_cost;
         *indexTotalCost += qual_arg_cost;
         *indexTotalCost += (numTuples * *indexSelectivity) * (cpu_index_tuple_cost + qual_op_cost);
+       *indexPages = index->pages;
  
         /* XXX what about pages_per_range? */
  }