granicus.if.org Git - postgresql/blob - src/backend/utils/adt/selfuncs.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * selfuncs.c
   4  *        Selectivity functions and index cost estimation functions for
   5  *        standard operators and index access methods.
   6  *
   7  *        Selectivity routines are registered in the pg_operator catalog
   8  *        in the "oprrest" and "oprjoin" attributes.
   9  *
  10  *        Index cost functions are registered in the pg_am catalog
  11  *        in the "amcostestimate" attribute.
  12  *
  13  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  14  * Portions Copyright (c) 1994, Regents of the University of California
  15  *
  16  *
  17  * IDENTIFICATION
  18  *        $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.226 2007/02/19 07:03:31 tgl Exp $
  19  *
  20  *-------------------------------------------------------------------------
  21  */
  22
  23 /*----------
  24  * Operator selectivity estimation functions are called to estimate the
  25  * selectivity of WHERE clauses whose top-level operator is their operator.
  26  * We divide the problem into two cases:
  27  *              Restriction clause estimation: the clause involves vars of just
  28  *                      one relation.
  29  *              Join clause estimation: the clause involves vars of multiple rels.
  30  * Join selectivity estimation is far more difficult and usually less accurate
  31  * than restriction estimation.
  32  *
  33  * When dealing with the inner scan of a nestloop join, we consider the
  34  * join's joinclauses as restriction clauses for the inner relation, and
  35  * treat vars of the outer relation as parameters (a/k/a constants of unknown
  36  * values).  So, restriction estimators need to be able to accept an argument
  37  * telling which relation is to be treated as the variable.
  38  *
  39  * The call convention for a restriction estimator (oprrest function) is
  40  *
  41  *              Selectivity oprrest (PlannerInfo *root,
  42  *                                                       Oid operator,
  43  *                                                       List *args,
  44  *                                                       int varRelid);
  45  *
  46  * root: general information about the query (rtable and RelOptInfo lists
  47  * are particularly important for the estimator).
  48  * operator: OID of the specific operator in question.
  49  * args: argument list from the operator clause.
  50  * varRelid: if not zero, the relid (rtable index) of the relation to
  51  * be treated as the variable relation.  May be zero if the args list
  52  * is known to contain vars of only one relation.
  53  *
  54  * This is represented at the SQL level (in pg_proc) as
  55  *
  56  *              float8 oprrest (internal, oid, internal, int4);
  57  *
  58  * The call convention for a join estimator (oprjoin function) is similar
  59  * except that varRelid is not needed, and instead the join type is
  60  * supplied:
  61  *
  62  *              Selectivity oprjoin (PlannerInfo *root,
  63  *                                                       Oid operator,
  64  *                                                       List *args,
  65  *                                                       JoinType jointype);
  66  *
  67  *              float8 oprjoin (internal, oid, internal, int2);
  68  *
  69  * (We deliberately make the SQL signature different to facilitate
  70  * catching errors.)
  71  *----------
  72  */
  73
  74 #include "postgres.h"
  75
  76 #include <ctype.h>
  77 #include <math.h>
  78
  79 #include "catalog/pg_opfamily.h"
  80 #include "catalog/pg_statistic.h"
  81 #include "catalog/pg_type.h"
  82 #include "mb/pg_wchar.h"
  83 #include "nodes/makefuncs.h"
  84 #include "optimizer/clauses.h"
  85 #include "optimizer/cost.h"
  86 #include "optimizer/pathnode.h"
  87 #include "optimizer/paths.h"
  88 #include "optimizer/plancat.h"
  89 #include "optimizer/restrictinfo.h"
  90 #include "optimizer/var.h"
  91 #include "parser/parse_coerce.h"
  92 #include "parser/parse_expr.h"
  93 #include "parser/parsetree.h"
  94 #include "utils/builtins.h"
  95 #include "utils/date.h"
  96 #include "utils/datum.h"
  97 #include "utils/fmgroids.h"
  98 #include "utils/lsyscache.h"
  99 #include "utils/nabstime.h"
 100 #include "utils/pg_locale.h"
 101 #include "utils/selfuncs.h"
 102 #include "utils/syscache.h"
 103
 104
 105 static double ineq_histogram_selectivity(VariableStatData *vardata,
 106                                                    FmgrInfo *opproc, bool isgt,
 107                                                    Datum constval, Oid consttype);
 108 static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
 109                                   Datum lobound, Datum hibound, Oid boundstypid,
 110                                   double *scaledlobound, double *scaledhibound);
 111 static double convert_numeric_to_scalar(Datum value, Oid typid);
 112 static void convert_string_to_scalar(char *value,
 113                                                  double *scaledvalue,
 114                                                  char *lobound,
 115                                                  double *scaledlobound,
 116                                                  char *hibound,
 117                                                  double *scaledhibound);
 118 static void convert_bytea_to_scalar(Datum value,
 119                                                 double *scaledvalue,
 120                                                 Datum lobound,
 121                                                 double *scaledlobound,
 122                                                 Datum hibound,
 123                                                 double *scaledhibound);
 124 static double convert_one_string_to_scalar(char *value,
 125                                                          int rangelo, int rangehi);
 126 static double convert_one_bytea_to_scalar(unsigned char *value, int valuelen,
 127                                                         int rangelo, int rangehi);
 128 static char *convert_string_datum(Datum value, Oid typid);
 129 static double convert_timevalue_to_scalar(Datum value, Oid typid);
 130 static bool get_variable_maximum(PlannerInfo *root, VariableStatData *vardata,
 131                                          Oid sortop, Datum *max);
 132 static Selectivity prefix_selectivity(VariableStatData *vardata,
 133                                    Oid vartype, Oid opfamily, Const *prefixcon);
 134 static Selectivity pattern_selectivity(Const *patt, Pattern_Type ptype);
 135 static Datum string_to_datum(const char *str, Oid datatype);
 136 static Const *string_to_const(const char *str, Oid datatype);
 137 static Const *string_to_bytea_const(const char *str, size_t str_len);
 138
 139
 140 /*
 141  *              eqsel                   - Selectivity of "=" for any data types.
 142  *
 143  * Note: this routine is also used to estimate selectivity for some
 144  * operators that are not "=" but have comparable selectivity behavior,
 145  * such as "~=" (geometric approximate-match).  Even for "=", we must
 146  * keep in mind that the left and right datatypes may differ.
 147  */
 148 Datum
 149 eqsel(PG_FUNCTION_ARGS)
 150 {
 151         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 152         Oid                     operator = PG_GETARG_OID(1);
 153         List       *args = (List *) PG_GETARG_POINTER(2);
 154         int                     varRelid = PG_GETARG_INT32(3);
 155         VariableStatData vardata;
 156         Node       *other;
 157         bool            varonleft;
 158         Datum      *values;
 159         int                     nvalues;
 160         float4     *numbers;
 161         int                     nnumbers;
 162         double          selec;
 163
 164         /*
 165          * If expression is not variable = something or something = variable, then
 166          * punt and return a default estimate.
 167          */
 168         if (!get_restriction_variable(root, args, varRelid,
 169                                                                   &vardata, &other, &varonleft))
 170                 PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
 171
 172         /*
 173          * If the something is a NULL constant, assume operator is strict and
 174          * return zero, ie, operator will never return TRUE.
 175          */
 176         if (IsA(other, Const) &&
 177                 ((Const *) other)->constisnull)
 178         {
 179                 ReleaseVariableStats(vardata);
 180                 PG_RETURN_FLOAT8(0.0);
 181         }
 182
 183         if (HeapTupleIsValid(vardata.statsTuple))
 184         {
 185                 Form_pg_statistic stats;
 186
 187                 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
 188
 189                 if (IsA(other, Const))
 190                 {
 191                         /* Variable is being compared to a known non-null constant */
 192                         Datum           constval = ((Const *) other)->constvalue;
 193                         bool            match = false;
 194                         int                     i;
 195
 196                         /*
 197                          * Is the constant "=" to any of the column's most common values?
 198                          * (Although the given operator may not really be "=", we will
 199                          * assume that seeing whether it returns TRUE is an appropriate
 200                          * test.  If you don't like this, maybe you shouldn't be using
 201                          * eqsel for your operator...)
 202                          */
 203                         if (get_attstatsslot(vardata.statsTuple,
 204                                                                  vardata.atttype, vardata.atttypmod,
 205                                                                  STATISTIC_KIND_MCV, InvalidOid,
 206                                                                  &values, &nvalues,
 207                                                                  &numbers, &nnumbers))
 208                         {
 209                                 FmgrInfo        eqproc;
 210
 211                                 fmgr_info(get_opcode(operator), &eqproc);
 212
 213                                 for (i = 0; i < nvalues; i++)
 214                                 {
 215                                         /* be careful to apply operator right way 'round */
 216                                         if (varonleft)
 217                                                 match = DatumGetBool(FunctionCall2(&eqproc,
 218                                                                                                                    values[i],
 219                                                                                                                    constval));
 220                                         else
 221                                                 match = DatumGetBool(FunctionCall2(&eqproc,
 222                                                                                                                    constval,
 223                                                                                                                    values[i]));
 224                                         if (match)
 225                                                 break;
 226                                 }
 227                         }
 228                         else
 229                         {
 230                                 /* no most-common-value info available */
 231                                 values = NULL;
 232                                 numbers = NULL;
 233                                 i = nvalues = nnumbers = 0;
 234                         }
 235
 236                         if (match)
 237                         {
 238                                 /*
 239                                  * Constant is "=" to this common value.  We know selectivity
 240                                  * exactly (or as exactly as ANALYZE could calculate it,
 241                                  * anyway).
 242                                  */
 243                                 selec = numbers[i];
 244                         }
 245                         else
 246                         {
 247                                 /*
 248                                  * Comparison is against a constant that is neither NULL nor
 249                                  * any of the common values.  Its selectivity cannot be more
 250                                  * than this:
 251                                  */
 252                                 double          sumcommon = 0.0;
 253                                 double          otherdistinct;
 254
 255                                 for (i = 0; i < nnumbers; i++)
 256                                         sumcommon += numbers[i];
 257                                 selec = 1.0 - sumcommon - stats->stanullfrac;
 258                                 CLAMP_PROBABILITY(selec);
 259
 260                                 /*
 261                                  * and in fact it's probably a good deal less. We approximate
 262                                  * that all the not-common values share this remaining
 263                                  * fraction equally, so we divide by the number of other
 264                                  * distinct values.
 265                                  */
 266                                 otherdistinct = get_variable_numdistinct(&vardata)
 267                                         - nnumbers;
 268                                 if (otherdistinct > 1)
 269                                         selec /= otherdistinct;
 270
 271                                 /*
 272                                  * Another cross-check: selectivity shouldn't be estimated as
 273                                  * more than the least common "most common value".
 274                                  */
 275                                 if (nnumbers > 0 && selec > numbers[nnumbers - 1])
 276                                         selec = numbers[nnumbers - 1];
 277                         }
 278
 279                         free_attstatsslot(vardata.atttype, values, nvalues,
 280                                                           numbers, nnumbers);
 281                 }
 282                 else
 283                 {
 284                         double          ndistinct;
 285
 286                         /*
 287                          * Search is for a value that we do not know a priori, but we will
 288                          * assume it is not NULL.  Estimate the selectivity as non-null
 289                          * fraction divided by number of distinct values, so that we get a
 290                          * result averaged over all possible values whether common or
 291                          * uncommon.  (Essentially, we are assuming that the not-yet-known
 292                          * comparison value is equally likely to be any of the possible
 293                          * values, regardless of their frequency in the table.  Is that a
 294                          * good idea?)
 295                          */
 296                         selec = 1.0 - stats->stanullfrac;
 297                         ndistinct = get_variable_numdistinct(&vardata);
 298                         if (ndistinct > 1)
 299                                 selec /= ndistinct;
 300
 301                         /*
 302                          * Cross-check: selectivity should never be estimated as more than
 303                          * the most common value's.
 304                          */
 305                         if (get_attstatsslot(vardata.statsTuple,
 306                                                                  vardata.atttype, vardata.atttypmod,
 307                                                                  STATISTIC_KIND_MCV, InvalidOid,
 308                                                                  NULL, NULL,
 309                                                                  &numbers, &nnumbers))
 310                         {
 311                                 if (nnumbers > 0 && selec > numbers[0])
 312                                         selec = numbers[0];
 313                                 free_attstatsslot(vardata.atttype, NULL, 0, numbers, nnumbers);
 314                         }
 315                 }
 316         }
 317         else
 318         {
 319                 /*
 320                  * No ANALYZE stats available, so make a guess using estimated number
 321                  * of distinct values and assuming they are equally common. (The guess
 322                  * is unlikely to be very good, but we do know a few special cases.)
 323                  */
 324                 selec = 1.0 / get_variable_numdistinct(&vardata);
 325         }
 326
 327         ReleaseVariableStats(vardata);
 328
 329         /* result should be in range, but make sure... */
 330         CLAMP_PROBABILITY(selec);
 331
 332         PG_RETURN_FLOAT8((float8) selec);
 333 }
 334
 335 /*
 336  *              neqsel                  - Selectivity of "!=" for any data types.
 337  *
 338  * This routine is also used for some operators that are not "!="
 339  * but have comparable selectivity behavior.  See above comments
 340  * for eqsel().
 341  */
 342 Datum
 343 neqsel(PG_FUNCTION_ARGS)
 344 {
 345         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 346         Oid                     operator = PG_GETARG_OID(1);
 347         List       *args = (List *) PG_GETARG_POINTER(2);
 348         int                     varRelid = PG_GETARG_INT32(3);
 349         Oid                     eqop;
 350         float8          result;
 351
 352         /*
 353          * We want 1 - eqsel() where the equality operator is the one associated
 354          * with this != operator, that is, its negator.
 355          */
 356         eqop = get_negator(operator);
 357         if (eqop)
 358         {
 359                 result = DatumGetFloat8(DirectFunctionCall4(eqsel,
 360                                                                                                         PointerGetDatum(root),
 361                                                                                                         ObjectIdGetDatum(eqop),
 362                                                                                                         PointerGetDatum(args),
 363                                                                                                         Int32GetDatum(varRelid)));
 364         }
 365         else
 366         {
 367                 /* Use default selectivity (should we raise an error instead?) */
 368                 result = DEFAULT_EQ_SEL;
 369         }
 370         result = 1.0 - result;
 371         PG_RETURN_FLOAT8(result);
 372 }
 373
 374 /*
 375  *      scalarineqsel           - Selectivity of "<", "<=", ">", ">=" for scalars.
 376  *
 377  * This is the guts of both scalarltsel and scalargtsel.  The caller has
 378  * commuted the clause, if necessary, so that we can treat the variable as
 379  * being on the left.  The caller must also make sure that the other side
 380  * of the clause is a non-null Const, and dissect same into a value and
 381  * datatype.
 382  *
 383  * This routine works for any datatype (or pair of datatypes) known to
 384  * convert_to_scalar().  If it is applied to some other datatype,
 385  * it will return a default estimate.
 386  */
 387 static double
 388 scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
 389                           VariableStatData *vardata, Datum constval, Oid consttype)
 390 {
 391         Form_pg_statistic stats;
 392         FmgrInfo        opproc;
 393         double          mcv_selec,
 394                                 hist_selec,
 395                                 sumcommon;
 396         double          selec;
 397
 398         if (!HeapTupleIsValid(vardata->statsTuple))
 399         {
 400                 /* no stats available, so default result */
 401                 return DEFAULT_INEQ_SEL;
 402         }
 403         stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
 404
 405         fmgr_info(get_opcode(operator), &opproc);
 406
 407         /*
 408          * If we have most-common-values info, add up the fractions of the MCV
 409          * entries that satisfy MCV OP CONST.  These fractions contribute directly
 410          * to the result selectivity.  Also add up the total fraction represented
 411          * by MCV entries.
 412          */
 413         mcv_selec = mcv_selectivity(vardata, &opproc, constval, true,
 414                                                                 &sumcommon);
 415
 416         /*
 417          * If there is a histogram, determine which bin the constant falls in, and
 418          * compute the resulting contribution to selectivity.
 419          */
 420         hist_selec = ineq_histogram_selectivity(vardata, &opproc, isgt,
 421                                                                                         constval, consttype);
 422
 423         /*
 424          * Now merge the results from the MCV and histogram calculations,
 425          * realizing that the histogram covers only the non-null values that are
 426          * not listed in MCV.
 427          */
 428         selec = 1.0 - stats->stanullfrac - sumcommon;
 429
 430         if (hist_selec > 0.0)
 431                 selec *= hist_selec;
 432         else
 433         {
 434                 /*
 435                  * If no histogram but there are values not accounted for by MCV,
 436                  * arbitrarily assume half of them will match.
 437                  */
 438                 selec *= 0.5;
 439         }
 440
 441         selec += mcv_selec;
 442
 443         /* result should be in range, but make sure... */
 444         CLAMP_PROBABILITY(selec);
 445
 446         return selec;
 447 }
 448
 449 /*
 450  *      mcv_selectivity                 - Examine the MCV list for selectivity estimates
 451  *
 452  * Determine the fraction of the variable's MCV population that satisfies
 453  * the predicate (VAR OP CONST), or (CONST OP VAR) if !varonleft.  Also
 454  * compute the fraction of the total column population represented by the MCV
 455  * list.  This code will work for any boolean-returning predicate operator.
 456  *
 457  * The function result is the MCV selectivity, and the fraction of the
 458  * total population is returned into *sumcommonp.  Zeroes are returned
 459  * if there is no MCV list.
 460  */
 461 double
 462 mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
 463                                 Datum constval, bool varonleft,
 464                                 double *sumcommonp)
 465 {
 466         double          mcv_selec,
 467                                 sumcommon;
 468         Datum      *values;
 469         int                     nvalues;
 470         float4     *numbers;
 471         int                     nnumbers;
 472         int                     i;
 473
 474         mcv_selec = 0.0;
 475         sumcommon = 0.0;
 476
 477         if (HeapTupleIsValid(vardata->statsTuple) &&
 478                 get_attstatsslot(vardata->statsTuple,
 479                                                  vardata->atttype, vardata->atttypmod,
 480                                                  STATISTIC_KIND_MCV, InvalidOid,
 481                                                  &values, &nvalues,
 482                                                  &numbers, &nnumbers))
 483         {
 484                 for (i = 0; i < nvalues; i++)
 485                 {
 486                         if (varonleft ?
 487                                 DatumGetBool(FunctionCall2(opproc,
 488                                                                                    values[i],
 489                                                                                    constval)) :
 490                                 DatumGetBool(FunctionCall2(opproc,
 491                                                                                    constval,
 492                                                                                    values[i])))
 493                                 mcv_selec += numbers[i];
 494                         sumcommon += numbers[i];
 495                 }
 496                 free_attstatsslot(vardata->atttype, values, nvalues,
 497                                                   numbers, nnumbers);
 498         }
 499
 500         *sumcommonp = sumcommon;
 501         return mcv_selec;
 502 }
 503
 504 /*
 505  *      histogram_selectivity   - Examine the histogram for selectivity estimates
 506  *
 507  * Determine the fraction of the variable's histogram entries that satisfy
 508  * the predicate (VAR OP CONST), or (CONST OP VAR) if !varonleft.
 509  *
 510  * This code will work for any boolean-returning predicate operator, whether
 511  * or not it has anything to do with the histogram sort operator.  We are
 512  * essentially using the histogram just as a representative sample.  However,
 513  * small histograms are unlikely to be all that representative, so the caller
 514  * should specify a minimum histogram size to use, and fall back on some
 515  * other approach if this routine fails.
 516  *
 517  * The caller also specifies n_skip, which causes us to ignore the first and
 518  * last n_skip histogram elements, on the grounds that they are outliers and
 519  * hence not very representative.  If in doubt, min_hist_size = 100 and
 520  * n_skip = 1 are reasonable values.
 521  *
 522  * The function result is the selectivity, or -1 if there is no histogram
 523  * or it's smaller than min_hist_size.
 524  *
 525  * Note that the result disregards both the most-common-values (if any) and
 526  * null entries.  The caller is expected to combine this result with
 527  * statistics for those portions of the column population.      It may also be
 528  * prudent to clamp the result range, ie, disbelieve exact 0 or 1 outputs.
 529  */
 530 double
 531 histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
 532                                           Datum constval, bool varonleft,
 533                                           int min_hist_size, int n_skip)
 534 {
 535         double          result;
 536         Datum      *values;
 537         int                     nvalues;
 538
 539         /* check sanity of parameters */
 540         Assert(n_skip >= 0);
 541         Assert(min_hist_size > 2 * n_skip);
 542
 543         if (HeapTupleIsValid(vardata->statsTuple) &&
 544                 get_attstatsslot(vardata->statsTuple,
 545                                                  vardata->atttype, vardata->atttypmod,
 546                                                  STATISTIC_KIND_HISTOGRAM, InvalidOid,
 547                                                  &values, &nvalues,
 548                                                  NULL, NULL))
 549         {
 550                 if (nvalues >= min_hist_size)
 551                 {
 552                         int                     nmatch = 0;
 553                         int                     i;
 554
 555                         for (i = n_skip; i < nvalues - n_skip; i++)
 556                         {
 557                                 if (varonleft ?
 558                                         DatumGetBool(FunctionCall2(opproc,
 559                                                                                            values[i],
 560                                                                                            constval)) :
 561                                         DatumGetBool(FunctionCall2(opproc,
 562                                                                                            constval,
 563                                                                                            values[i])))
 564                                         nmatch++;
 565                         }
 566                         result = ((double) nmatch) / ((double) (nvalues - 2 * n_skip));
 567                 }
 568                 else
 569                         result = -1;
 570                 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
 571         }
 572         else
 573                 result = -1;
 574
 575         return result;
 576 }
 577
 578 /*
 579  *      ineq_histogram_selectivity      - Examine the histogram for scalarineqsel
 580  *
 581  * Determine the fraction of the variable's histogram population that
 582  * satisfies the inequality condition, ie, VAR < CONST or VAR > CONST.
 583  *
 584  * Returns zero if there is no histogram (valid results will always be
 585  * greater than zero).
 586  *
 587  * Note that the result disregards both the most-common-values (if any) and
 588  * null entries.  The caller is expected to combine this result with
 589  * statistics for those portions of the column population.
 590  */
 591 static double
 592 ineq_histogram_selectivity(VariableStatData *vardata,
 593                                                    FmgrInfo *opproc, bool isgt,
 594                                                    Datum constval, Oid consttype)
 595 {
 596         double          hist_selec;
 597         Datum      *values;
 598         int                     nvalues;
 599
 600         hist_selec = 0.0;
 601
 602         /*
 603          * Someday, ANALYZE might store more than one histogram per rel/att,
 604          * corresponding to more than one possible sort ordering defined for the
 605          * column type.  However, to make that work we will need to figure out
 606          * which staop to search for --- it's not necessarily the one we have at
 607          * hand!  (For example, we might have a '<=' operator rather than the '<'
 608          * operator that will appear in staop.)  For now, assume that whatever
 609          * appears in pg_statistic is sorted the same way our operator sorts, or
 610          * the reverse way if isgt is TRUE.
 611          */
 612         if (HeapTupleIsValid(vardata->statsTuple) &&
 613                 get_attstatsslot(vardata->statsTuple,
 614                                                  vardata->atttype, vardata->atttypmod,
 615                                                  STATISTIC_KIND_HISTOGRAM, InvalidOid,
 616                                                  &values, &nvalues,
 617                                                  NULL, NULL))
 618         {
 619                 if (nvalues > 1)
 620                 {
 621                         /*
 622                          * Use binary search to find proper location, ie, the first slot
 623                          * at which the comparison fails.  (If the given operator isn't
 624                          * actually sort-compatible with the histogram, you'll get garbage
 625                          * results ... but probably not any more garbage-y than you would
 626                          * from the old linear search.)
 627                          */
 628                         double          histfrac;
 629                         int                     lobound = 0;    /* first possible slot to search */
 630                         int                     hibound = nvalues;              /* last+1 slot to search */
 631
 632                         while (lobound < hibound)
 633                         {
 634                                 int                     probe = (lobound + hibound) / 2;
 635                                 bool            ltcmp;
 636
 637                                 ltcmp = DatumGetBool(FunctionCall2(opproc,
 638                                                                                                    values[probe],
 639                                                                                                    constval));
 640                                 if (isgt)
 641                                         ltcmp = !ltcmp;
 642                                 if (ltcmp)
 643                                         lobound = probe + 1;
 644                                 else
 645                                         hibound = probe;
 646                         }
 647
 648                         if (lobound <= 0)
 649                         {
 650                                 /* Constant is below lower histogram boundary. */
 651                                 histfrac = 0.0;
 652                         }
 653                         else if (lobound >= nvalues)
 654                         {
 655                                 /* Constant is above upper histogram boundary. */
 656                                 histfrac = 1.0;
 657                         }
 658                         else
 659                         {
 660                                 int                     i = lobound;
 661                                 double          val,
 662                                                         high,
 663                                                         low;
 664                                 double          binfrac;
 665
 666                                 /*
 667                                  * We have values[i-1] < constant < values[i].
 668                                  *
 669                                  * Convert the constant and the two nearest bin boundary
 670                                  * values to a uniform comparison scale, and do a linear
 671                                  * interpolation within this bin.
 672                                  */
 673                                 if (convert_to_scalar(constval, consttype, &val,
 674                                                                           values[i - 1], values[i],
 675                                                                           vardata->vartype,
 676                                                                           &low, &high))
 677                                 {
 678                                         if (high <= low)
 679                                         {
 680                                                 /* cope if bin boundaries appear identical */
 681                                                 binfrac = 0.5;
 682                                         }
 683                                         else if (val <= low)
 684                                                 binfrac = 0.0;
 685                                         else if (val >= high)
 686                                                 binfrac = 1.0;
 687                                         else
 688                                         {
 689                                                 binfrac = (val - low) / (high - low);
 690
 691                                                 /*
 692                                                  * Watch out for the possibility that we got a NaN or
 693                                                  * Infinity from the division.  This can happen
 694                                                  * despite the previous checks, if for example "low"
 695                                                  * is -Infinity.
 696                                                  */
 697                                                 if (isnan(binfrac) ||
 698                                                         binfrac < 0.0 || binfrac > 1.0)
 699                                                         binfrac = 0.5;
 700                                         }
 701                                 }
 702                                 else
 703                                 {
 704                                         /*
 705                                          * Ideally we'd produce an error here, on the grounds that
 706                                          * the given operator shouldn't have scalarXXsel
 707                                          * registered as its selectivity func unless we can deal
 708                                          * with its operand types.      But currently, all manner of
 709                                          * stuff is invoking scalarXXsel, so give a default
 710                                          * estimate until that can be fixed.
 711                                          */
 712                                         binfrac = 0.5;
 713                                 }
 714
 715                                 /*
 716                                  * Now, compute the overall selectivity across the values
 717                                  * represented by the histogram.  We have i-1 full bins and
 718                                  * binfrac partial bin below the constant.
 719                                  */
 720                                 histfrac = (double) (i - 1) + binfrac;
 721                                 histfrac /= (double) (nvalues - 1);
 722                         }
 723
 724                         /*
 725                          * Now histfrac = fraction of histogram entries below the
 726                          * constant.
 727                          *
 728                          * Account for "<" vs ">"
 729                          */
 730                         hist_selec = isgt ? (1.0 - histfrac) : histfrac;
 731
 732                         /*
 733                          * The histogram boundaries are only approximate to begin with,
 734                          * and may well be out of date anyway.  Therefore, don't believe
 735                          * extremely small or large selectivity estimates.
 736                          */
 737                         if (hist_selec < 0.0001)
 738                                 hist_selec = 0.0001;
 739                         else if (hist_selec > 0.9999)
 740                                 hist_selec = 0.9999;
 741                 }
 742
 743                 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
 744         }
 745
 746         return hist_selec;
 747 }
 748
 749 /*
 750  *              scalarltsel             - Selectivity of "<" (also "<=") for scalars.
 751  */
 752 Datum
 753 scalarltsel(PG_FUNCTION_ARGS)
 754 {
 755         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 756         Oid                     operator = PG_GETARG_OID(1);
 757         List       *args = (List *) PG_GETARG_POINTER(2);
 758         int                     varRelid = PG_GETARG_INT32(3);
 759         VariableStatData vardata;
 760         Node       *other;
 761         bool            varonleft;
 762         Datum           constval;
 763         Oid                     consttype;
 764         bool            isgt;
 765         double          selec;
 766
 767         /*
 768          * If expression is not variable op something or something op variable,
 769          * then punt and return a default estimate.
 770          */
 771         if (!get_restriction_variable(root, args, varRelid,
 772                                                                   &vardata, &other, &varonleft))
 773                 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 774
 775         /*
 776          * Can't do anything useful if the something is not a constant, either.
 777          */
 778         if (!IsA(other, Const))
 779         {
 780                 ReleaseVariableStats(vardata);
 781                 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 782         }
 783
 784         /*
 785          * If the constant is NULL, assume operator is strict and return zero, ie,
 786          * operator will never return TRUE.
 787          */
 788         if (((Const *) other)->constisnull)
 789         {
 790                 ReleaseVariableStats(vardata);
 791                 PG_RETURN_FLOAT8(0.0);
 792         }
 793         constval = ((Const *) other)->constvalue;
 794         consttype = ((Const *) other)->consttype;
 795
 796         /*
 797          * Force the var to be on the left to simplify logic in scalarineqsel.
 798          */
 799         if (varonleft)
 800         {
 801                 /* we have var < other */
 802                 isgt = false;
 803         }
 804         else
 805         {
 806                 /* we have other < var, commute to make var > other */
 807                 operator = get_commutator(operator);
 808                 if (!operator)
 809                 {
 810                         /* Use default selectivity (should we raise an error instead?) */
 811                         ReleaseVariableStats(vardata);
 812                         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 813                 }
 814                 isgt = true;
 815         }
 816
 817         selec = scalarineqsel(root, operator, isgt, &vardata, constval, consttype);
 818
 819         ReleaseVariableStats(vardata);
 820
 821         PG_RETURN_FLOAT8((float8) selec);
 822 }
 823
 824 /*
 825  *              scalargtsel             - Selectivity of ">" (also ">=") for integers.
 826  */
 827 Datum
 828 scalargtsel(PG_FUNCTION_ARGS)
 829 {
 830         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 831         Oid                     operator = PG_GETARG_OID(1);
 832         List       *args = (List *) PG_GETARG_POINTER(2);
 833         int                     varRelid = PG_GETARG_INT32(3);
 834         VariableStatData vardata;
 835         Node       *other;
 836         bool            varonleft;
 837         Datum           constval;
 838         Oid                     consttype;
 839         bool            isgt;
 840         double          selec;
 841
 842         /*
 843          * If expression is not variable op something or something op variable,
 844          * then punt and return a default estimate.
 845          */
 846         if (!get_restriction_variable(root, args, varRelid,
 847                                                                   &vardata, &other, &varonleft))
 848                 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 849
 850         /*
 851          * Can't do anything useful if the something is not a constant, either.
 852          */
 853         if (!IsA(other, Const))
 854         {
 855                 ReleaseVariableStats(vardata);
 856                 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 857         }
 858
 859         /*
 860          * If the constant is NULL, assume operator is strict and return zero, ie,
 861          * operator will never return TRUE.
 862          */
 863         if (((Const *) other)->constisnull)
 864         {
 865                 ReleaseVariableStats(vardata);
 866                 PG_RETURN_FLOAT8(0.0);
 867         }
 868         constval = ((Const *) other)->constvalue;
 869         consttype = ((Const *) other)->consttype;
 870
 871         /*
 872          * Force the var to be on the left to simplify logic in scalarineqsel.
 873          */
 874         if (varonleft)
 875         {
 876                 /* we have var > other */
 877                 isgt = true;
 878         }
 879         else
 880         {
 881                 /* we have other > var, commute to make var < other */
 882                 operator = get_commutator(operator);
 883                 if (!operator)
 884                 {
 885                         /* Use default selectivity (should we raise an error instead?) */
 886                         ReleaseVariableStats(vardata);
 887                         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 888                 }
 889                 isgt = false;
 890         }
 891
 892         selec = scalarineqsel(root, operator, isgt, &vardata, constval, consttype);
 893
 894         ReleaseVariableStats(vardata);
 895
 896         PG_RETURN_FLOAT8((float8) selec);
 897 }
 898
 899 /*
 900  * patternsel                   - Generic code for pattern-match selectivity.
 901  */
 902 static double
 903 patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
 904 {
 905         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 906         Oid                     operator = PG_GETARG_OID(1);
 907         List       *args = (List *) PG_GETARG_POINTER(2);
 908         int                     varRelid = PG_GETARG_INT32(3);
 909         VariableStatData vardata;
 910         Node       *variable;
 911         Node       *other;
 912         bool            varonleft;
 913         Datum           constval;
 914         Oid                     consttype;
 915         Oid                     vartype;
 916         Oid                     opfamily;
 917         Pattern_Prefix_Status pstatus;
 918         Const      *patt = NULL;
 919         Const      *prefix = NULL;
 920         Const      *rest = NULL;
 921         double          result;
 922
 923         /*
 924          * If expression is not variable op constant, then punt and return a
 925          * default estimate.
 926          */
 927         if (!get_restriction_variable(root, args, varRelid,
 928                                                                   &vardata, &other, &varonleft))
 929                 return DEFAULT_MATCH_SEL;
 930         if (!varonleft || !IsA(other, Const))
 931         {
 932                 ReleaseVariableStats(vardata);
 933                 return DEFAULT_MATCH_SEL;
 934         }
 935         variable = (Node *) linitial(args);
 936
 937         /*
 938          * If the constant is NULL, assume operator is strict and return zero, ie,
 939          * operator will never return TRUE.
 940          */
 941         if (((Const *) other)->constisnull)
 942         {
 943                 ReleaseVariableStats(vardata);
 944                 return 0.0;
 945         }
 946         constval = ((Const *) other)->constvalue;
 947         consttype = ((Const *) other)->consttype;
 948
 949         /*
 950          * The right-hand const is type text or bytea for all supported operators.
 951          * We do not expect to see binary-compatible types here, since
 952          * const-folding should have relabeled the const to exactly match the
 953          * operator's declared type.
 954          */
 955         if (consttype != TEXTOID && consttype != BYTEAOID)
 956         {
 957                 ReleaseVariableStats(vardata);
 958                 return DEFAULT_MATCH_SEL;
 959         }
 960
 961         /*
 962          * Similarly, the exposed type of the left-hand side should be one of
 963          * those we know.  (Do not look at vardata.atttype, which might be
 964          * something binary-compatible but different.)  We can use it to choose
 965          * the index opfamily from which we must draw the comparison operators.
 966          *
 967          * NOTE: It would be more correct to use the PATTERN opfamilies than the
 968          * simple ones, but at the moment ANALYZE will not generate statistics for
 969          * the PATTERN operators.  But our results are so approximate anyway that
 970          * it probably hardly matters.
 971          */
 972         vartype = vardata.vartype;
 973
 974         switch (vartype)
 975         {
 976                 case TEXTOID:
 977                         opfamily = TEXT_BTREE_FAM_OID;
 978                         break;
 979                 case BPCHAROID:
 980                         opfamily = BPCHAR_BTREE_FAM_OID;
 981                         break;
 982                 case NAMEOID:
 983                         opfamily = NAME_BTREE_FAM_OID;
 984                         break;
 985                 case BYTEAOID:
 986                         opfamily = BYTEA_BTREE_FAM_OID;
 987                         break;
 988                 default:
 989                         ReleaseVariableStats(vardata);
 990                         return DEFAULT_MATCH_SEL;
 991         }
 992
 993         /* divide pattern into fixed prefix and remainder */
 994         patt = (Const *) other;
 995         pstatus = pattern_fixed_prefix(patt, ptype, &prefix, &rest);
 996
 997         /*
 998          * If necessary, coerce the prefix constant to the right type. (The "rest"
 999          * constant need not be changed.)
1000          */
1001         if (prefix && prefix->consttype != vartype)
1002         {
1003                 char       *prefixstr;
1004
1005                 switch (prefix->consttype)
1006                 {
1007                         case TEXTOID:
1008                                 prefixstr = DatumGetCString(DirectFunctionCall1(textout,
1009                                                                                                                 prefix->constvalue));
1010                                 break;
1011                         case BYTEAOID:
1012                                 prefixstr = DatumGetCString(DirectFunctionCall1(byteaout,
1013                                                                                                                 prefix->constvalue));
1014                                 break;
1015                         default:
1016                                 elog(ERROR, "unrecognized consttype: %u",
1017                                          prefix->consttype);
1018                                 ReleaseVariableStats(vardata);
1019                                 return DEFAULT_MATCH_SEL;
1020                 }
1021                 prefix = string_to_const(prefixstr, vartype);
1022                 pfree(prefixstr);
1023         }
1024
1025         if (pstatus == Pattern_Prefix_Exact)
1026         {
1027                 /*
1028                  * Pattern specifies an exact match, so pretend operator is '='
1029                  */
1030                 Oid                     eqopr = get_opfamily_member(opfamily, vartype, vartype,
1031                                                                                                 BTEqualStrategyNumber);
1032                 List       *eqargs;
1033
1034                 if (eqopr == InvalidOid)
1035                         elog(ERROR, "no = operator for opfamily %u", opfamily);
1036                 eqargs = list_make2(variable, prefix);
1037                 result = DatumGetFloat8(DirectFunctionCall4(eqsel,
1038                                                                                                         PointerGetDatum(root),
1039                                                                                                         ObjectIdGetDatum(eqopr),
1040                                                                                                         PointerGetDatum(eqargs),
1041                                                                                                         Int32GetDatum(varRelid)));
1042         }
1043         else
1044         {
1045                 /*
1046                  * Not exact-match pattern.  If we have a sufficiently large
1047                  * histogram, estimate selectivity for the histogram part of the
1048                  * population by counting matches in the histogram.  If not, estimate
1049                  * selectivity of the fixed prefix and remainder of pattern
1050                  * separately, then combine the two to get an estimate of the
1051                  * selectivity for the part of the column population represented by
1052                  * the histogram.  We then add up data for any most-common-values
1053                  * values; these are not in the histogram population, and we can get
1054                  * exact answers for them by applying the pattern operator, so there's
1055                  * no reason to approximate.  (If the MCVs cover a significant part of
1056                  * the total population, this gives us a big leg up in accuracy.)
1057                  */
1058                 Selectivity selec;
1059                 FmgrInfo        opproc;
1060                 double          nullfrac,
1061                                         mcv_selec,
1062                                         sumcommon;
1063
1064                 /* Try to use the histogram entries to get selectivity */
1065                 fmgr_info(get_opcode(operator), &opproc);
1066
1067                 selec = histogram_selectivity(&vardata, &opproc, constval, true,
1068                                                                           100, 1);
1069                 if (selec < 0)
1070                 {
1071                         /* Nope, so fake it with the heuristic method */
1072                         Selectivity prefixsel;
1073                         Selectivity restsel;
1074
1075                         if (pstatus == Pattern_Prefix_Partial)
1076                                 prefixsel = prefix_selectivity(&vardata, vartype,
1077                                                                                            opfamily, prefix);
1078                         else
1079                                 prefixsel = 1.0;
1080                         restsel = pattern_selectivity(rest, ptype);
1081                         selec = prefixsel * restsel;
1082                 }
1083                 else
1084                 {
1085                         /* Yes, but don't believe extremely small or large estimates. */
1086                         if (selec < 0.0001)
1087                                 selec = 0.0001;
1088                         else if (selec > 0.9999)
1089                                 selec = 0.9999;
1090                 }
1091
1092                 /*
1093                  * If we have most-common-values info, add up the fractions of the MCV
1094                  * entries that satisfy MCV OP PATTERN.  These fractions contribute
1095                  * directly to the result selectivity.  Also add up the total fraction
1096                  * represented by MCV entries.
1097                  */
1098                 mcv_selec = mcv_selectivity(&vardata, &opproc, constval, true,
1099                                                                         &sumcommon);
1100
1101                 if (HeapTupleIsValid(vardata.statsTuple))
1102                         nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac;
1103                 else
1104                         nullfrac = 0.0;
1105
1106                 /*
1107                  * Now merge the results from the MCV and histogram calculations,
1108                  * realizing that the histogram covers only the non-null values that
1109                  * are not listed in MCV.
1110                  */
1111                 selec *= 1.0 - nullfrac - sumcommon;
1112                 selec += mcv_selec;
1113
1114                 /* result should be in range, but make sure... */
1115                 CLAMP_PROBABILITY(selec);
1116                 result = selec;
1117         }
1118
1119         if (prefix)
1120         {
1121                 pfree(DatumGetPointer(prefix->constvalue));
1122                 pfree(prefix);
1123         }
1124
1125         ReleaseVariableStats(vardata);
1126
1127         return result;
1128 }
1129
1130 /*
1131  *              regexeqsel              - Selectivity of regular-expression pattern match.
1132  */
1133 Datum
1134 regexeqsel(PG_FUNCTION_ARGS)
1135 {
1136         PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex));
1137 }
1138
1139 /*
1140  *              icregexeqsel    - Selectivity of case-insensitive regex match.
1141  */
1142 Datum
1143 icregexeqsel(PG_FUNCTION_ARGS)
1144 {
1145         PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC));
1146 }
1147
1148 /*
1149  *              likesel                 - Selectivity of LIKE pattern match.
1150  */
1151 Datum
1152 likesel(PG_FUNCTION_ARGS)
1153 {
1154         PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like));
1155 }
1156
1157 /*
1158  *              iclikesel                       - Selectivity of ILIKE pattern match.
1159  */
1160 Datum
1161 iclikesel(PG_FUNCTION_ARGS)
1162 {
1163         PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC));
1164 }
1165
1166 /*
1167  *              regexnesel              - Selectivity of regular-expression pattern non-match.
1168  */
1169 Datum
1170 regexnesel(PG_FUNCTION_ARGS)
1171 {
1172         double          result;
1173
1174         result = patternsel(fcinfo, Pattern_Type_Regex);
1175         result = 1.0 - result;
1176         PG_RETURN_FLOAT8(result);
1177 }
1178
1179 /*
1180  *              icregexnesel    - Selectivity of case-insensitive regex non-match.
1181  */
1182 Datum
1183 icregexnesel(PG_FUNCTION_ARGS)
1184 {
1185         double          result;
1186
1187         result = patternsel(fcinfo, Pattern_Type_Regex_IC);
1188         result = 1.0 - result;
1189         PG_RETURN_FLOAT8(result);
1190 }
1191
1192 /*
1193  *              nlikesel                - Selectivity of LIKE pattern non-match.
1194  */
1195 Datum
1196 nlikesel(PG_FUNCTION_ARGS)
1197 {
1198         double          result;
1199
1200         result = patternsel(fcinfo, Pattern_Type_Like);
1201         result = 1.0 - result;
1202         PG_RETURN_FLOAT8(result);
1203 }
1204
1205 /*
1206  *              icnlikesel              - Selectivity of ILIKE pattern non-match.
1207  */
1208 Datum
1209 icnlikesel(PG_FUNCTION_ARGS)
1210 {
1211         double          result;
1212
1213         result = patternsel(fcinfo, Pattern_Type_Like_IC);
1214         result = 1.0 - result;
1215         PG_RETURN_FLOAT8(result);
1216 }
1217
1218 /*
1219  *              booltestsel             - Selectivity of BooleanTest Node.
1220  */
1221 Selectivity
1222 booltestsel(PlannerInfo *root, BoolTestType booltesttype, Node *arg,
1223                         int varRelid, JoinType jointype)
1224 {
1225         VariableStatData vardata;
1226         double          selec;
1227
1228         examine_variable(root, arg, varRelid, &vardata);
1229
1230         if (HeapTupleIsValid(vardata.statsTuple))
1231         {
1232                 Form_pg_statistic stats;
1233                 double          freq_null;
1234                 Datum      *values;
1235                 int                     nvalues;
1236                 float4     *numbers;
1237                 int                     nnumbers;
1238
1239                 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
1240                 freq_null = stats->stanullfrac;
1241
1242                 if (get_attstatsslot(vardata.statsTuple,
1243                                                          vardata.atttype, vardata.atttypmod,
1244                                                          STATISTIC_KIND_MCV, InvalidOid,
1245                                                          &values, &nvalues,
1246                                                          &numbers, &nnumbers)
1247                         && nnumbers > 0)
1248                 {
1249                         double          freq_true;
1250                         double          freq_false;
1251
1252                         /*
1253                          * Get first MCV frequency and derive frequency for true.
1254                          */
1255                         if (DatumGetBool(values[0]))
1256                                 freq_true = numbers[0];
1257                         else
1258                                 freq_true = 1.0 - numbers[0] - freq_null;
1259
1260                         /*
1261                          * Next derive frequency for false. Then use these as appropriate
1262                          * to derive frequency for each case.
1263                          */
1264                         freq_false = 1.0 - freq_true - freq_null;
1265
1266                         switch (booltesttype)
1267                         {
1268                                 case IS_UNKNOWN:
1269                                         /* select only NULL values */
1270                                         selec = freq_null;
1271                                         break;
1272                                 case IS_NOT_UNKNOWN:
1273                                         /* select non-NULL values */
1274                                         selec = 1.0 - freq_null;
1275                                         break;
1276                                 case IS_TRUE:
1277                                         /* select only TRUE values */
1278                                         selec = freq_true;
1279                                         break;
1280                                 case IS_NOT_TRUE:
1281                                         /* select non-TRUE values */
1282                                         selec = 1.0 - freq_true;
1283                                         break;
1284                                 case IS_FALSE:
1285                                         /* select only FALSE values */
1286                                         selec = freq_false;
1287                                         break;
1288                                 case IS_NOT_FALSE:
1289                                         /* select non-FALSE values */
1290                                         selec = 1.0 - freq_false;
1291                                         break;
1292                                 default:
1293                                         elog(ERROR, "unrecognized booltesttype: %d",
1294                                                  (int) booltesttype);
1295                                         selec = 0.0;    /* Keep compiler quiet */
1296                                         break;
1297                         }
1298
1299                         free_attstatsslot(vardata.atttype, values, nvalues,
1300                                                           numbers, nnumbers);
1301                 }
1302                 else
1303                 {
1304                         /*
1305                          * No most-common-value info available. Still have null fraction
1306                          * information, so use it for IS [NOT] UNKNOWN. Otherwise adjust
1307                          * for null fraction and assume an even split for boolean tests.
1308                          */
1309                         switch (booltesttype)
1310                         {
1311                                 case IS_UNKNOWN:
1312
1313                                         /*
1314                                          * Use freq_null directly.
1315                                          */
1316                                         selec = freq_null;
1317                                         break;
1318                                 case IS_NOT_UNKNOWN:
1319
1320                                         /*
1321                                          * Select not unknown (not null) values. Calculate from
1322                                          * freq_null.
1323                                          */
1324                                         selec = 1.0 - freq_null;
1325                                         break;
1326                                 case IS_TRUE:
1327                                 case IS_NOT_TRUE:
1328                                 case IS_FALSE:
1329                                 case IS_NOT_FALSE:
1330                                         selec = (1.0 - freq_null) / 2.0;
1331                                         break;
1332                                 default:
1333                                         elog(ERROR, "unrecognized booltesttype: %d",
1334                                                  (int) booltesttype);
1335                                         selec = 0.0;    /* Keep compiler quiet */
1336                                         break;
1337                         }
1338                 }
1339         }
1340         else
1341         {
1342                 /*
1343                  * If we can't get variable statistics for the argument, perhaps
1344                  * clause_selectivity can do something with it.  We ignore the
1345                  * possibility of a NULL value when using clause_selectivity, and just
1346                  * assume the value is either TRUE or FALSE.
1347                  */
1348                 switch (booltesttype)
1349                 {
1350                         case IS_UNKNOWN:
1351                                 selec = DEFAULT_UNK_SEL;
1352                                 break;
1353                         case IS_NOT_UNKNOWN:
1354                                 selec = DEFAULT_NOT_UNK_SEL;
1355                                 break;
1356                         case IS_TRUE:
1357                         case IS_NOT_FALSE:
1358                                 selec = (double) clause_selectivity(root, arg,
1359                                                                                                         varRelid, jointype);
1360                                 break;
1361                         case IS_FALSE:
1362                         case IS_NOT_TRUE:
1363                                 selec = 1.0 - (double) clause_selectivity(root, arg,
1364                                                                                                                   varRelid, jointype);
1365                                 break;
1366                         default:
1367                                 elog(ERROR, "unrecognized booltesttype: %d",
1368                                          (int) booltesttype);
1369                                 selec = 0.0;    /* Keep compiler quiet */
1370                                 break;
1371                 }
1372         }
1373
1374         ReleaseVariableStats(vardata);
1375
1376         /* result should be in range, but make sure... */
1377         CLAMP_PROBABILITY(selec);
1378
1379         return (Selectivity) selec;
1380 }
1381
1382 /*
1383  *              nulltestsel             - Selectivity of NullTest Node.
1384  */
1385 Selectivity
1386 nulltestsel(PlannerInfo *root, NullTestType nulltesttype,
1387                         Node *arg, int varRelid)
1388 {
1389         VariableStatData vardata;
1390         double          selec;
1391
1392         examine_variable(root, arg, varRelid, &vardata);
1393
1394         if (HeapTupleIsValid(vardata.statsTuple))
1395         {
1396                 Form_pg_statistic stats;
1397                 double          freq_null;
1398
1399                 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
1400                 freq_null = stats->stanullfrac;
1401
1402                 switch (nulltesttype)
1403                 {
1404                         case IS_NULL:
1405
1406                                 /*
1407                                  * Use freq_null directly.
1408                                  */
1409                                 selec = freq_null;
1410                                 break;
1411                         case IS_NOT_NULL:
1412
1413                                 /*
1414                                  * Select not unknown (not null) values. Calculate from
1415                                  * freq_null.
1416                                  */
1417                                 selec = 1.0 - freq_null;
1418                                 break;
1419                         default:
1420                                 elog(ERROR, "unrecognized nulltesttype: %d",
1421                                          (int) nulltesttype);
1422                                 return (Selectivity) 0; /* keep compiler quiet */
1423                 }
1424         }
1425         else
1426         {
1427                 /*
1428                  * No ANALYZE stats available, so make a guess
1429                  */
1430                 switch (nulltesttype)
1431                 {
1432                         case IS_NULL:
1433                                 selec = DEFAULT_UNK_SEL;
1434                                 break;
1435                         case IS_NOT_NULL:
1436                                 selec = DEFAULT_NOT_UNK_SEL;
1437                                 break;
1438                         default:
1439                                 elog(ERROR, "unrecognized nulltesttype: %d",
1440                                          (int) nulltesttype);
1441                                 return (Selectivity) 0; /* keep compiler quiet */
1442                 }
1443         }
1444
1445         ReleaseVariableStats(vardata);
1446
1447         /* result should be in range, but make sure... */
1448         CLAMP_PROBABILITY(selec);
1449
1450         return (Selectivity) selec;
1451 }
1452
1453 /*
1454  * strip_array_coercion - strip binary-compatible relabeling from an array expr
1455  *
1456  * For array values, the parser doesn't generate simple RelabelType nodes,
1457  * but function calls of array_type_coerce() or array_type_length_coerce().
1458  * If we want to cope with binary-compatible situations we have to look
1459  * through these calls whenever the element-type coercion is binary-compatible.
1460  */
1461 static Node *
1462 strip_array_coercion(Node *node)
1463 {
1464         /* could be more than one level, so loop */
1465         for (;;)
1466         {
1467                 if (node && IsA(node, RelabelType))
1468                 {
1469                         /* We don't really expect this case, but may as well cope */
1470                         node = (Node *) ((RelabelType *) node)->arg;
1471                 }
1472                 else if (node && IsA(node, FuncExpr))
1473                 {
1474                         FuncExpr   *fexpr = (FuncExpr *) node;
1475                         Node       *arg1;
1476                         Oid                     src_elem_type;
1477                         Oid                     tgt_elem_type;
1478                         Oid                     funcId;
1479
1480                         /* must be the right function(s) */
1481                         if (!(fexpr->funcid == F_ARRAY_TYPE_COERCE ||
1482                                   fexpr->funcid == F_ARRAY_TYPE_LENGTH_COERCE))
1483                                 break;
1484
1485                         /* fetch source and destination array element types */
1486                         arg1 = (Node *) linitial(fexpr->args);
1487                         src_elem_type = get_element_type(exprType(arg1));
1488                         if (src_elem_type == InvalidOid)
1489                                 break;                  /* probably shouldn't happen */
1490                         tgt_elem_type = get_element_type(fexpr->funcresulttype);
1491                         if (tgt_elem_type == InvalidOid)
1492                                 break;                  /* probably shouldn't happen */
1493
1494                         /* find out how to coerce */
1495                         if (!find_coercion_pathway(tgt_elem_type, src_elem_type,
1496                                                                            COERCION_EXPLICIT, &funcId))
1497                                 break;                  /* definitely shouldn't happen */
1498
1499                         if (OidIsValid(funcId))
1500                                 break;                  /* non-binary-compatible coercion */
1501
1502                         node = arg1;            /* OK to look through the node */
1503                 }
1504                 else
1505                         break;
1506         }
1507         return node;
1508 }
1509
1510 /*
1511  *              scalararraysel          - Selectivity of ScalarArrayOpExpr Node.
1512  */
1513 Selectivity
1514 scalararraysel(PlannerInfo *root,
1515                            ScalarArrayOpExpr *clause,
1516                            bool is_join_clause,
1517                            int varRelid, JoinType jointype)
1518 {
1519         Oid                     operator = clause->opno;
1520         bool            useOr = clause->useOr;
1521         Node       *leftop;
1522         Node       *rightop;
1523         Oid                     nominal_element_type;
1524         RegProcedure oprsel;
1525         FmgrInfo        oprselproc;
1526         Datum           selarg4;
1527         Selectivity s1;
1528
1529         /*
1530          * First, look up the underlying operator's selectivity estimator. Punt if
1531          * it hasn't got one.
1532          */
1533         if (is_join_clause)
1534         {
1535                 oprsel = get_oprjoin(operator);
1536                 selarg4 = Int16GetDatum(jointype);
1537         }
1538         else
1539         {
1540                 oprsel = get_oprrest(operator);
1541                 selarg4 = Int32GetDatum(varRelid);
1542         }
1543         if (!oprsel)
1544                 return (Selectivity) 0.5;
1545         fmgr_info(oprsel, &oprselproc);
1546
1547         /* deconstruct the expression */
1548         Assert(list_length(clause->args) == 2);
1549         leftop = (Node *) linitial(clause->args);
1550         rightop = (Node *) lsecond(clause->args);
1551
1552         /* get nominal (after relabeling) element type of rightop */
1553         nominal_element_type = get_element_type(exprType(rightop));
1554         if (!OidIsValid(nominal_element_type))
1555                 return (Selectivity) 0.5;                       /* probably shouldn't happen */
1556
1557         /* look through any binary-compatible relabeling of rightop */
1558         rightop = strip_array_coercion(rightop);
1559
1560         /*
1561          * We consider three cases:
1562          *
1563          * 1. rightop is an Array constant: deconstruct the array, apply the
1564          * operator's selectivity function for each array element, and merge the
1565          * results in the same way that clausesel.c does for AND/OR combinations.
1566          *
1567          * 2. rightop is an ARRAY[] construct: apply the operator's selectivity
1568          * function for each element of the ARRAY[] construct, and merge.
1569          *
1570          * 3. otherwise, make a guess ...
1571          */
1572         if (rightop && IsA(rightop, Const))
1573         {
1574                 Datum           arraydatum = ((Const *) rightop)->constvalue;
1575                 bool            arrayisnull = ((Const *) rightop)->constisnull;
1576                 ArrayType  *arrayval;
1577                 int16           elmlen;
1578                 bool            elmbyval;
1579                 char            elmalign;
1580                 int                     num_elems;
1581                 Datum      *elem_values;
1582                 bool       *elem_nulls;
1583                 int                     i;
1584
1585                 if (arrayisnull)                /* qual can't succeed if null array */
1586                         return (Selectivity) 0.0;
1587                 arrayval = DatumGetArrayTypeP(arraydatum);
1588                 get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
1589                                                          &elmlen, &elmbyval, &elmalign);
1590                 deconstruct_array(arrayval,
1591                                                   ARR_ELEMTYPE(arrayval),
1592                                                   elmlen, elmbyval, elmalign,
1593                                                   &elem_values, &elem_nulls, &num_elems);
1594                 s1 = useOr ? 0.0 : 1.0;
1595                 for (i = 0; i < num_elems; i++)
1596                 {
1597                         List       *args;
1598                         Selectivity s2;
1599
1600                         args = list_make2(leftop,
1601                                                           makeConst(nominal_element_type,
1602                                                                                 elmlen,
1603                                                                                 elem_values[i],
1604                                                                                 elem_nulls[i],
1605                                                                                 elmbyval));
1606                         s2 = DatumGetFloat8(FunctionCall4(&oprselproc,
1607                                                                                           PointerGetDatum(root),
1608                                                                                           ObjectIdGetDatum(operator),
1609                                                                                           PointerGetDatum(args),
1610                                                                                           selarg4));
1611                         if (useOr)
1612                                 s1 = s1 + s2 - s1 * s2;
1613                         else
1614                                 s1 = s1 * s2;
1615                 }
1616         }
1617         else if (rightop && IsA(rightop, ArrayExpr) &&
1618                          !((ArrayExpr *) rightop)->multidims)
1619         {
1620                 ArrayExpr  *arrayexpr = (ArrayExpr *) rightop;
1621                 int16           elmlen;
1622                 bool            elmbyval;
1623                 ListCell   *l;
1624
1625                 get_typlenbyval(arrayexpr->element_typeid,
1626                                                 &elmlen, &elmbyval);
1627                 s1 = useOr ? 0.0 : 1.0;
1628                 foreach(l, arrayexpr->elements)
1629                 {
1630                         Node       *elem = (Node *) lfirst(l);
1631                         List       *args;
1632                         Selectivity s2;
1633
1634                         /*
1635                          * Theoretically, if elem isn't of nominal_element_type we should
1636                          * insert a RelabelType, but it seems unlikely that any operator
1637                          * estimation function would really care ...
1638                          */
1639                         args = list_make2(leftop, elem);
1640                         s2 = DatumGetFloat8(FunctionCall4(&oprselproc,
1641                                                                                           PointerGetDatum(root),
1642                                                                                           ObjectIdGetDatum(operator),
1643                                                                                           PointerGetDatum(args),
1644                                                                                           selarg4));
1645                         if (useOr)
1646                                 s1 = s1 + s2 - s1 * s2;
1647                         else
1648                                 s1 = s1 * s2;
1649                 }
1650         }
1651         else
1652         {
1653                 CaseTestExpr *dummyexpr;
1654                 List       *args;
1655                 Selectivity s2;
1656                 int                     i;
1657
1658                 /*
1659                  * We need a dummy rightop to pass to the operator selectivity
1660                  * routine.  It can be pretty much anything that doesn't look like a
1661                  * constant; CaseTestExpr is a convenient choice.
1662                  */
1663                 dummyexpr = makeNode(CaseTestExpr);
1664                 dummyexpr->typeId = nominal_element_type;
1665                 dummyexpr->typeMod = -1;
1666                 args = list_make2(leftop, dummyexpr);
1667                 s2 = DatumGetFloat8(FunctionCall4(&oprselproc,
1668                                                                                   PointerGetDatum(root),
1669                                                                                   ObjectIdGetDatum(operator),
1670                                                                                   PointerGetDatum(args),
1671                                                                                   selarg4));
1672                 s1 = useOr ? 0.0 : 1.0;
1673
1674                 /*
1675                  * Arbitrarily assume 10 elements in the eventual array value (see
1676                  * also estimate_array_length)
1677                  */
1678                 for (i = 0; i < 10; i++)
1679                 {
1680                         if (useOr)
1681                                 s1 = s1 + s2 - s1 * s2;
1682                         else
1683                                 s1 = s1 * s2;
1684                 }
1685         }
1686
1687         /* result should be in range, but make sure... */
1688         CLAMP_PROBABILITY(s1);
1689
1690         return s1;
1691 }
1692
1693 /*
1694  * Estimate number of elements in the array yielded by an expression.
1695  *
1696  * It's important that this agree with scalararraysel.
1697  */
1698 int
1699 estimate_array_length(Node *arrayexpr)
1700 {
1701         /* look through any binary-compatible relabeling of arrayexpr */
1702         arrayexpr = strip_array_coercion(arrayexpr);
1703
1704         if (arrayexpr && IsA(arrayexpr, Const))
1705         {
1706                 Datum           arraydatum = ((Const *) arrayexpr)->constvalue;
1707                 bool            arrayisnull = ((Const *) arrayexpr)->constisnull;
1708                 ArrayType  *arrayval;
1709
1710                 if (arrayisnull)
1711                         return 0;
1712                 arrayval = DatumGetArrayTypeP(arraydatum);
1713                 return ArrayGetNItems(ARR_NDIM(arrayval), ARR_DIMS(arrayval));
1714         }
1715         else if (arrayexpr && IsA(arrayexpr, ArrayExpr) &&
1716                          !((ArrayExpr *) arrayexpr)->multidims)
1717         {
1718                 return list_length(((ArrayExpr *) arrayexpr)->elements);
1719         }
1720         else
1721         {
1722                 /* default guess --- see also scalararraysel */
1723                 return 10;
1724         }
1725 }
1726
1727 /*
1728  *              rowcomparesel           - Selectivity of RowCompareExpr Node.
1729  *
1730  * We estimate RowCompare selectivity by considering just the first (high
1731  * order) columns, which makes it equivalent to an ordinary OpExpr.  While
1732  * this estimate could be refined by considering additional columns, it
1733  * seems unlikely that we could do a lot better without multi-column
1734  * statistics.
1735  */
1736 Selectivity
1737 rowcomparesel(PlannerInfo *root,
1738                           RowCompareExpr *clause,
1739                           int varRelid, JoinType jointype)
1740 {
1741         Selectivity s1;
1742         Oid                     opno = linitial_oid(clause->opnos);
1743         List       *opargs;
1744         bool            is_join_clause;
1745
1746         /* Build equivalent arg list for single operator */
1747         opargs = list_make2(linitial(clause->largs), linitial(clause->rargs));
1748
1749         /* Decide if it's a join clause, same as for OpExpr */
1750         if (varRelid != 0)
1751         {
1752                 /*
1753                  * If we are considering a nestloop join then all clauses are
1754                  * restriction clauses, since we are only interested in the one
1755                  * relation.
1756                  */
1757                 is_join_clause = false;
1758         }
1759         else
1760         {
1761                 /*
1762                  * Otherwise, it's a join if there's more than one relation used.
1763                  * Notice we ignore the low-order columns here.
1764                  */
1765                 is_join_clause = (NumRelids((Node *) opargs) > 1);
1766         }
1767
1768         if (is_join_clause)
1769         {
1770                 /* Estimate selectivity for a join clause. */
1771                 s1 = join_selectivity(root, opno,
1772                                                           opargs,
1773                                                           jointype);
1774         }
1775         else
1776         {
1777                 /* Estimate selectivity for a restriction clause. */
1778                 s1 = restriction_selectivity(root, opno,
1779                                                                          opargs,
1780                                                                          varRelid);
1781         }
1782
1783         return s1;
1784 }
1785
1786 /*
1787  *              eqjoinsel               - Join selectivity of "="
1788  */
1789 Datum
1790 eqjoinsel(PG_FUNCTION_ARGS)
1791 {
1792         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
1793         Oid                     operator = PG_GETARG_OID(1);
1794         List       *args = (List *) PG_GETARG_POINTER(2);
1795         JoinType        jointype = (JoinType) PG_GETARG_INT16(3);
1796         double          selec;
1797         VariableStatData vardata1;
1798         VariableStatData vardata2;
1799         double          nd1;
1800         double          nd2;
1801         Form_pg_statistic stats1 = NULL;
1802         Form_pg_statistic stats2 = NULL;
1803         bool            have_mcvs1 = false;
1804         Datum      *values1 = NULL;
1805         int                     nvalues1 = 0;
1806         float4     *numbers1 = NULL;
1807         int                     nnumbers1 = 0;
1808         bool            have_mcvs2 = false;
1809         Datum      *values2 = NULL;
1810         int                     nvalues2 = 0;
1811         float4     *numbers2 = NULL;
1812         int                     nnumbers2 = 0;
1813
1814         get_join_variables(root, args, &vardata1, &vardata2);
1815
1816         nd1 = get_variable_numdistinct(&vardata1);
1817         nd2 = get_variable_numdistinct(&vardata2);
1818
1819         if (HeapTupleIsValid(vardata1.statsTuple))
1820         {
1821                 stats1 = (Form_pg_statistic) GETSTRUCT(vardata1.statsTuple);
1822                 have_mcvs1 = get_attstatsslot(vardata1.statsTuple,
1823                                                                           vardata1.atttype,
1824                                                                           vardata1.atttypmod,
1825                                                                           STATISTIC_KIND_MCV,
1826                                                                           InvalidOid,
1827                                                                           &values1, &nvalues1,
1828                                                                           &numbers1, &nnumbers1);
1829         }
1830
1831         if (HeapTupleIsValid(vardata2.statsTuple))
1832         {
1833                 stats2 = (Form_pg_statistic) GETSTRUCT(vardata2.statsTuple);
1834                 have_mcvs2 = get_attstatsslot(vardata2.statsTuple,
1835                                                                           vardata2.atttype,
1836                                                                           vardata2.atttypmod,
1837                                                                           STATISTIC_KIND_MCV,
1838                                                                           InvalidOid,
1839                                                                           &values2, &nvalues2,
1840                                                                           &numbers2, &nnumbers2);
1841         }
1842
1843         if (have_mcvs1 && have_mcvs2)
1844         {
1845                 /*
1846                  * We have most-common-value lists for both relations.  Run through
1847                  * the lists to see which MCVs actually join to each other with the
1848                  * given operator.      This allows us to determine the exact join
1849                  * selectivity for the portion of the relations represented by the MCV
1850                  * lists.  We still have to estimate for the remaining population, but
1851                  * in a skewed distribution this gives us a big leg up in accuracy.
1852                  * For motivation see the analysis in Y. Ioannidis and S.
1853                  * Christodoulakis, "On the propagation of errors in the size of join
1854                  * results", Technical Report 1018, Computer Science Dept., University
1855                  * of Wisconsin, Madison, March 1991 (available from ftp.cs.wisc.edu).
1856                  */
1857                 FmgrInfo        eqproc;
1858                 bool       *hasmatch1;
1859                 bool       *hasmatch2;
1860                 double          nullfrac1 = stats1->stanullfrac;
1861                 double          nullfrac2 = stats2->stanullfrac;
1862                 double          matchprodfreq,
1863                                         matchfreq1,
1864                                         matchfreq2,
1865                                         unmatchfreq1,
1866                                         unmatchfreq2,
1867                                         otherfreq1,
1868                                         otherfreq2,
1869                                         totalsel1,
1870                                         totalsel2;
1871                 int                     i,
1872                                         nmatches;
1873
1874                 fmgr_info(get_opcode(operator), &eqproc);
1875                 hasmatch1 = (bool *) palloc0(nvalues1 * sizeof(bool));
1876                 hasmatch2 = (bool *) palloc0(nvalues2 * sizeof(bool));
1877
1878                 /*
1879                  * If we are doing any variant of JOIN_IN, pretend all the values of
1880                  * the righthand relation are unique (ie, act as if it's been
1881                  * DISTINCT'd).
1882                  *
1883                  * NOTE: it might seem that we should unique-ify the lefthand input
1884                  * when considering JOIN_REVERSE_IN.  But this is not so, because the
1885                  * join clause we've been handed has not been commuted from the way
1886                  * the parser originally wrote it.      We know that the unique side of
1887                  * the IN clause is *always* on the right.
1888                  *
1889                  * NOTE: it would be dangerous to try to be smart about JOIN_LEFT or
1890                  * JOIN_RIGHT here, because we do not have enough information to
1891                  * determine which var is really on which side of the join. Perhaps
1892                  * someday we should pass in more information.
1893                  */
1894                 if (jointype == JOIN_IN ||
1895                         jointype == JOIN_REVERSE_IN ||
1896                         jointype == JOIN_UNIQUE_INNER ||
1897                         jointype == JOIN_UNIQUE_OUTER)
1898                 {
1899                         float4          oneovern = 1.0 / nd2;
1900
1901                         for (i = 0; i < nvalues2; i++)
1902                                 numbers2[i] = oneovern;
1903                         nullfrac2 = oneovern;
1904                 }
1905
1906                 /*
1907                  * Note we assume that each MCV will match at most one member of the
1908                  * other MCV list.      If the operator isn't really equality, there could
1909                  * be multiple matches --- but we don't look for them, both for speed
1910                  * and because the math wouldn't add up...
1911                  */
1912                 matchprodfreq = 0.0;
1913                 nmatches = 0;
1914                 for (i = 0; i < nvalues1; i++)
1915                 {
1916                         int                     j;
1917
1918                         for (j = 0; j < nvalues2; j++)
1919                         {
1920                                 if (hasmatch2[j])
1921                                         continue;
1922                                 if (DatumGetBool(FunctionCall2(&eqproc,
1923                                                                                            values1[i],
1924                                                                                            values2[j])))
1925                                 {
1926                                         hasmatch1[i] = hasmatch2[j] = true;
1927                                         matchprodfreq += numbers1[i] * numbers2[j];
1928                                         nmatches++;
1929                                         break;
1930                                 }
1931                         }
1932                 }
1933                 CLAMP_PROBABILITY(matchprodfreq);
1934                 /* Sum up frequencies of matched and unmatched MCVs */
1935                 matchfreq1 = unmatchfreq1 = 0.0;
1936                 for (i = 0; i < nvalues1; i++)
1937                 {
1938                         if (hasmatch1[i])
1939                                 matchfreq1 += numbers1[i];
1940                         else
1941                                 unmatchfreq1 += numbers1[i];
1942                 }
1943                 CLAMP_PROBABILITY(matchfreq1);
1944                 CLAMP_PROBABILITY(unmatchfreq1);
1945                 matchfreq2 = unmatchfreq2 = 0.0;
1946                 for (i = 0; i < nvalues2; i++)
1947                 {
1948                         if (hasmatch2[i])
1949                                 matchfreq2 += numbers2[i];
1950                         else
1951                                 unmatchfreq2 += numbers2[i];
1952                 }
1953                 CLAMP_PROBABILITY(matchfreq2);
1954                 CLAMP_PROBABILITY(unmatchfreq2);
1955                 pfree(hasmatch1);
1956                 pfree(hasmatch2);
1957
1958                 /*
1959                  * Compute total frequency of non-null values that are not in the MCV
1960                  * lists.
1961                  */
1962                 otherfreq1 = 1.0 - nullfrac1 - matchfreq1 - unmatchfreq1;
1963                 otherfreq2 = 1.0 - nullfrac2 - matchfreq2 - unmatchfreq2;
1964                 CLAMP_PROBABILITY(otherfreq1);
1965                 CLAMP_PROBABILITY(otherfreq2);
1966
1967                 /*
1968                  * We can estimate the total selectivity from the point of view of
1969                  * relation 1 as: the known selectivity for matched MCVs, plus
1970                  * unmatched MCVs that are assumed to match against random members of
1971                  * relation 2's non-MCV population, plus non-MCV values that are
1972                  * assumed to match against random members of relation 2's unmatched
1973                  * MCVs plus non-MCV values.
1974                  */
1975                 totalsel1 = matchprodfreq;
1976                 if (nd2 > nvalues2)
1977                         totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - nvalues2);
1978                 if (nd2 > nmatches)
1979                         totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) /
1980                                 (nd2 - nmatches);
1981                 /* Same estimate from the point of view of relation 2. */
1982                 totalsel2 = matchprodfreq;
1983                 if (nd1 > nvalues1)
1984                         totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - nvalues1);
1985                 if (nd1 > nmatches)
1986                         totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) /
1987                                 (nd1 - nmatches);
1988
1989                 /*
1990                  * Use the smaller of the two estimates.  This can be justified in
1991                  * essentially the same terms as given below for the no-stats case: to
1992                  * a first approximation, we are estimating from the point of view of
1993                  * the relation with smaller nd.
1994                  */
1995                 selec = (totalsel1 < totalsel2) ? totalsel1 : totalsel2;
1996         }
1997         else
1998         {
1999                 /*
2000                  * We do not have MCV lists for both sides.  Estimate the join
2001                  * selectivity as MIN(1/nd1,1/nd2)*(1-nullfrac1)*(1-nullfrac2). This
2002                  * is plausible if we assume that the join operator is strict and the
2003                  * non-null values are about equally distributed: a given non-null
2004                  * tuple of rel1 will join to either zero or N2*(1-nullfrac2)/nd2 rows
2005                  * of rel2, so total join rows are at most
2006                  * N1*(1-nullfrac1)*N2*(1-nullfrac2)/nd2 giving a join selectivity of
2007                  * not more than (1-nullfrac1)*(1-nullfrac2)/nd2. By the same logic it
2008                  * is not more than (1-nullfrac1)*(1-nullfrac2)/nd1, so the expression
2009                  * with MIN() is an upper bound.  Using the MIN() means we estimate
2010                  * from the point of view of the relation with smaller nd (since the
2011                  * larger nd is determining the MIN).  It is reasonable to assume that
2012                  * most tuples in this rel will have join partners, so the bound is
2013                  * probably reasonably tight and should be taken as-is.
2014                  *
2015                  * XXX Can we be smarter if we have an MCV list for just one side? It
2016                  * seems that if we assume equal distribution for the other side, we
2017                  * end up with the same answer anyway.
2018                  */
2019                 double          nullfrac1 = stats1 ? stats1->stanullfrac : 0.0;
2020                 double          nullfrac2 = stats2 ? stats2->stanullfrac : 0.0;
2021
2022                 selec = (1.0 - nullfrac1) * (1.0 - nullfrac2);
2023                 if (nd1 > nd2)
2024                         selec /= nd1;
2025                 else
2026                         selec /= nd2;
2027         }
2028
2029         if (have_mcvs1)
2030                 free_attstatsslot(vardata1.atttype, values1, nvalues1,
2031                                                   numbers1, nnumbers1);
2032         if (have_mcvs2)
2033                 free_attstatsslot(vardata2.atttype, values2, nvalues2,
2034                                                   numbers2, nnumbers2);
2035
2036         ReleaseVariableStats(vardata1);
2037         ReleaseVariableStats(vardata2);
2038
2039         CLAMP_PROBABILITY(selec);
2040
2041         PG_RETURN_FLOAT8((float8) selec);
2042 }
2043
2044 /*
2045  *              neqjoinsel              - Join selectivity of "!="
2046  */
2047 Datum
2048 neqjoinsel(PG_FUNCTION_ARGS)
2049 {
2050         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
2051         Oid                     operator = PG_GETARG_OID(1);
2052         List       *args = (List *) PG_GETARG_POINTER(2);
2053         JoinType        jointype = (JoinType) PG_GETARG_INT16(3);
2054         Oid                     eqop;
2055         float8          result;
2056
2057         /*
2058          * We want 1 - eqjoinsel() where the equality operator is the one
2059          * associated with this != operator, that is, its negator.
2060          */
2061         eqop = get_negator(operator);
2062         if (eqop)
2063         {
2064                 result = DatumGetFloat8(DirectFunctionCall4(eqjoinsel,
2065                                                                                                         PointerGetDatum(root),
2066                                                                                                         ObjectIdGetDatum(eqop),
2067                                                                                                         PointerGetDatum(args),
2068                                                                                                         Int16GetDatum(jointype)));
2069         }
2070         else
2071         {
2072                 /* Use default selectivity (should we raise an error instead?) */
2073                 result = DEFAULT_EQ_SEL;
2074         }
2075         result = 1.0 - result;
2076         PG_RETURN_FLOAT8(result);
2077 }
2078
2079 /*
2080  *              scalarltjoinsel - Join selectivity of "<" and "<=" for scalars
2081  */
2082 Datum
2083 scalarltjoinsel(PG_FUNCTION_ARGS)
2084 {
2085         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
2086 }
2087
2088 /*
2089  *              scalargtjoinsel - Join selectivity of ">" and ">=" for scalars
2090  */
2091 Datum
2092 scalargtjoinsel(PG_FUNCTION_ARGS)
2093 {
2094         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
2095 }
2096
2097 /*
2098  *              regexeqjoinsel  - Join selectivity of regular-expression pattern match.
2099  */
2100 Datum
2101 regexeqjoinsel(PG_FUNCTION_ARGS)
2102 {
2103         PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
2104 }
2105
2106 /*
2107  *              icregexeqjoinsel        - Join selectivity of case-insensitive regex match.
2108  */
2109 Datum
2110 icregexeqjoinsel(PG_FUNCTION_ARGS)
2111 {
2112         PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
2113 }
2114
2115 /*
2116  *              likejoinsel                     - Join selectivity of LIKE pattern match.
2117  */
2118 Datum
2119 likejoinsel(PG_FUNCTION_ARGS)
2120 {
2121         PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
2122 }
2123
2124 /*
2125  *              iclikejoinsel                   - Join selectivity of ILIKE pattern match.
2126  */
2127 Datum
2128 iclikejoinsel(PG_FUNCTION_ARGS)
2129 {
2130         PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
2131 }
2132
2133 /*
2134  *              regexnejoinsel  - Join selectivity of regex non-match.
2135  */
2136 Datum
2137 regexnejoinsel(PG_FUNCTION_ARGS)
2138 {
2139         float8          result;
2140
2141         result = DatumGetFloat8(regexeqjoinsel(fcinfo));
2142         result = 1.0 - result;
2143         PG_RETURN_FLOAT8(result);
2144 }
2145
2146 /*
2147  *              icregexnejoinsel        - Join selectivity of case-insensitive regex non-match.
2148  */
2149 Datum
2150 icregexnejoinsel(PG_FUNCTION_ARGS)
2151 {
2152         float8          result;
2153
2154         result = DatumGetFloat8(icregexeqjoinsel(fcinfo));
2155         result = 1.0 - result;
2156         PG_RETURN_FLOAT8(result);
2157 }
2158
2159 /*
2160  *              nlikejoinsel            - Join selectivity of LIKE pattern non-match.
2161  */
2162 Datum
2163 nlikejoinsel(PG_FUNCTION_ARGS)
2164 {
2165         float8          result;
2166
2167         result = DatumGetFloat8(likejoinsel(fcinfo));
2168         result = 1.0 - result;
2169         PG_RETURN_FLOAT8(result);
2170 }
2171
2172 /*
2173  *              icnlikejoinsel          - Join selectivity of ILIKE pattern non-match.
2174  */
2175 Datum
2176 icnlikejoinsel(PG_FUNCTION_ARGS)
2177 {
2178         float8          result;
2179
2180         result = DatumGetFloat8(iclikejoinsel(fcinfo));
2181         result = 1.0 - result;
2182         PG_RETURN_FLOAT8(result);
2183 }
2184
2185 /*
2186  * mergejoinscansel                     - Scan selectivity of merge join.
2187  *
2188  * A merge join will stop as soon as it exhausts either input stream.
2189  * Therefore, if we can estimate the ranges of both input variables,
2190  * we can estimate how much of the input will actually be read.  This
2191  * can have a considerable impact on the cost when using indexscans.
2192  *
2193  * clause should be a clause already known to be mergejoinable.  opfamily,
2194  * strategy, and nulls_first specify the sort ordering being used.
2195  *
2196  * *leftscan is set to the fraction of the left-hand variable expected
2197  * to be scanned (0 to 1), and similarly *rightscan for the right-hand
2198  * variable.
2199  */
2200 void
2201 mergejoinscansel(PlannerInfo *root, Node *clause,
2202                                  Oid opfamily, int strategy, bool nulls_first,
2203                                  Selectivity *leftscan,
2204                                  Selectivity *rightscan)
2205 {
2206         Node       *left,
2207                            *right;
2208         VariableStatData leftvar,
2209                                 rightvar;
2210         int                     op_strategy;
2211         Oid                     op_lefttype;
2212         Oid                     op_righttype;
2213         bool            op_recheck;
2214         Oid                     opno,
2215                                 lsortop,
2216                                 rsortop,
2217                                 leop,
2218                                 revleop;
2219         Datum           leftmax,
2220                                 rightmax;
2221         double          selec;
2222
2223         /* Set default results if we can't figure anything out. */
2224         *leftscan = *rightscan = 1.0;
2225
2226         /* Deconstruct the merge clause */
2227         if (!is_opclause(clause))
2228                 return;                                 /* shouldn't happen */
2229         opno = ((OpExpr *) clause)->opno;
2230         left = get_leftop((Expr *) clause);
2231         right = get_rightop((Expr *) clause);
2232         if (!right)
2233                 return;                                 /* shouldn't happen */
2234
2235         /* Look for stats for the inputs */
2236         examine_variable(root, left, 0, &leftvar);
2237         examine_variable(root, right, 0, &rightvar);
2238
2239         /* Extract the operator's declared left/right datatypes */
2240         get_op_opfamily_properties(opno, opfamily,
2241                                                            &op_strategy,
2242                                                            &op_lefttype,
2243                                                            &op_righttype,
2244                                                            &op_recheck);
2245         Assert(op_strategy == BTEqualStrategyNumber);
2246         Assert(!op_recheck);
2247
2248         /*
2249          * Look up the various operators we need.  If we don't find them all,
2250          * it probably means the opfamily is broken, but we cope anyway.
2251          */
2252         switch (strategy)
2253         {
2254                 case BTLessStrategyNumber:
2255                         lsortop = get_opfamily_member(opfamily, op_lefttype, op_lefttype,
2256                                                                                   BTLessStrategyNumber);
2257                         rsortop = get_opfamily_member(opfamily, op_righttype, op_righttype,
2258                                                                                   BTLessStrategyNumber);
2259                         leop = get_opfamily_member(opfamily, op_lefttype, op_righttype,
2260                                                                            BTLessEqualStrategyNumber);
2261                         revleop = get_opfamily_member(opfamily, op_righttype, op_lefttype,
2262                                                                                   BTLessEqualStrategyNumber);
2263                         break;
2264                 case BTGreaterStrategyNumber:
2265                         /* descending-order case */
2266                         lsortop = get_opfamily_member(opfamily, op_lefttype, op_lefttype,
2267                                                                                   BTGreaterStrategyNumber);
2268                         rsortop = get_opfamily_member(opfamily, op_righttype, op_righttype,
2269                                                                                   BTGreaterStrategyNumber);
2270                         leop = get_opfamily_member(opfamily, op_lefttype, op_righttype,
2271                                                                            BTGreaterEqualStrategyNumber);
2272                         revleop = get_opfamily_member(opfamily, op_righttype, op_lefttype,
2273                                                                                   BTGreaterEqualStrategyNumber);
2274                         break;
2275                 default:
2276                         goto fail;                      /* shouldn't get here */
2277         }
2278
2279         if (!OidIsValid(lsortop) ||
2280                 !OidIsValid(rsortop) ||
2281                 !OidIsValid(leop) ||
2282                 !OidIsValid(revleop))
2283                 goto fail;                              /* insufficient info in catalogs */
2284
2285         /* Try to get maximum values of both inputs */
2286         if (!get_variable_maximum(root, &leftvar, lsortop, &leftmax))
2287                 goto fail;                              /* no max available from stats */
2288
2289         if (!get_variable_maximum(root, &rightvar, rsortop, &rightmax))
2290                 goto fail;                              /* no max available from stats */
2291
2292         /*
2293          * Now, the fraction of the left variable that will be scanned is the
2294          * fraction that's <= the right-side maximum value.  But only believe
2295          * non-default estimates, else stick with our 1.0.  Also, if the sort
2296          * order is nulls-first, we're going to have to read over any nulls too.
2297          */
2298         selec = scalarineqsel(root, leop, false, &leftvar,
2299                                                   rightmax, op_righttype);
2300         if (selec != DEFAULT_INEQ_SEL)
2301         {
2302                 if (nulls_first && HeapTupleIsValid(leftvar.statsTuple))
2303                 {
2304                         Form_pg_statistic stats;
2305
2306                         stats = (Form_pg_statistic) GETSTRUCT(leftvar.statsTuple);
2307                         selec += stats->stanullfrac;
2308                         CLAMP_PROBABILITY(selec);
2309                 }
2310                 *leftscan = selec;
2311         }
2312
2313         /* And similarly for the right variable. */
2314         selec = scalarineqsel(root, revleop, false, &rightvar,
2315                                                   leftmax, op_lefttype);
2316         if (selec != DEFAULT_INEQ_SEL)
2317         {
2318                 if (nulls_first && HeapTupleIsValid(rightvar.statsTuple))
2319                 {
2320                         Form_pg_statistic stats;
2321
2322                         stats = (Form_pg_statistic) GETSTRUCT(rightvar.statsTuple);
2323                         selec += stats->stanullfrac;
2324                         CLAMP_PROBABILITY(selec);
2325                 }
2326                 *rightscan = selec;
2327         }
2328
2329         /*
2330          * Only one of the two fractions can really be less than 1.0; believe the
2331          * smaller estimate and reset the other one to exactly 1.0.  If we get
2332          * exactly equal estimates (as can easily happen with self-joins), believe
2333          * neither.
2334          */
2335         if (*leftscan > *rightscan)
2336                 *leftscan = 1.0;
2337         else if (*leftscan < *rightscan)
2338                 *rightscan = 1.0;
2339         else
2340                 *leftscan = *rightscan = 1.0;
2341
2342 fail:
2343         ReleaseVariableStats(leftvar);
2344         ReleaseVariableStats(rightvar);
2345 }
2346
2347
2348 /*
2349  * Helper routine for estimate_num_groups: add an item to a list of
2350  * GroupVarInfos, but only if it's not known equal to any of the existing
2351  * entries.
2352  */
2353 typedef struct
2354 {
2355         Node       *var;                        /* might be an expression, not just a Var */
2356         RelOptInfo *rel;                        /* relation it belongs to */
2357         double          ndistinct;              /* # distinct values */
2358 } GroupVarInfo;
2359
2360 static List *
2361 add_unique_group_var(PlannerInfo *root, List *varinfos,
2362                                          Node *var, VariableStatData *vardata)
2363 {
2364         GroupVarInfo *varinfo;
2365         double          ndistinct;
2366         ListCell   *lc;
2367
2368         ndistinct = get_variable_numdistinct(vardata);
2369
2370         /* cannot use foreach here because of possible list_delete */
2371         lc = list_head(varinfos);
2372         while (lc)
2373         {
2374                 varinfo = (GroupVarInfo *) lfirst(lc);
2375
2376                 /* must advance lc before list_delete possibly pfree's it */
2377                 lc = lnext(lc);
2378
2379                 /* Drop exact duplicates */
2380                 if (equal(var, varinfo->var))
2381                         return varinfos;
2382
2383                 /*
2384                  * Drop known-equal vars, but only if they belong to different
2385                  * relations (see comments for estimate_num_groups)
2386                  */
2387                 if (vardata->rel != varinfo->rel &&
2388                         exprs_known_equal(root, var, varinfo->var))
2389                 {
2390                         if (varinfo->ndistinct <= ndistinct)
2391                         {
2392                                 /* Keep older item, forget new one */
2393                                 return varinfos;
2394                         }
2395                         else
2396                         {
2397                                 /* Delete the older item */
2398                                 varinfos = list_delete_ptr(varinfos, varinfo);
2399                         }
2400                 }
2401         }
2402
2403         varinfo = (GroupVarInfo *) palloc(sizeof(GroupVarInfo));
2404
2405         varinfo->var = var;
2406         varinfo->rel = vardata->rel;
2407         varinfo->ndistinct = ndistinct;
2408         varinfos = lappend(varinfos, varinfo);
2409         return varinfos;
2410 }
2411
2412 /*
2413  * estimate_num_groups          - Estimate number of groups in a grouped query
2414  *
2415  * Given a query having a GROUP BY clause, estimate how many groups there
2416  * will be --- ie, the number of distinct combinations of the GROUP BY
2417  * expressions.
2418  *
2419  * This routine is also used to estimate the number of rows emitted by
2420  * a DISTINCT filtering step; that is an isomorphic problem.  (Note:
2421  * actually, we only use it for DISTINCT when there's no grouping or
2422  * aggregation ahead of the DISTINCT.)
2423  *
2424  * Inputs:
2425  *      root - the query
2426  *      groupExprs - list of expressions being grouped by
2427  *      input_rows - number of rows estimated to arrive at the group/unique
2428  *              filter step
2429  *
2430  * Given the lack of any cross-correlation statistics in the system, it's
2431  * impossible to do anything really trustworthy with GROUP BY conditions
2432  * involving multiple Vars.  We should however avoid assuming the worst
2433  * case (all possible cross-product terms actually appear as groups) since
2434  * very often the grouped-by Vars are highly correlated.  Our current approach
2435  * is as follows:
2436  *      1.      Reduce the given expressions to a list of unique Vars used.  For
2437  *              example, GROUP BY a, a + b is treated the same as GROUP BY a, b.
2438  *              It is clearly correct not to count the same Var more than once.
2439  *              It is also reasonable to treat f(x) the same as x: f() cannot
2440  *              increase the number of distinct values (unless it is volatile,
2441  *              which we consider unlikely for grouping), but it probably won't
2442  *              reduce the number of distinct values much either.
2443  *              As a special case, if a GROUP BY expression can be matched to an
2444  *              expressional index for which we have statistics, then we treat the
2445  *              whole expression as though it were just a Var.
2446  *      2.      If the list contains Vars of different relations that are known equal
2447  *              due to equivalence classes, then drop all but one of the Vars from each
2448  *              known-equal set, keeping the one with smallest estimated # of values
2449  *              (since the extra values of the others can't appear in joined rows).
2450  *              Note the reason we only consider Vars of different relations is that
2451  *              if we considered ones of the same rel, we'd be double-counting the
2452  *              restriction selectivity of the equality in the next step.
2453  *      3.      For Vars within a single source rel, we multiply together the numbers
2454  *              of values, clamp to the number of rows in the rel (divided by 10 if
2455  *              more than one Var), and then multiply by the selectivity of the
2456  *              restriction clauses for that rel.  When there's more than one Var,
2457  *              the initial product is probably too high (it's the worst case) but
2458  *              clamping to a fraction of the rel's rows seems to be a helpful
2459  *              heuristic for not letting the estimate get out of hand.  (The factor
2460  *              of 10 is derived from pre-Postgres-7.4 practice.)  Multiplying
2461  *              by the restriction selectivity is effectively assuming that the
2462  *              restriction clauses are independent of the grouping, which is a crummy
2463  *              assumption, but it's hard to do better.
2464  *      4.      If there are Vars from multiple rels, we repeat step 3 for each such
2465  *              rel, and multiply the results together.
2466  * Note that rels not containing grouped Vars are ignored completely, as are
2467  * join clauses.  Such rels cannot increase the number of groups, and we
2468  * assume such clauses do not reduce the number either (somewhat bogus,
2469  * but we don't have the info to do better).
2470  */
2471 double
2472 estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows)
2473 {
2474         List       *varinfos = NIL;
2475         double          numdistinct;
2476         ListCell   *l;
2477
2478         /* We should not be called unless query has GROUP BY (or DISTINCT) */
2479         Assert(groupExprs != NIL);
2480
2481         /*
2482          * Steps 1/2: find the unique Vars used, treating an expression as a Var
2483          * if we can find stats for it.  For each one, record the statistical
2484          * estimate of number of distinct values (total in its table, without
2485          * regard for filtering).
2486          */
2487         foreach(l, groupExprs)
2488         {
2489                 Node       *groupexpr = (Node *) lfirst(l);
2490                 VariableStatData vardata;
2491                 List       *varshere;
2492                 ListCell   *l2;
2493
2494                 /*
2495                  * If examine_variable is able to deduce anything about the GROUP BY
2496                  * expression, treat it as a single variable even if it's really more
2497                  * complicated.
2498                  */
2499                 examine_variable(root, groupexpr, 0, &vardata);
2500                 if (vardata.statsTuple != NULL || vardata.isunique)
2501                 {
2502                         varinfos = add_unique_group_var(root, varinfos,
2503                                                                                         groupexpr, &vardata);
2504                         ReleaseVariableStats(vardata);
2505                         continue;
2506                 }
2507                 ReleaseVariableStats(vardata);
2508
2509                 /*
2510                  * Else pull out the component Vars
2511                  */
2512                 varshere = pull_var_clause(groupexpr, false);
2513
2514                 /*
2515                  * If we find any variable-free GROUP BY item, then either it is a
2516                  * constant (and we can ignore it) or it contains a volatile function;
2517                  * in the latter case we punt and assume that each input row will
2518                  * yield a distinct group.
2519                  */
2520                 if (varshere == NIL)
2521                 {
2522                         if (contain_volatile_functions(groupexpr))
2523                                 return input_rows;
2524                         continue;
2525                 }
2526
2527                 /*
2528                  * Else add variables to varinfos list
2529                  */
2530                 foreach(l2, varshere)
2531                 {
2532                         Node       *var = (Node *) lfirst(l2);
2533
2534                         examine_variable(root, var, 0, &vardata);
2535                         varinfos = add_unique_group_var(root, varinfos, var, &vardata);
2536                         ReleaseVariableStats(vardata);
2537                 }
2538         }
2539
2540         /* If now no Vars, we must have an all-constant GROUP BY list. */
2541         if (varinfos == NIL)
2542                 return 1.0;
2543
2544         /*
2545          * Steps 3/4: group Vars by relation and estimate total numdistinct.
2546          *
2547          * For each iteration of the outer loop, we process the frontmost Var in
2548          * varinfos, plus all other Vars in the same relation.  We remove these
2549          * Vars from the newvarinfos list for the next iteration. This is the
2550          * easiest way to group Vars of same rel together.
2551          */
2552         numdistinct = 1.0;
2553
2554         do
2555         {
2556                 GroupVarInfo *varinfo1 = (GroupVarInfo *) linitial(varinfos);
2557                 RelOptInfo *rel = varinfo1->rel;
2558                 double          reldistinct = varinfo1->ndistinct;
2559                 double          relmaxndistinct = reldistinct;
2560                 int                     relvarcount = 1;
2561                 List       *newvarinfos = NIL;
2562
2563                 /*
2564                  * Get the product of numdistinct estimates of the Vars for this rel.
2565                  * Also, construct new varinfos list of remaining Vars.
2566                  */
2567                 for_each_cell(l, lnext(list_head(varinfos)))
2568                 {
2569                         GroupVarInfo *varinfo2 = (GroupVarInfo *) lfirst(l);
2570
2571                         if (varinfo2->rel == varinfo1->rel)
2572                         {
2573                                 reldistinct *= varinfo2->ndistinct;
2574                                 if (relmaxndistinct < varinfo2->ndistinct)
2575                                         relmaxndistinct = varinfo2->ndistinct;
2576                                 relvarcount++;
2577                         }
2578                         else
2579                         {
2580                                 /* not time to process varinfo2 yet */
2581                                 newvarinfos = lcons(varinfo2, newvarinfos);
2582                         }
2583                 }
2584
2585                 /*
2586                  * Sanity check --- don't divide by zero if empty relation.
2587                  */
2588                 Assert(rel->reloptkind == RELOPT_BASEREL);
2589                 if (rel->tuples > 0)
2590                 {
2591                         /*
2592                          * Clamp to size of rel, or size of rel / 10 if multiple Vars. The
2593                          * fudge factor is because the Vars are probably correlated but we
2594                          * don't know by how much.  We should never clamp to less than the
2595                          * largest ndistinct value for any of the Vars, though, since
2596                          * there will surely be at least that many groups.
2597                          */
2598                         double          clamp = rel->tuples;
2599
2600                         if (relvarcount > 1)
2601                         {
2602                                 clamp *= 0.1;
2603                                 if (clamp < relmaxndistinct)
2604                                 {
2605                                         clamp = relmaxndistinct;
2606                                         /* for sanity in case some ndistinct is too large: */
2607                                         if (clamp > rel->tuples)
2608                                                 clamp = rel->tuples;
2609                                 }
2610                         }
2611                         if (reldistinct > clamp)
2612                                 reldistinct = clamp;
2613
2614                         /*
2615                          * Multiply by restriction selectivity.
2616                          */
2617                         reldistinct *= rel->rows / rel->tuples;
2618
2619                         /*
2620                          * Update estimate of total distinct groups.
2621                          */
2622                         numdistinct *= reldistinct;
2623                 }
2624
2625                 varinfos = newvarinfos;
2626         } while (varinfos != NIL);
2627
2628         numdistinct = ceil(numdistinct);
2629
2630         /* Guard against out-of-range answers */
2631         if (numdistinct > input_rows)
2632                 numdistinct = input_rows;
2633         if (numdistinct < 1.0)
2634                 numdistinct = 1.0;
2635
2636         return numdistinct;
2637 }
2638
2639 /*
2640  * Estimate hash bucketsize fraction (ie, number of entries in a bucket
2641  * divided by total tuples in relation) if the specified expression is used
2642  * as a hash key.
2643  *
2644  * XXX This is really pretty bogus since we're effectively assuming that the
2645  * distribution of hash keys will be the same after applying restriction
2646  * clauses as it was in the underlying relation.  However, we are not nearly
2647  * smart enough to figure out how the restrict clauses might change the
2648  * distribution, so this will have to do for now.
2649  *
2650  * We are passed the number of buckets the executor will use for the given
2651  * input relation.      If the data were perfectly distributed, with the same
2652  * number of tuples going into each available bucket, then the bucketsize
2653  * fraction would be 1/nbuckets.  But this happy state of affairs will occur
2654  * only if (a) there are at least nbuckets distinct data values, and (b)
2655  * we have a not-too-skewed data distribution.  Otherwise the buckets will
2656  * be nonuniformly occupied.  If the other relation in the join has a key
2657  * distribution similar to this one's, then the most-loaded buckets are
2658  * exactly those that will be probed most often.  Therefore, the "average"
2659  * bucket size for costing purposes should really be taken as something close
2660  * to the "worst case" bucket size.  We try to estimate this by adjusting the
2661  * fraction if there are too few distinct data values, and then scaling up
2662  * by the ratio of the most common value's frequency to the average frequency.
2663  *
2664  * If no statistics are available, use a default estimate of 0.1.  This will
2665  * discourage use of a hash rather strongly if the inner relation is large,
2666  * which is what we want.  We do not want to hash unless we know that the
2667  * inner rel is well-dispersed (or the alternatives seem much worse).
2668  */
2669 Selectivity
2670 estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey, double nbuckets)
2671 {
2672         VariableStatData vardata;
2673         double          estfract,
2674                                 ndistinct,
2675                                 stanullfrac,
2676                                 mcvfreq,
2677                                 avgfreq;
2678         float4     *numbers;
2679         int                     nnumbers;
2680
2681         examine_variable(root, hashkey, 0, &vardata);
2682
2683         /* Get number of distinct values and fraction that are null */
2684         ndistinct = get_variable_numdistinct(&vardata);
2685
2686         if (HeapTupleIsValid(vardata.statsTuple))
2687         {
2688                 Form_pg_statistic stats;
2689
2690                 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
2691                 stanullfrac = stats->stanullfrac;
2692         }
2693         else
2694         {
2695                 /*
2696                  * Believe a default ndistinct only if it came from stats. Otherwise
2697                  * punt and return 0.1, per comments above.
2698                  */
2699                 if (ndistinct == DEFAULT_NUM_DISTINCT)
2700                 {
2701                         ReleaseVariableStats(vardata);
2702                         return (Selectivity) 0.1;
2703                 }
2704
2705                 stanullfrac = 0.0;
2706         }
2707
2708         /* Compute avg freq of all distinct data values in raw relation */
2709         avgfreq = (1.0 - stanullfrac) / ndistinct;
2710
2711         /*
2712          * Adjust ndistinct to account for restriction clauses.  Observe we are
2713          * assuming that the data distribution is affected uniformly by the
2714          * restriction clauses!
2715          *
2716          * XXX Possibly better way, but much more expensive: multiply by
2717          * selectivity of rel's restriction clauses that mention the target Var.
2718          */
2719         if (vardata.rel)
2720                 ndistinct *= vardata.rel->rows / vardata.rel->tuples;
2721
2722         /*
2723          * Initial estimate of bucketsize fraction is 1/nbuckets as long as the
2724          * number of buckets is less than the expected number of distinct values;
2725          * otherwise it is 1/ndistinct.
2726          */
2727         if (ndistinct > nbuckets)
2728                 estfract = 1.0 / nbuckets;
2729         else
2730                 estfract = 1.0 / ndistinct;
2731
2732         /*
2733          * Look up the frequency of the most common value, if available.
2734          */
2735         mcvfreq = 0.0;
2736
2737         if (HeapTupleIsValid(vardata.statsTuple))
2738         {
2739                 if (get_attstatsslot(vardata.statsTuple,
2740                                                          vardata.atttype, vardata.atttypmod,
2741                                                          STATISTIC_KIND_MCV, InvalidOid,
2742                                                          NULL, NULL, &numbers, &nnumbers))
2743                 {
2744                         /*
2745                          * The first MCV stat is for the most common value.
2746                          */
2747                         if (nnumbers > 0)
2748                                 mcvfreq = numbers[0];
2749                         free_attstatsslot(vardata.atttype, NULL, 0,
2750                                                           numbers, nnumbers);
2751                 }
2752         }
2753
2754         /*
2755          * Adjust estimated bucketsize upward to account for skewed distribution.
2756          */
2757         if (avgfreq > 0.0 && mcvfreq > avgfreq)
2758                 estfract *= mcvfreq / avgfreq;
2759
2760         /*
2761          * Clamp bucketsize to sane range (the above adjustment could easily
2762          * produce an out-of-range result).  We set the lower bound a little above
2763          * zero, since zero isn't a very sane result.
2764          */
2765         if (estfract < 1.0e-6)
2766                 estfract = 1.0e-6;
2767         else if (estfract > 1.0)
2768                 estfract = 1.0;
2769
2770         ReleaseVariableStats(vardata);
2771
2772         return (Selectivity) estfract;
2773 }
2774
2775
2776 /*-------------------------------------------------------------------------
2777  *
2778  * Support routines
2779  *
2780  *-------------------------------------------------------------------------
2781  */
2782
2783 /*
2784  * convert_to_scalar
2785  *        Convert non-NULL values of the indicated types to the comparison
2786  *        scale needed by scalarineqsel().
2787  *        Returns "true" if successful.
2788  *
2789  * XXX this routine is a hack: ideally we should look up the conversion
2790  * subroutines in pg_type.
2791  *
2792  * All numeric datatypes are simply converted to their equivalent
2793  * "double" values.  (NUMERIC values that are outside the range of "double"
2794  * are clamped to +/- HUGE_VAL.)
2795  *
2796  * String datatypes are converted by convert_string_to_scalar(),
2797  * which is explained below.  The reason why this routine deals with
2798  * three values at a time, not just one, is that we need it for strings.
2799  *
2800  * The bytea datatype is just enough different from strings that it has
2801  * to be treated separately.
2802  *
2803  * The several datatypes representing absolute times are all converted
2804  * to Timestamp, which is actually a double, and then we just use that
2805  * double value.  Note this will give correct results even for the "special"
2806  * values of Timestamp, since those are chosen to compare correctly;
2807  * see timestamp_cmp.
2808  *
2809  * The several datatypes representing relative times (intervals) are all
2810  * converted to measurements expressed in seconds.
2811  */
2812 static bool
2813 convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
2814                                   Datum lobound, Datum hibound, Oid boundstypid,
2815                                   double *scaledlobound, double *scaledhibound)
2816 {
2817         /*
2818          * Both the valuetypid and the boundstypid should exactly match the
2819          * declared input type(s) of the operator we are invoked for, so we just
2820          * error out if either is not recognized.
2821          *
2822          * XXX The histogram we are interpolating between points of could belong
2823          * to a column that's only binary-compatible with the declared type. In
2824          * essence we are assuming that the semantics of binary-compatible types
2825          * are enough alike that we can use a histogram generated with one type's
2826          * operators to estimate selectivity for the other's.  This is outright
2827          * wrong in some cases --- in particular signed versus unsigned
2828          * interpretation could trip us up.  But it's useful enough in the
2829          * majority of cases that we do it anyway.      Should think about more
2830          * rigorous ways to do it.
2831          */
2832         switch (valuetypid)
2833         {
2834                         /*
2835                          * Built-in numeric types
2836                          */
2837                 case BOOLOID:
2838                 case INT2OID:
2839                 case INT4OID:
2840                 case INT8OID:
2841                 case FLOAT4OID:
2842                 case FLOAT8OID:
2843                 case NUMERICOID:
2844                 case OIDOID:
2845                 case REGPROCOID:
2846                 case REGPROCEDUREOID:
2847                 case REGOPEROID:
2848                 case REGOPERATOROID:
2849                 case REGCLASSOID:
2850                 case REGTYPEOID:
2851                         *scaledvalue = convert_numeric_to_scalar(value, valuetypid);
2852                         *scaledlobound = convert_numeric_to_scalar(lobound, boundstypid);
2853                         *scaledhibound = convert_numeric_to_scalar(hibound, boundstypid);
2854                         return true;
2855
2856                         /*
2857                          * Built-in string types
2858                          */
2859                 case CHAROID:
2860                 case BPCHAROID:
2861                 case VARCHAROID:
2862                 case TEXTOID:
2863                 case NAMEOID:
2864                         {
2865                                 char       *valstr = convert_string_datum(value, valuetypid);
2866                                 char       *lostr = convert_string_datum(lobound, boundstypid);
2867                                 char       *histr = convert_string_datum(hibound, boundstypid);
2868
2869                                 convert_string_to_scalar(valstr, scaledvalue,
2870                                                                                  lostr, scaledlobound,
2871                                                                                  histr, scaledhibound);
2872                                 pfree(valstr);
2873                                 pfree(lostr);
2874                                 pfree(histr);
2875                                 return true;
2876                         }
2877
2878                         /*
2879                          * Built-in bytea type
2880                          */
2881                 case BYTEAOID:
2882                         {
2883                                 convert_bytea_to_scalar(value, scaledvalue,
2884                                                                                 lobound, scaledlobound,
2885                                                                                 hibound, scaledhibound);
2886                                 return true;
2887                         }
2888
2889                         /*
2890                          * Built-in time types
2891                          */
2892                 case TIMESTAMPOID:
2893                 case TIMESTAMPTZOID:
2894                 case ABSTIMEOID:
2895                 case DATEOID:
2896                 case INTERVALOID:
2897                 case RELTIMEOID:
2898                 case TINTERVALOID:
2899                 case TIMEOID:
2900                 case TIMETZOID:
2901                         *scaledvalue = convert_timevalue_to_scalar(value, valuetypid);
2902                         *scaledlobound = convert_timevalue_to_scalar(lobound, boundstypid);
2903                         *scaledhibound = convert_timevalue_to_scalar(hibound, boundstypid);
2904                         return true;
2905
2906                         /*
2907                          * Built-in network types
2908                          */
2909                 case INETOID:
2910                 case CIDROID:
2911                 case MACADDROID:
2912                         *scaledvalue = convert_network_to_scalar(value, valuetypid);
2913                         *scaledlobound = convert_network_to_scalar(lobound, boundstypid);
2914                         *scaledhibound = convert_network_to_scalar(hibound, boundstypid);
2915                         return true;
2916         }
2917         /* Don't know how to convert */
2918         *scaledvalue = *scaledlobound = *scaledhibound = 0;
2919         return false;
2920 }
2921
2922 /*
2923  * Do convert_to_scalar()'s work for any numeric data type.
2924  */
2925 static double
2926 convert_numeric_to_scalar(Datum value, Oid typid)
2927 {
2928         switch (typid)
2929         {
2930                 case BOOLOID:
2931                         return (double) DatumGetBool(value);
2932                 case INT2OID:
2933                         return (double) DatumGetInt16(value);
2934                 case INT4OID:
2935                         return (double) DatumGetInt32(value);
2936                 case INT8OID:
2937                         return (double) DatumGetInt64(value);
2938                 case FLOAT4OID:
2939                         return (double) DatumGetFloat4(value);
2940                 case FLOAT8OID:
2941                         return (double) DatumGetFloat8(value);
2942                 case NUMERICOID:
2943                         /* Note: out-of-range values will be clamped to +-HUGE_VAL */
2944                         return (double)
2945                                 DatumGetFloat8(DirectFunctionCall1(numeric_float8_no_overflow,
2946                                                                                                    value));
2947                 case OIDOID:
2948                 case REGPROCOID:
2949                 case REGPROCEDUREOID:
2950                 case REGOPEROID:
2951                 case REGOPERATOROID:
2952                 case REGCLASSOID:
2953                 case REGTYPEOID:
2954                         /* we can treat OIDs as integers... */
2955                         return (double) DatumGetObjectId(value);
2956         }
2957
2958         /*
2959          * Can't get here unless someone tries to use scalarltsel/scalargtsel on
2960          * an operator with one numeric and one non-numeric operand.
2961          */
2962         elog(ERROR, "unsupported type: %u", typid);
2963         return 0;
2964 }
2965
2966 /*
2967  * Do convert_to_scalar()'s work for any character-string data type.
2968  *
2969  * String datatypes are converted to a scale that ranges from 0 to 1,
2970  * where we visualize the bytes of the string as fractional digits.
2971  *
2972  * We do not want the base to be 256, however, since that tends to
2973  * generate inflated selectivity estimates; few databases will have
2974  * occurrences of all 256 possible byte values at each position.
2975  * Instead, use the smallest and largest byte values seen in the bounds
2976  * as the estimated range for each byte, after some fudging to deal with
2977  * the fact that we probably aren't going to see the full range that way.
2978  *
2979  * An additional refinement is that we discard any common prefix of the
2980  * three strings before computing the scaled values.  This allows us to
2981  * "zoom in" when we encounter a narrow data range.  An example is a phone
2982  * number database where all the values begin with the same area code.
2983  * (Actually, the bounds will be adjacent histogram-bin-boundary values,
2984  * so this is more likely to happen than you might think.)
2985  */
2986 static void
2987 convert_string_to_scalar(char *value,
2988                                                  double *scaledvalue,
2989                                                  char *lobound,
2990                                                  double *scaledlobound,
2991                                                  char *hibound,
2992                                                  double *scaledhibound)
2993 {
2994         int                     rangelo,
2995                                 rangehi;
2996         char       *sptr;
2997
2998         rangelo = rangehi = (unsigned char) hibound[0];
2999         for (sptr = lobound; *sptr; sptr++)
3000         {
3001                 if (rangelo > (unsigned char) *sptr)
3002                         rangelo = (unsigned char) *sptr;
3003                 if (rangehi < (unsigned char) *sptr)
3004                         rangehi = (unsigned char) *sptr;
3005         }
3006         for (sptr = hibound; *sptr; sptr++)
3007         {
3008                 if (rangelo > (unsigned char) *sptr)
3009                         rangelo = (unsigned char) *sptr;
3010                 if (rangehi < (unsigned char) *sptr)
3011                         rangehi = (unsigned char) *sptr;
3012         }
3013         /* If range includes any upper-case ASCII chars, make it include all */
3014         if (rangelo <= 'Z' && rangehi >= 'A')
3015         {
3016                 if (rangelo > 'A')
3017                         rangelo = 'A';
3018                 if (rangehi < 'Z')
3019                         rangehi = 'Z';
3020         }
3021         /* Ditto lower-case */
3022         if (rangelo <= 'z' && rangehi >= 'a')
3023         {
3024                 if (rangelo > 'a')
3025                         rangelo = 'a';
3026                 if (rangehi < 'z')
3027                         rangehi = 'z';
3028         }
3029         /* Ditto digits */
3030         if (rangelo <= '9' && rangehi >= '0')
3031         {
3032                 if (rangelo > '0')
3033                         rangelo = '0';
3034                 if (rangehi < '9')
3035                         rangehi = '9';
3036         }
3037
3038         /*
3039          * If range includes less than 10 chars, assume we have not got enough
3040          * data, and make it include regular ASCII set.
3041          */
3042         if (rangehi - rangelo < 9)
3043         {
3044                 rangelo = ' ';
3045                 rangehi = 127;
3046         }
3047
3048         /*
3049          * Now strip any common prefix of the three strings.
3050          */
3051         while (*lobound)
3052         {
3053                 if (*lobound != *hibound || *lobound != *value)
3054                         break;
3055                 lobound++, hibound++, value++;
3056         }
3057
3058         /*
3059          * Now we can do the conversions.
3060          */
3061         *scaledvalue = convert_one_string_to_scalar(value, rangelo, rangehi);
3062         *scaledlobound = convert_one_string_to_scalar(lobound, rangelo, rangehi);
3063         *scaledhibound = convert_one_string_to_scalar(hibound, rangelo, rangehi);
3064 }
3065
3066 static double
3067 convert_one_string_to_scalar(char *value, int rangelo, int rangehi)
3068 {
3069         int                     slen = strlen(value);
3070         double          num,
3071                                 denom,
3072                                 base;
3073
3074         if (slen <= 0)
3075                 return 0.0;                             /* empty string has scalar value 0 */
3076
3077         /*
3078          * Since base is at least 10, need not consider more than about 20 chars
3079          */
3080         if (slen > 20)
3081                 slen = 20;
3082
3083         /* Convert initial characters to fraction */
3084         base = rangehi - rangelo + 1;
3085         num = 0.0;
3086         denom = base;
3087         while (slen-- > 0)
3088         {
3089                 int                     ch = (unsigned char) *value++;
3090
3091                 if (ch < rangelo)
3092                         ch = rangelo - 1;
3093                 else if (ch > rangehi)
3094                         ch = rangehi + 1;
3095                 num += ((double) (ch - rangelo)) / denom;
3096                 denom *= base;
3097         }
3098
3099         return num;
3100 }
3101
3102 /*
3103  * Convert a string-type Datum into a palloc'd, null-terminated string.
3104  *
3105  * When using a non-C locale, we must pass the string through strxfrm()
3106  * before continuing, so as to generate correct locale-specific results.
3107  */
3108 static char *
3109 convert_string_datum(Datum value, Oid typid)
3110 {
3111         char       *val;
3112
3113         switch (typid)
3114         {
3115                 case CHAROID:
3116                         val = (char *) palloc(2);
3117                         val[0] = DatumGetChar(value);
3118                         val[1] = '\0';
3119                         break;
3120                 case BPCHAROID:
3121                 case VARCHAROID:
3122                 case TEXTOID:
3123                         {
3124                                 char       *str = (char *) VARDATA(DatumGetPointer(value));
3125                                 int                     strlength = VARSIZE(DatumGetPointer(value)) - VARHDRSZ;
3126
3127                                 val = (char *) palloc(strlength + 1);
3128                                 memcpy(val, str, strlength);
3129                                 val[strlength] = '\0';
3130                                 break;
3131                         }
3132                 case NAMEOID:
3133                         {
3134                                 NameData   *nm = (NameData *) DatumGetPointer(value);
3135
3136                                 val = pstrdup(NameStr(*nm));
3137                                 break;
3138                         }
3139                 default:
3140
3141                         /*
3142                          * Can't get here unless someone tries to use scalarltsel on an
3143                          * operator with one string and one non-string operand.
3144                          */
3145                         elog(ERROR, "unsupported type: %u", typid);
3146                         return NULL;
3147         }
3148
3149         if (!lc_collate_is_c())
3150         {
3151                 char       *xfrmstr;
3152                 size_t          xfrmlen;
3153                 size_t          xfrmlen2;
3154
3155                 /*
3156                  * Note: originally we guessed at a suitable output buffer size, and
3157                  * only needed to call strxfrm twice if our guess was too small.
3158                  * However, it seems that some versions of Solaris have buggy strxfrm
3159                  * that can write past the specified buffer length in that scenario.
3160                  * So, do it the dumb way for portability.
3161                  *
3162                  * Yet other systems (e.g., glibc) sometimes return a smaller value
3163                  * from the second call than the first; thus the Assert must be <= not
3164                  * == as you'd expect.  Can't any of these people program their way
3165                  * out of a paper bag?
3166                  */
3167 #if _MSC_VER == 1400                    /* VS.Net 2005 */
3168
3169                 /*
3170                  * http://connect.microsoft.com/VisualStudio/feedback/ViewFeedback.aspx
3171                  * ?FeedbackID=99694
3172                  */
3173                 {
3174                         char            x[1];
3175
3176                         xfrmlen = strxfrm(x, val, 0);
3177                 }
3178 #else
3179                 xfrmlen = strxfrm(NULL, val, 0);
3180 #endif
3181                 xfrmstr = (char *) palloc(xfrmlen + 1);
3182                 xfrmlen2 = strxfrm(xfrmstr, val, xfrmlen + 1);
3183                 Assert(xfrmlen2 <= xfrmlen);
3184                 pfree(val);
3185                 val = xfrmstr;
3186         }
3187
3188         return val;
3189 }
3190
3191 /*
3192  * Do convert_to_scalar()'s work for any bytea data type.
3193  *
3194  * Very similar to convert_string_to_scalar except we can't assume
3195  * null-termination and therefore pass explicit lengths around.
3196  *
3197  * Also, assumptions about likely "normal" ranges of characters have been
3198  * removed - a data range of 0..255 is always used, for now.  (Perhaps
3199  * someday we will add information about actual byte data range to
3200  * pg_statistic.)
3201  */
3202 static void
3203 convert_bytea_to_scalar(Datum value,
3204                                                 double *scaledvalue,
3205                                                 Datum lobound,
3206                                                 double *scaledlobound,
3207                                                 Datum hibound,
3208                                                 double *scaledhibound)
3209 {
3210         int                     rangelo,
3211                                 rangehi,
3212                                 valuelen = VARSIZE(DatumGetPointer(value)) - VARHDRSZ,
3213                                 loboundlen = VARSIZE(DatumGetPointer(lobound)) - VARHDRSZ,
3214                                 hiboundlen = VARSIZE(DatumGetPointer(hibound)) - VARHDRSZ,
3215                                 i,
3216                                 minlen;
3217         unsigned char *valstr = (unsigned char *) VARDATA(DatumGetPointer(value)),
3218                            *lostr = (unsigned char *) VARDATA(DatumGetPointer(lobound)),
3219                            *histr = (unsigned char *) VARDATA(DatumGetPointer(hibound));
3220
3221         /*
3222          * Assume bytea data is uniformly distributed across all byte values.
3223          */
3224         rangelo = 0;
3225         rangehi = 255;
3226
3227         /*
3228          * Now strip any common prefix of the three strings.
3229          */
3230         minlen = Min(Min(valuelen, loboundlen), hiboundlen);
3231         for (i = 0; i < minlen; i++)
3232         {
3233                 if (*lostr != *histr || *lostr != *valstr)
3234                         break;
3235                 lostr++, histr++, valstr++;
3236                 loboundlen--, hiboundlen--, valuelen--;
3237         }
3238
3239         /*
3240          * Now we can do the conversions.
3241          */
3242         *scaledvalue = convert_one_bytea_to_scalar(valstr, valuelen, rangelo, rangehi);
3243         *scaledlobound = convert_one_bytea_to_scalar(lostr, loboundlen, rangelo, rangehi);
3244         *scaledhibound = convert_one_bytea_to_scalar(histr, hiboundlen, rangelo, rangehi);
3245 }
3246
3247 static double
3248 convert_one_bytea_to_scalar(unsigned char *value, int valuelen,
3249                                                         int rangelo, int rangehi)
3250 {
3251         double          num,
3252                                 denom,
3253                                 base;
3254
3255         if (valuelen <= 0)
3256                 return 0.0;                             /* empty string has scalar value 0 */
3257
3258         /*
3259          * Since base is 256, need not consider more than about 10 chars (even
3260          * this many seems like overkill)
3261          */
3262         if (valuelen > 10)
3263                 valuelen = 10;
3264
3265         /* Convert initial characters to fraction */
3266         base = rangehi - rangelo + 1;
3267         num = 0.0;
3268         denom = base;
3269         while (valuelen-- > 0)
3270         {
3271                 int                     ch = *value++;
3272
3273                 if (ch < rangelo)
3274                         ch = rangelo - 1;
3275                 else if (ch > rangehi)
3276                         ch = rangehi + 1;
3277                 num += ((double) (ch - rangelo)) / denom;
3278                 denom *= base;
3279         }
3280
3281         return num;
3282 }
3283
3284 /*
3285  * Do convert_to_scalar()'s work for any timevalue data type.
3286  */
3287 static double
3288 convert_timevalue_to_scalar(Datum value, Oid typid)
3289 {
3290         switch (typid)
3291         {
3292                 case TIMESTAMPOID:
3293                         return DatumGetTimestamp(value);
3294                 case TIMESTAMPTZOID:
3295                         return DatumGetTimestampTz(value);
3296                 case ABSTIMEOID:
3297                         return DatumGetTimestamp(DirectFunctionCall1(abstime_timestamp,
3298                                                                                                                  value));
3299                 case DATEOID:
3300                         return DatumGetTimestamp(DirectFunctionCall1(date_timestamp,
3301                                                                                                                  value));
3302                 case INTERVALOID:
3303                         {
3304                                 Interval   *interval = DatumGetIntervalP(value);
3305
3306                                 /*
3307                                  * Convert the month part of Interval to days using assumed
3308                                  * average month length of 365.25/12.0 days.  Not too
3309                                  * accurate, but plenty good enough for our purposes.
3310                                  */
3311 #ifdef HAVE_INT64_TIMESTAMP
3312                                 return interval->time + interval->day * (double) USECS_PER_DAY +
3313                                         interval->month * ((DAYS_PER_YEAR / (double) MONTHS_PER_YEAR) * USECS_PER_DAY);
3314 #else
3315                                 return interval->time + interval->day * SECS_PER_DAY +
3316                                         interval->month * ((DAYS_PER_YEAR / (double) MONTHS_PER_YEAR) * (double) SECS_PER_DAY);
3317 #endif
3318                         }
3319                 case RELTIMEOID:
3320 #ifdef HAVE_INT64_TIMESTAMP
3321                         return (DatumGetRelativeTime(value) * 1000000.0);
3322 #else
3323                         return DatumGetRelativeTime(value);
3324 #endif
3325                 case TINTERVALOID:
3326                         {
3327                                 TimeInterval tinterval = DatumGetTimeInterval(value);
3328
3329 #ifdef HAVE_INT64_TIMESTAMP
3330                                 if (tinterval->status != 0)
3331                                         return ((tinterval->data[1] - tinterval->data[0]) * 1000000.0);
3332 #else
3333                                 if (tinterval->status != 0)
3334                                         return tinterval->data[1] - tinterval->data[0];
3335 #endif
3336                                 return 0;               /* for lack of a better idea */
3337                         }
3338                 case TIMEOID:
3339                         return DatumGetTimeADT(value);
3340                 case TIMETZOID:
3341                         {
3342                                 TimeTzADT  *timetz = DatumGetTimeTzADTP(value);
3343
3344                                 /* use GMT-equivalent time */
3345 #ifdef HAVE_INT64_TIMESTAMP
3346                                 return (double) (timetz->time + (timetz->zone * 1000000.0));
3347 #else
3348                                 return (double) (timetz->time + timetz->zone);
3349 #endif
3350                         }
3351         }
3352
3353         /*
3354          * Can't get here unless someone tries to use scalarltsel/scalargtsel on
3355          * an operator with one timevalue and one non-timevalue operand.
3356          */
3357         elog(ERROR, "unsupported type: %u", typid);
3358         return 0;
3359 }
3360
3361
3362 /*
3363  * get_restriction_variable
3364  *              Examine the args of a restriction clause to see if it's of the
3365  *              form (variable op pseudoconstant) or (pseudoconstant op variable),
3366  *              where "variable" could be either a Var or an expression in vars of a
3367  *              single relation.  If so, extract information about the variable,
3368  *              and also indicate which side it was on and the other argument.
3369  *
3370  * Inputs:
3371  *      root: the planner info
3372  *      args: clause argument list
3373  *      varRelid: see specs for restriction selectivity functions
3374  *
3375  * Outputs: (these are valid only if TRUE is returned)
3376  *      *vardata: gets information about variable (see examine_variable)
3377  *      *other: gets other clause argument, aggressively reduced to a constant
3378  *      *varonleft: set TRUE if variable is on the left, FALSE if on the right
3379  *
3380  * Returns TRUE if a variable is identified, otherwise FALSE.
3381  *
3382  * Note: if there are Vars on both sides of the clause, we must fail, because
3383  * callers are expecting that the other side will act like a pseudoconstant.
3384  */
3385 bool
3386 get_restriction_variable(PlannerInfo *root, List *args, int varRelid,
3387                                                  VariableStatData *vardata, Node **other,
3388                                                  bool *varonleft)
3389 {
3390         Node       *left,
3391                            *right;
3392         VariableStatData rdata;
3393
3394         /* Fail if not a binary opclause (probably shouldn't happen) */
3395         if (list_length(args) != 2)
3396                 return false;
3397
3398         left = (Node *) linitial(args);
3399         right = (Node *) lsecond(args);
3400
3401         /*
3402          * Examine both sides.  Note that when varRelid is nonzero, Vars of other
3403          * relations will be treated as pseudoconstants.
3404          */
3405         examine_variable(root, left, varRelid, vardata);
3406         examine_variable(root, right, varRelid, &rdata);
3407
3408         /*
3409          * If one side is a variable and the other not, we win.
3410          */
3411         if (vardata->rel && rdata.rel == NULL)
3412         {
3413                 *varonleft = true;
3414                 *other = estimate_expression_value(root, rdata.var);
3415                 /* Assume we need no ReleaseVariableStats(rdata) here */
3416                 return true;
3417         }
3418
3419         if (vardata->rel == NULL && rdata.rel)
3420         {
3421                 *varonleft = false;
3422                 *other = estimate_expression_value(root, vardata->var);
3423                 /* Assume we need no ReleaseVariableStats(*vardata) here */
3424                 *vardata = rdata;
3425                 return true;
3426         }
3427
3428         /* Ooops, clause has wrong structure (probably var op var) */
3429         ReleaseVariableStats(*vardata);
3430         ReleaseVariableStats(rdata);
3431
3432         return false;
3433 }
3434
3435 /*
3436  * get_join_variables
3437  *              Apply examine_variable() to each side of a join clause.
3438  */
3439 void
3440 get_join_variables(PlannerInfo *root, List *args,
3441                                    VariableStatData *vardata1, VariableStatData *vardata2)
3442 {
3443         Node       *left,
3444                            *right;
3445
3446         if (list_length(args) != 2)
3447                 elog(ERROR, "join operator should take two arguments");
3448
3449         left = (Node *) linitial(args);
3450         right = (Node *) lsecond(args);
3451
3452         examine_variable(root, left, 0, vardata1);
3453         examine_variable(root, right, 0, vardata2);
3454 }
3455
3456 /*
3457  * examine_variable
3458  *              Try to look up statistical data about an expression.
3459  *              Fill in a VariableStatData struct to describe the expression.
3460  *
3461  * Inputs:
3462  *      root: the planner info
3463  *      node: the expression tree to examine
3464  *      varRelid: see specs for restriction selectivity functions
3465  *
3466  * Outputs: *vardata is filled as follows:
3467  *      var: the input expression (with any binary relabeling stripped, if
3468  *              it is or contains a variable; but otherwise the type is preserved)
3469  *      rel: RelOptInfo for relation containing variable; NULL if expression
3470  *              contains no Vars (NOTE this could point to a RelOptInfo of a
3471  *              subquery, not one in the current query).
3472  *      statsTuple: the pg_statistic entry for the variable, if one exists;
3473  *              otherwise NULL.
3474  *      vartype: exposed type of the expression; this should always match
3475  *              the declared input type of the operator we are estimating for.
3476  *      atttype, atttypmod: type data to pass to get_attstatsslot().  This is
3477  *              commonly the same as the exposed type of the variable argument,
3478  *              but can be different in binary-compatible-type cases.
3479  *
3480  * Caller is responsible for doing ReleaseVariableStats() before exiting.
3481  */
3482 void
3483 examine_variable(PlannerInfo *root, Node *node, int varRelid,
3484                                  VariableStatData *vardata)
3485 {
3486         Node       *basenode;
3487         Relids          varnos;
3488         RelOptInfo *onerel;
3489
3490         /* Make sure we don't return dangling pointers in vardata */
3491         MemSet(vardata, 0, sizeof(VariableStatData));
3492
3493         /* Save the exposed type of the expression */
3494         vardata->vartype = exprType(node);
3495
3496         /* Look inside any binary-compatible relabeling */
3497
3498         if (IsA(node, RelabelType))
3499                 basenode = (Node *) ((RelabelType *) node)->arg;
3500         else
3501                 basenode = node;
3502
3503         /* Fast path for a simple Var */
3504
3505         if (IsA(basenode, Var) &&
3506                 (varRelid == 0 || varRelid == ((Var *) basenode)->varno))
3507         {
3508                 Var                *var = (Var *) basenode;
3509                 RangeTblEntry *rte;
3510
3511                 vardata->var = basenode;        /* return Var without relabeling */
3512                 vardata->rel = find_base_rel(root, var->varno);
3513                 vardata->atttype = var->vartype;
3514                 vardata->atttypmod = var->vartypmod;
3515
3516                 rte = rt_fetch(var->varno, root->parse->rtable);
3517
3518                 if (rte->inh)
3519                 {
3520                         /*
3521                          * XXX This means the Var represents a column of an append
3522                          * relation. Later add code to look at the member relations and
3523                          * try to derive some kind of combined statistics?
3524                          */
3525                 }
3526                 else if (rte->rtekind == RTE_RELATION)
3527                 {
3528                         vardata->statsTuple = SearchSysCache(STATRELATT,
3529                                                                                                  ObjectIdGetDatum(rte->relid),
3530                                                                                                  Int16GetDatum(var->varattno),
3531                                                                                                  0, 0);
3532                 }
3533                 else
3534                 {
3535                         /*
3536                          * XXX This means the Var comes from a JOIN or sub-SELECT. Later
3537                          * add code to dig down into the join etc and see if we can trace
3538                          * the variable to something with stats.  (But beware of
3539                          * sub-SELECTs with DISTINCT/GROUP BY/etc.      Perhaps there are no
3540                          * cases where this would really be useful, because we'd have
3541                          * flattened the subselect if it is??)
3542                          */
3543                 }
3544
3545                 return;
3546         }
3547
3548         /*
3549          * Okay, it's a more complicated expression.  Determine variable
3550          * membership.  Note that when varRelid isn't zero, only vars of that
3551          * relation are considered "real" vars.
3552          */
3553         varnos = pull_varnos(basenode);
3554
3555         onerel = NULL;
3556
3557         switch (bms_membership(varnos))
3558         {
3559                 case BMS_EMPTY_SET:
3560                         /* No Vars at all ... must be pseudo-constant clause */
3561                         break;
3562                 case BMS_SINGLETON:
3563                         if (varRelid == 0 || bms_is_member(varRelid, varnos))
3564                         {
3565                                 onerel = find_base_rel(root,
3566                                            (varRelid ? varRelid : bms_singleton_member(varnos)));
3567                                 vardata->rel = onerel;
3568                                 node = basenode;        /* strip any relabeling */
3569                         }
3570                         /* else treat it as a constant */
3571                         break;
3572                 case BMS_MULTIPLE:
3573                         if (varRelid == 0)
3574                         {
3575                                 /* treat it as a variable of a join relation */
3576                                 vardata->rel = find_join_rel(root, varnos);
3577                                 node = basenode;        /* strip any relabeling */
3578                         }
3579                         else if (bms_is_member(varRelid, varnos))
3580                         {
3581                                 /* ignore the vars belonging to other relations */
3582                                 vardata->rel = find_base_rel(root, varRelid);
3583                                 node = basenode;        /* strip any relabeling */
3584                                 /* note: no point in expressional-index search here */
3585                         }
3586                         /* else treat it as a constant */
3587                         break;
3588         }
3589
3590         bms_free(varnos);
3591
3592         vardata->var = node;
3593         vardata->atttype = exprType(node);
3594         vardata->atttypmod = exprTypmod(node);
3595
3596         if (onerel)
3597         {
3598                 /*
3599                  * We have an expression in vars of a single relation.  Try to match
3600                  * it to expressional index columns, in hopes of finding some
3601                  * statistics.
3602                  *
3603                  * XXX it's conceivable that there are multiple matches with different
3604                  * index opfamilies; if so, we need to pick one that matches the
3605                  * operator we are estimating for.      FIXME later.
3606                  */
3607                 ListCell   *ilist;
3608
3609                 foreach(ilist, onerel->indexlist)
3610                 {
3611                         IndexOptInfo *index = (IndexOptInfo *) lfirst(ilist);
3612                         ListCell   *indexpr_item;
3613                         int                     pos;
3614
3615                         indexpr_item = list_head(index->indexprs);
3616                         if (indexpr_item == NULL)
3617                                 continue;               /* no expressions here... */
3618
3619                         /*
3620                          * Ignore partial indexes since they probably don't reflect
3621                          * whole-relation statistics.  Possibly reconsider this later.
3622                          */
3623                         if (index->indpred)
3624                                 continue;
3625
3626                         for (pos = 0; pos < index->ncolumns; pos++)
3627                         {
3628                                 if (index->indexkeys[pos] == 0)
3629                                 {
3630                                         Node       *indexkey;
3631
3632                                         if (indexpr_item == NULL)
3633                                                 elog(ERROR, "too few entries in indexprs list");
3634                                         indexkey = (Node *) lfirst(indexpr_item);
3635                                         if (indexkey && IsA(indexkey, RelabelType))
3636                                                 indexkey = (Node *) ((RelabelType *) indexkey)->arg;
3637                                         if (equal(node, indexkey))
3638                                         {
3639                                                 /*
3640                                                  * Found a match ... is it a unique index? Tests here
3641                                                  * should match has_unique_index().
3642                                                  */
3643                                                 if (index->unique &&
3644                                                         index->ncolumns == 1 &&
3645                                                         index->indpred == NIL)
3646                                                         vardata->isunique = true;
3647                                                 /* Has it got stats? */
3648                                                 vardata->statsTuple = SearchSysCache(STATRELATT,
3649                                                                                    ObjectIdGetDatum(index->indexoid),
3650                                                                                                           Int16GetDatum(pos + 1),
3651                                                                                                                          0, 0);
3652                                                 if (vardata->statsTuple)
3653                                                         break;
3654                                         }
3655                                         indexpr_item = lnext(indexpr_item);
3656                                 }
3657                         }
3658                         if (vardata->statsTuple)
3659                                 break;
3660                 }
3661         }
3662 }
3663
3664 /*
3665  * get_variable_numdistinct
3666  *        Estimate the number of distinct values of a variable.
3667  *
3668  * vardata: results of examine_variable
3669  *
3670  * NB: be careful to produce an integral result, since callers may compare
3671  * the result to exact integer counts.
3672  */
3673 double
3674 get_variable_numdistinct(VariableStatData *vardata)
3675 {
3676         double          stadistinct;
3677         double          ntuples;
3678
3679         /*
3680          * Determine the stadistinct value to use.      There are cases where we can
3681          * get an estimate even without a pg_statistic entry, or can get a better
3682          * value than is in pg_statistic.
3683          */
3684         if (HeapTupleIsValid(vardata->statsTuple))
3685         {
3686                 /* Use the pg_statistic entry */
3687                 Form_pg_statistic stats;
3688
3689                 stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
3690                 stadistinct = stats->stadistinct;
3691         }
3692         else if (vardata->vartype == BOOLOID)
3693         {
3694                 /*
3695                  * Special-case boolean columns: presumably, two distinct values.
3696                  *
3697                  * Are there any other datatypes we should wire in special estimates
3698                  * for?
3699                  */
3700                 stadistinct = 2.0;
3701         }
3702         else
3703         {
3704                 /*
3705                  * We don't keep statistics for system columns, but in some cases we
3706                  * can infer distinctness anyway.
3707                  */
3708                 if (vardata->var && IsA(vardata->var, Var))
3709                 {
3710                         switch (((Var *) vardata->var)->varattno)
3711                         {
3712                                 case ObjectIdAttributeNumber:
3713                                 case SelfItemPointerAttributeNumber:
3714                                         stadistinct = -1.0; /* unique */
3715                                         break;
3716                                 case TableOidAttributeNumber:
3717                                         stadistinct = 1.0;      /* only 1 value */
3718                                         break;
3719                                 default:
3720                                         stadistinct = 0.0;      /* means "unknown" */
3721                                         break;
3722                         }
3723                 }
3724                 else
3725                         stadistinct = 0.0;      /* means "unknown" */
3726
3727                 /*
3728                  * XXX consider using estimate_num_groups on expressions?
3729                  */
3730         }
3731
3732         /*
3733          * If there is a unique index for the variable, assume it is unique no
3734          * matter what pg_statistic says (the statistics could be out of date).
3735          * Can skip search if we already think it's unique.
3736          */
3737         if (stadistinct != -1.0)
3738         {
3739                 if (vardata->isunique)
3740                         stadistinct = -1.0;
3741                 else if (vardata->var && IsA(vardata->var, Var) &&
3742                                  vardata->rel &&
3743                                  has_unique_index(vardata->rel,
3744                                                                   ((Var *) vardata->var)->varattno))
3745                         stadistinct = -1.0;
3746         }
3747
3748         /*
3749          * If we had an absolute estimate, use that.
3750          */
3751         if (stadistinct > 0.0)
3752                 return stadistinct;
3753
3754         /*
3755          * Otherwise we need to get the relation size; punt if not available.
3756          */
3757         if (vardata->rel == NULL)
3758                 return DEFAULT_NUM_DISTINCT;
3759         ntuples = vardata->rel->tuples;
3760         if (ntuples <= 0.0)
3761                 return DEFAULT_NUM_DISTINCT;
3762
3763         /*
3764          * If we had a relative estimate, use that.
3765          */
3766         if (stadistinct < 0.0)
3767                 return floor((-stadistinct * ntuples) + 0.5);
3768
3769         /*
3770          * With no data, estimate ndistinct = ntuples if the table is small, else
3771          * use default.
3772          */
3773         if (ntuples < DEFAULT_NUM_DISTINCT)
3774                 return ntuples;
3775
3776         return DEFAULT_NUM_DISTINCT;
3777 }
3778
3779 /*
3780  * get_variable_maximum
3781  *              Estimate the maximum value of the specified variable.
3782  *              If successful, store value in *max and return TRUE.
3783  *              If no data available, return FALSE.
3784  *
3785  * sortop is the "<" comparison operator to use.  (To extract the
3786  * minimum instead of the maximum, just pass the ">" operator instead.)
3787  */
3788 static bool
3789 get_variable_maximum(PlannerInfo *root, VariableStatData *vardata,
3790                                          Oid sortop, Datum *max)
3791 {
3792         Datum           tmax = 0;
3793         bool            have_max = false;
3794         Form_pg_statistic stats;
3795         int16           typLen;
3796         bool            typByVal;
3797         Datum      *values;
3798         int                     nvalues;
3799         int                     i;
3800
3801         if (!HeapTupleIsValid(vardata->statsTuple))
3802         {
3803                 /* no stats available, so default result */
3804                 return false;
3805         }
3806         stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
3807
3808         get_typlenbyval(vardata->atttype, &typLen, &typByVal);
3809
3810         /*
3811          * If there is a histogram, grab the last or first value as appropriate.
3812          *
3813          * If there is a histogram that is sorted with some other operator than
3814          * the one we want, fail --- this suggests that there is data we can't
3815          * use.
3816          */
3817         if (get_attstatsslot(vardata->statsTuple,
3818                                                  vardata->atttype, vardata->atttypmod,
3819                                                  STATISTIC_KIND_HISTOGRAM, sortop,
3820                                                  &values, &nvalues,
3821                                                  NULL, NULL))
3822         {
3823                 if (nvalues > 0)
3824                 {
3825                         tmax = datumCopy(values[nvalues - 1], typByVal, typLen);
3826                         have_max = true;
3827                 }
3828                 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
3829         }
3830         else
3831         {
3832                 Oid                     rsortop = get_commutator(sortop);
3833
3834                 if (OidIsValid(rsortop) &&
3835                         get_attstatsslot(vardata->statsTuple,
3836                                                          vardata->atttype, vardata->atttypmod,
3837                                                          STATISTIC_KIND_HISTOGRAM, rsortop,
3838                                                          &values, &nvalues,
3839                                                          NULL, NULL))
3840                 {
3841                         if (nvalues > 0)
3842                         {
3843                                 tmax = datumCopy(values[0], typByVal, typLen);
3844                                 have_max = true;
3845                         }
3846                         free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
3847                 }
3848                 else if (get_attstatsslot(vardata->statsTuple,
3849                                                                   vardata->atttype, vardata->atttypmod,
3850                                                                   STATISTIC_KIND_HISTOGRAM, InvalidOid,
3851                                                                   &values, &nvalues,
3852                                                                   NULL, NULL))
3853                 {
3854                         free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
3855                         return false;
3856                 }
3857         }
3858
3859         /*
3860          * If we have most-common-values info, look for a large MCV.  This is
3861          * needed even if we also have a histogram, since the histogram excludes
3862          * the MCVs.  However, usually the MCVs will not be the extreme values, so
3863          * avoid unnecessary data copying.
3864          */
3865         if (get_attstatsslot(vardata->statsTuple,
3866                                                  vardata->atttype, vardata->atttypmod,
3867                                                  STATISTIC_KIND_MCV, InvalidOid,
3868                                                  &values, &nvalues,
3869                                                  NULL, NULL))
3870         {
3871                 bool            large_mcv = false;
3872                 FmgrInfo        opproc;
3873
3874                 fmgr_info(get_opcode(sortop), &opproc);
3875
3876                 for (i = 0; i < nvalues; i++)
3877                 {
3878                         if (!have_max)
3879                         {
3880                                 tmax = values[i];
3881                                 large_mcv = have_max = true;
3882                         }
3883                         else if (DatumGetBool(FunctionCall2(&opproc, tmax, values[i])))
3884                         {
3885                                 tmax = values[i];
3886                                 large_mcv = true;
3887                         }
3888                 }
3889                 if (large_mcv)
3890                         tmax = datumCopy(tmax, typByVal, typLen);
3891                 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
3892         }
3893
3894         *max = tmax;
3895         return have_max;
3896 }
3897
3898
3899 /*-------------------------------------------------------------------------
3900  *
3901  * Pattern analysis functions
3902  *
3903  * These routines support analysis of LIKE and regular-expression patterns
3904  * by the planner/optimizer.  It's important that they agree with the
3905  * regular-expression code in backend/regex/ and the LIKE code in
3906  * backend/utils/adt/like.c.  Also, the computation of the fixed prefix
3907  * must be conservative: if we report a string longer than the true fixed
3908  * prefix, the query may produce actually wrong answers, rather than just
3909  * getting a bad selectivity estimate!
3910  *
3911  * Note that the prefix-analysis functions are called from
3912  * backend/optimizer/path/indxpath.c as well as from routines in this file.
3913  *
3914  *-------------------------------------------------------------------------
3915  */
3916
3917 /*
3918  * Extract the fixed prefix, if any, for a pattern.
3919  *
3920  * *prefix is set to a palloc'd prefix string (in the form of a Const node),
3921  *      or to NULL if no fixed prefix exists for the pattern.
3922  * *rest is set to a palloc'd Const representing the remainder of the pattern
3923  *      after the portion describing the fixed prefix.
3924  * Each of these has the same type (TEXT or BYTEA) as the given pattern Const.
3925  *
3926  * The return value distinguishes no fixed prefix, a partial prefix,
3927  * or an exact-match-only pattern.
3928  */
3929
3930 static Pattern_Prefix_Status
3931 like_fixed_prefix(Const *patt_const, bool case_insensitive,
3932                                   Const **prefix_const, Const **rest_const)
3933 {
3934         char       *match;
3935         char       *patt;
3936         int                     pattlen;
3937         char       *rest;
3938         Oid                     typeid = patt_const->consttype;
3939         int                     pos,
3940                                 match_pos;
3941         bool            is_multibyte = (pg_database_encoding_max_length() > 1);
3942
3943         /* the right-hand const is type text or bytea */
3944         Assert(typeid == BYTEAOID || typeid == TEXTOID);
3945
3946         if (typeid == BYTEAOID && case_insensitive)
3947                 ereport(ERROR,
3948                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
3949                    errmsg("case insensitive matching not supported on type bytea")));
3950
3951         if (typeid != BYTEAOID)
3952         {
3953                 patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
3954                 pattlen = strlen(patt);
3955         }
3956         else
3957         {
3958                 bytea      *bstr = DatumGetByteaP(patt_const->constvalue);
3959
3960                 pattlen = VARSIZE(bstr) - VARHDRSZ;
3961                 patt = (char *) palloc(pattlen);
3962                 memcpy(patt, VARDATA(bstr), pattlen);
3963                 if ((Pointer) bstr != DatumGetPointer(patt_const->constvalue))
3964                         pfree(bstr);
3965         }
3966
3967         match = palloc(pattlen + 1);
3968         match_pos = 0;
3969         for (pos = 0; pos < pattlen; pos++)
3970         {
3971                 /* % and _ are wildcard characters in LIKE */
3972                 if (patt[pos] == '%' ||
3973                         patt[pos] == '_')
3974                         break;
3975
3976                 /* Backslash escapes the next character */
3977                 if (patt[pos] == '\\')
3978                 {
3979                         pos++;
3980                         if (pos >= pattlen)
3981                                 break;
3982                 }
3983
3984                 /*
3985                  * XXX In multibyte character sets, we can't trust isalpha, so assume
3986                  * any multibyte char is potentially case-varying.
3987                  */
3988                 if (case_insensitive)
3989                 {
3990                         if (is_multibyte && (unsigned char) patt[pos] >= 0x80)
3991                                 break;
3992                         if (isalpha((unsigned char) patt[pos]))
3993                                 break;
3994                 }
3995
3996                 /*
3997                  * NOTE: this code used to think that %% meant a literal %, but
3998                  * textlike() itself does not think that, and the SQL92 spec doesn't
3999                  * say any such thing either.
4000                  */
4001                 match[match_pos++] = patt[pos];
4002         }
4003
4004         match[match_pos] = '\0';
4005         rest = &patt[pos];
4006
4007         if (typeid != BYTEAOID)
4008         {
4009                 *prefix_const = string_to_const(match, typeid);
4010                 *rest_const = string_to_const(rest, typeid);
4011         }
4012         else
4013         {
4014                 *prefix_const = string_to_bytea_const(match, match_pos);
4015                 *rest_const = string_to_bytea_const(rest, pattlen - pos);
4016         }
4017
4018         pfree(patt);
4019         pfree(match);
4020
4021         /* in LIKE, an empty pattern is an exact match! */
4022         if (pos == pattlen)
4023                 return Pattern_Prefix_Exact;    /* reached end of pattern, so exact */
4024
4025         if (match_pos > 0)
4026                 return Pattern_Prefix_Partial;
4027
4028         return Pattern_Prefix_None;
4029 }
4030
4031 static Pattern_Prefix_Status
4032 regex_fixed_prefix(Const *patt_const, bool case_insensitive,
4033                                    Const **prefix_const, Const **rest_const)
4034 {
4035         char       *match;
4036         int                     pos,
4037                                 match_pos,
4038                                 prev_pos,
4039                                 prev_match_pos;
4040         bool            have_leading_paren;
4041         char       *patt;
4042         char       *rest;
4043         Oid                     typeid = patt_const->consttype;
4044         bool            is_basic = regex_flavor_is_basic();
4045         bool            is_multibyte = (pg_database_encoding_max_length() > 1);
4046
4047         /*
4048          * Should be unnecessary, there are no bytea regex operators defined. As
4049          * such, it should be noted that the rest of this function has *not* been
4050          * made safe for binary (possibly NULL containing) strings.
4051          */
4052         if (typeid == BYTEAOID)
4053                 ereport(ERROR,
4054                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
4055                  errmsg("regular-expression matching not supported on type bytea")));
4056
4057         /* the right-hand const is type text for all of these */
4058         patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
4059
4060         /*
4061          * Check for ARE director prefix.  It's worth our trouble to recognize
4062          * this because similar_escape() uses it.
4063          */
4064         pos = 0;
4065         if (strncmp(patt, "***:", 4) == 0)
4066         {
4067                 pos = 4;
4068                 is_basic = false;
4069         }
4070
4071         /* Pattern must be anchored left */
4072         if (patt[pos] != '^')
4073         {
4074                 rest = patt;
4075
4076                 *prefix_const = NULL;
4077                 *rest_const = string_to_const(rest, typeid);
4078
4079                 return Pattern_Prefix_None;
4080         }
4081         pos++;
4082
4083         /*
4084          * If '|' is present in pattern, then there may be multiple alternatives
4085          * for the start of the string.  (There are cases where this isn't so,
4086          * for instance if the '|' is inside parens, but detecting that reliably
4087          * is too hard.)
4088          */
4089         if (strchr(patt + pos, '|') != NULL)
4090         {
4091                 rest = patt;
4092
4093                 *prefix_const = NULL;
4094                 *rest_const = string_to_const(rest, typeid);
4095
4096                 return Pattern_Prefix_None;
4097         }
4098
4099         /* OK, allocate space for pattern */
4100         match = palloc(strlen(patt) + 1);
4101         prev_match_pos = match_pos = 0;
4102
4103         /*
4104          * We special-case the syntax '^(...)$' because psql uses it.  But beware:
4105          * in BRE mode these parentheses are just ordinary characters.  Also,
4106          * sequences beginning "(?" are not what they seem, unless they're "(?:".
4107          * (We should recognize that, too, because of similar_escape().)
4108          *
4109          * Note: it's a bit bogus to be depending on the current regex_flavor
4110          * setting here, because the setting could change before the pattern is
4111          * used.  We minimize the risk by trusting the flavor as little as we can,
4112          * but perhaps it would be a good idea to get rid of the "basic" setting.
4113          */
4114         have_leading_paren = false;
4115         if (patt[pos] == '(' && !is_basic &&
4116                 (patt[pos + 1] != '?' || patt[pos + 2] == ':'))
4117         {
4118                 have_leading_paren = true;
4119                 pos += (patt[pos + 1] != '?' ? 1 : 3);
4120         }
4121
4122         /* Scan remainder of pattern */
4123         prev_pos = pos;
4124         while (patt[pos])
4125         {
4126                 int                     len;
4127
4128                 /*
4129                  * Check for characters that indicate multiple possible matches here.
4130                  * Also, drop out at ')' or '$' so the termination test works right.
4131                  */
4132                 if (patt[pos] == '.' ||
4133                         patt[pos] == '(' ||
4134                         patt[pos] == ')' ||
4135                         patt[pos] == '[' ||
4136                         patt[pos] == '^' ||
4137                         patt[pos] == '$')
4138                         break;
4139
4140                 /*
4141                  * XXX In multibyte character sets, we can't trust isalpha, so assume
4142                  * any multibyte char is potentially case-varying.
4143                  */
4144                 if (case_insensitive)
4145                 {
4146                         if (is_multibyte && (unsigned char) patt[pos] >= 0x80)
4147                                 break;
4148                         if (isalpha((unsigned char) patt[pos]))
4149                                 break;
4150                 }
4151
4152                 /*
4153                  * Check for quantifiers.  Except for +, this means the preceding
4154                  * character is optional, so we must remove it from the prefix too!
4155                  * Note: in BREs, \{ is a quantifier.
4156                  */
4157                 if (patt[pos] == '*' ||
4158                         patt[pos] == '?' ||
4159                         patt[pos] == '{' ||
4160                         (patt[pos] == '\\' && patt[pos + 1] == '{'))
4161                 {
4162                         match_pos = prev_match_pos;
4163                         pos = prev_pos;
4164                         break;
4165                 }
4166                 if (patt[pos] == '+')
4167                 {
4168                         pos = prev_pos;
4169                         break;
4170                 }
4171
4172                 /*
4173                  * Normally, backslash quotes the next character.  But in AREs,
4174                  * backslash followed by alphanumeric is an escape, not a quoted
4175                  * character.  Must treat it as having multiple possible matches.
4176                  * In BREs, \( is a parenthesis, so don't trust that either.
4177                  * Note: since only ASCII alphanumerics are escapes, we don't have
4178                  * to be paranoid about multibyte here.
4179                  */
4180                 if (patt[pos] == '\\')
4181                 {
4182                         if (isalnum((unsigned char) patt[pos + 1]) || patt[pos + 1] == '(')
4183                                 break;
4184                         pos++;
4185                         if (patt[pos] == '\0')
4186                                 break;
4187                 }
4188                 /* save position in case we need to back up on next loop cycle */
4189                 prev_match_pos = match_pos;
4190                 prev_pos = pos;
4191                 /* must use encoding-aware processing here */
4192                 len = pg_mblen(&patt[pos]);
4193                 memcpy(&match[match_pos], &patt[pos], len);
4194                 match_pos += len;
4195                 pos += len;
4196         }
4197
4198         match[match_pos] = '\0';
4199         rest = &patt[pos];
4200
4201         if (have_leading_paren && patt[pos] == ')')
4202                 pos++;
4203
4204         if (patt[pos] == '$' && patt[pos + 1] == '\0')
4205         {
4206                 rest = &patt[pos + 1];
4207
4208                 *prefix_const = string_to_const(match, typeid);
4209                 *rest_const = string_to_const(rest, typeid);
4210
4211                 pfree(patt);
4212                 pfree(match);
4213
4214                 return Pattern_Prefix_Exact;    /* pattern specifies exact match */
4215         }
4216
4217         *prefix_const = string_to_const(match, typeid);
4218         *rest_const = string_to_const(rest, typeid);
4219
4220         pfree(patt);
4221         pfree(match);
4222
4223         if (match_pos > 0)
4224                 return Pattern_Prefix_Partial;
4225
4226         return Pattern_Prefix_None;
4227 }
4228
4229 Pattern_Prefix_Status
4230 pattern_fixed_prefix(Const *patt, Pattern_Type ptype,
4231                                          Const **prefix, Const **rest)
4232 {
4233         Pattern_Prefix_Status result;
4234
4235         switch (ptype)
4236         {
4237                 case Pattern_Type_Like:
4238                         result = like_fixed_prefix(patt, false, prefix, rest);
4239                         break;
4240                 case Pattern_Type_Like_IC:
4241                         result = like_fixed_prefix(patt, true, prefix, rest);
4242                         break;
4243                 case Pattern_Type_Regex:
4244                         result = regex_fixed_prefix(patt, false, prefix, rest);
4245                         break;
4246                 case Pattern_Type_Regex_IC:
4247                         result = regex_fixed_prefix(patt, true, prefix, rest);
4248                         break;
4249                 default:
4250                         elog(ERROR, "unrecognized ptype: %d", (int) ptype);
4251                         result = Pattern_Prefix_None;           /* keep compiler quiet */
4252                         break;
4253         }
4254         return result;
4255 }
4256
4257 /*
4258  * Estimate the selectivity of a fixed prefix for a pattern match.
4259  *
4260  * A fixed prefix "foo" is estimated as the selectivity of the expression
4261  * "variable >= 'foo' AND variable < 'fop'" (see also indxpath.c).
4262  *
4263  * The selectivity estimate is with respect to the portion of the column
4264  * population represented by the histogram --- the caller must fold this
4265  * together with info about MCVs and NULLs.
4266  *
4267  * We use the >= and < operators from the specified btree opfamily to do the
4268  * estimation.  The given variable and Const must be of the associated
4269  * datatype.
4270  *
4271  * XXX Note: we make use of the upper bound to estimate operator selectivity
4272  * even if the locale is such that we cannot rely on the upper-bound string.
4273  * The selectivity only needs to be approximately right anyway, so it seems
4274  * more useful to use the upper-bound code than not.
4275  */
4276 static Selectivity
4277 prefix_selectivity(VariableStatData *vardata,
4278                                    Oid vartype, Oid opfamily, Const *prefixcon)
4279 {
4280         Selectivity prefixsel;
4281         Oid                     cmpopr;
4282         FmgrInfo        opproc;
4283         Const      *greaterstrcon;
4284
4285         cmpopr = get_opfamily_member(opfamily, vartype, vartype,
4286                                                                  BTGreaterEqualStrategyNumber);
4287         if (cmpopr == InvalidOid)
4288                 elog(ERROR, "no >= operator for opfamily %u", opfamily);
4289         fmgr_info(get_opcode(cmpopr), &opproc);
4290
4291         prefixsel = ineq_histogram_selectivity(vardata, &opproc, true,
4292                                                                                    prefixcon->constvalue,
4293                                                                                    prefixcon->consttype);
4294
4295         if (prefixsel <= 0.0)
4296         {
4297                 /* No histogram is present ... return a suitable default estimate */
4298                 return 0.005;
4299         }
4300
4301         /*-------
4302          * If we can create a string larger than the prefix, say
4303          *      "x < greaterstr".
4304          *-------
4305          */
4306         greaterstrcon = make_greater_string(prefixcon);
4307         if (greaterstrcon)
4308         {
4309                 Selectivity topsel;
4310
4311                 cmpopr = get_opfamily_member(opfamily, vartype, vartype,
4312                                                                          BTLessStrategyNumber);
4313                 if (cmpopr == InvalidOid)
4314                         elog(ERROR, "no < operator for opfamily %u", opfamily);
4315                 fmgr_info(get_opcode(cmpopr), &opproc);
4316
4317                 topsel = ineq_histogram_selectivity(vardata, &opproc, false,
4318                                                                                         greaterstrcon->constvalue,
4319                                                                                         greaterstrcon->consttype);
4320
4321                 /* ineq_histogram_selectivity worked before, it shouldn't fail now */
4322                 Assert(topsel > 0.0);
4323
4324                 /*
4325                  * Merge the two selectivities in the same way as for a range query
4326                  * (see clauselist_selectivity()).      Note that we don't need to worry
4327                  * about double-exclusion of nulls, since ineq_histogram_selectivity
4328                  * doesn't count those anyway.
4329                  */
4330                 prefixsel = topsel + prefixsel - 1.0;
4331
4332                 /*
4333                  * A zero or negative prefixsel should be converted into a small
4334                  * positive value; we probably are dealing with a very tight range and
4335                  * got a bogus result due to roundoff errors.
4336                  */
4337                 if (prefixsel <= 0.0)
4338                         prefixsel = 1.0e-10;
4339         }
4340
4341         return prefixsel;
4342 }
4343
4344
4345 /*
4346  * Estimate the selectivity of a pattern of the specified type.
4347  * Note that any fixed prefix of the pattern will have been removed already.
4348  *
4349  * For now, we use a very simplistic approach: fixed characters reduce the
4350  * selectivity a good deal, character ranges reduce it a little,
4351  * wildcards (such as % for LIKE or .* for regex) increase it.
4352  */
4353
4354 #define FIXED_CHAR_SEL  0.20    /* about 1/5 */
4355 #define CHAR_RANGE_SEL  0.25
4356 #define ANY_CHAR_SEL    0.9             /* not 1, since it won't match end-of-string */
4357 #define FULL_WILDCARD_SEL 5.0
4358 #define PARTIAL_WILDCARD_SEL 2.0
4359
4360 static Selectivity
4361 like_selectivity(Const *patt_const, bool case_insensitive)
4362 {
4363         Selectivity sel = 1.0;
4364         int                     pos;
4365         Oid                     typeid = patt_const->consttype;
4366         char       *patt;
4367         int                     pattlen;
4368
4369         /* the right-hand const is type text or bytea */
4370         Assert(typeid == BYTEAOID || typeid == TEXTOID);
4371
4372         if (typeid == BYTEAOID && case_insensitive)
4373                 ereport(ERROR,
4374                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
4375                    errmsg("case insensitive matching not supported on type bytea")));
4376
4377         if (typeid != BYTEAOID)
4378         {
4379                 patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
4380                 pattlen = strlen(patt);
4381         }
4382         else
4383         {
4384                 bytea      *bstr = DatumGetByteaP(patt_const->constvalue);
4385
4386                 pattlen = VARSIZE(bstr) - VARHDRSZ;
4387                 patt = (char *) palloc(pattlen);
4388                 memcpy(patt, VARDATA(bstr), pattlen);
4389                 if ((Pointer) bstr != DatumGetPointer(patt_const->constvalue))
4390                         pfree(bstr);
4391         }
4392
4393         /* Skip any leading wildcard; it's already factored into initial sel */
4394         for (pos = 0; pos < pattlen; pos++)
4395         {
4396                 if (patt[pos] != '%' && patt[pos] != '_')
4397                         break;
4398         }
4399
4400         for (; pos < pattlen; pos++)
4401         {
4402                 /* % and _ are wildcard characters in LIKE */
4403                 if (patt[pos] == '%')
4404                         sel *= FULL_WILDCARD_SEL;
4405                 else if (patt[pos] == '_')
4406                         sel *= ANY_CHAR_SEL;
4407                 else if (patt[pos] == '\\')
4408                 {
4409                         /* Backslash quotes the next character */
4410                         pos++;
4411                         if (pos >= pattlen)
4412                                 break;
4413                         sel *= FIXED_CHAR_SEL;
4414                 }
4415                 else
4416                         sel *= FIXED_CHAR_SEL;
4417         }
4418         /* Could get sel > 1 if multiple wildcards */
4419         if (sel > 1.0)
4420                 sel = 1.0;
4421
4422         pfree(patt);
4423         return sel;
4424 }
4425
4426 static Selectivity
4427 regex_selectivity_sub(char *patt, int pattlen, bool case_insensitive)
4428 {
4429         Selectivity sel = 1.0;
4430         int                     paren_depth = 0;
4431         int                     paren_pos = 0;  /* dummy init to keep compiler quiet */
4432         int                     pos;
4433
4434         for (pos = 0; pos < pattlen; pos++)
4435         {
4436                 if (patt[pos] == '(')
4437                 {
4438                         if (paren_depth == 0)
4439                                 paren_pos = pos;        /* remember start of parenthesized item */
4440                         paren_depth++;
4441                 }
4442                 else if (patt[pos] == ')' && paren_depth > 0)
4443                 {
4444                         paren_depth--;
4445                         if (paren_depth == 0)
4446                                 sel *= regex_selectivity_sub(patt + (paren_pos + 1),
4447                                                                                          pos - (paren_pos + 1),
4448                                                                                          case_insensitive);
4449                 }
4450                 else if (patt[pos] == '|' && paren_depth == 0)
4451                 {
4452                         /*
4453                          * If unquoted | is present at paren level 0 in pattern, we have
4454                          * multiple alternatives; sum their probabilities.
4455                          */
4456                         sel += regex_selectivity_sub(patt + (pos + 1),
4457                                                                                  pattlen - (pos + 1),
4458                                                                                  case_insensitive);
4459                         break;                          /* rest of pattern is now processed */
4460                 }
4461                 else if (patt[pos] == '[')
4462                 {
4463                         bool            negclass = false;
4464
4465                         if (patt[++pos] == '^')
4466                         {
4467                                 negclass = true;
4468                                 pos++;
4469                         }
4470                         if (patt[pos] == ']')           /* ']' at start of class is not
4471                                                                                  * special */
4472                                 pos++;
4473                         while (pos < pattlen && patt[pos] != ']')
4474                                 pos++;
4475                         if (paren_depth == 0)
4476                                 sel *= (negclass ? (1.0 - CHAR_RANGE_SEL) : CHAR_RANGE_SEL);
4477                 }
4478                 else if (patt[pos] == '.')
4479                 {
4480                         if (paren_depth == 0)
4481                                 sel *= ANY_CHAR_SEL;
4482                 }
4483                 else if (patt[pos] == '*' ||
4484                                  patt[pos] == '?' ||
4485                                  patt[pos] == '+')
4486                 {
4487                         /* Ought to be smarter about quantifiers... */
4488                         if (paren_depth == 0)
4489                                 sel *= PARTIAL_WILDCARD_SEL;
4490                 }
4491                 else if (patt[pos] == '{')
4492                 {
4493                         while (pos < pattlen && patt[pos] != '}')
4494                                 pos++;
4495                         if (paren_depth == 0)
4496                                 sel *= PARTIAL_WILDCARD_SEL;
4497                 }
4498                 else if (patt[pos] == '\\')
4499                 {
4500                         /* backslash quotes the next character */
4501                         pos++;
4502                         if (pos >= pattlen)
4503                                 break;
4504                         if (paren_depth == 0)
4505                                 sel *= FIXED_CHAR_SEL;
4506                 }
4507                 else
4508                 {
4509                         if (paren_depth == 0)
4510                                 sel *= FIXED_CHAR_SEL;
4511                 }
4512         }
4513         /* Could get sel > 1 if multiple wildcards */
4514         if (sel > 1.0)
4515                 sel = 1.0;
4516         return sel;
4517 }
4518
4519 static Selectivity
4520 regex_selectivity(Const *patt_const, bool case_insensitive)
4521 {
4522         Selectivity sel;
4523         char       *patt;
4524         int                     pattlen;
4525         Oid                     typeid = patt_const->consttype;
4526
4527         /*
4528          * Should be unnecessary, there are no bytea regex operators defined. As
4529          * such, it should be noted that the rest of this function has *not* been
4530          * made safe for binary (possibly NULL containing) strings.
4531          */
4532         if (typeid == BYTEAOID)
4533                 ereport(ERROR,
4534                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
4535                  errmsg("regular-expression matching not supported on type bytea")));
4536
4537         /* the right-hand const is type text for all of these */
4538         patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
4539         pattlen = strlen(patt);
4540
4541         /* If patt doesn't end with $, consider it to have a trailing wildcard */
4542         if (pattlen > 0 && patt[pattlen - 1] == '$' &&
4543                 (pattlen == 1 || patt[pattlen - 2] != '\\'))
4544         {
4545                 /* has trailing $ */
4546                 sel = regex_selectivity_sub(patt, pattlen - 1, case_insensitive);
4547         }
4548         else
4549         {
4550                 /* no trailing $ */
4551                 sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
4552                 sel *= FULL_WILDCARD_SEL;
4553                 if (sel > 1.0)
4554                         sel = 1.0;
4555         }
4556         return sel;
4557 }
4558
4559 static Selectivity
4560 pattern_selectivity(Const *patt, Pattern_Type ptype)
4561 {
4562         Selectivity result;
4563
4564         switch (ptype)
4565         {
4566                 case Pattern_Type_Like:
4567                         result = like_selectivity(patt, false);
4568                         break;
4569                 case Pattern_Type_Like_IC:
4570                         result = like_selectivity(patt, true);
4571                         break;
4572                 case Pattern_Type_Regex:
4573                         result = regex_selectivity(patt, false);
4574                         break;
4575                 case Pattern_Type_Regex_IC:
4576                         result = regex_selectivity(patt, true);
4577                         break;
4578                 default:
4579                         elog(ERROR, "unrecognized ptype: %d", (int) ptype);
4580                         result = 1.0;           /* keep compiler quiet */
4581                         break;
4582         }
4583         return result;
4584 }
4585
4586
4587 /*
4588  * Try to generate a string greater than the given string or any
4589  * string it is a prefix of.  If successful, return a palloc'd string
4590  * in the form of a Const pointer; else return NULL.
4591  *
4592  * The key requirement here is that given a prefix string, say "foo",
4593  * we must be able to generate another string "fop" that is greater
4594  * than all strings "foobar" starting with "foo".
4595  *
4596  * If we max out the righthand byte, truncate off the last character
4597  * and start incrementing the next.  For example, if "z" were the last
4598  * character in the sort order, then we could produce "foo" as a
4599  * string greater than "fonz".
4600  *
4601  * This could be rather slow in the worst case, but in most cases we
4602  * won't have to try more than one or two strings before succeeding.
4603  *
4604  * NOTE: at present this assumes we are in the C locale, so that simple
4605  * bytewise comparison applies.  However, we might be in a multibyte
4606  * encoding such as UTF8, so we do have to watch out for generating
4607  * invalid encoding sequences.
4608  */
4609 Const *
4610 make_greater_string(const Const *str_const)
4611 {
4612         Oid                     datatype = str_const->consttype;
4613         char       *workstr;
4614         int                     len;
4615
4616         /* Get the string and a modifiable copy */
4617         if (datatype == NAMEOID)
4618         {
4619                 workstr = DatumGetCString(DirectFunctionCall1(nameout,
4620                                                                                                           str_const->constvalue));
4621                 len = strlen(workstr);
4622         }
4623         else if (datatype == BYTEAOID)
4624         {
4625                 bytea      *bstr = DatumGetByteaP(str_const->constvalue);
4626
4627                 len = VARSIZE(bstr) - VARHDRSZ;
4628                 workstr = (char *) palloc(len);
4629                 memcpy(workstr, VARDATA(bstr), len);
4630                 if ((Pointer) bstr != DatumGetPointer(str_const->constvalue))
4631                         pfree(bstr);
4632         }
4633         else
4634         {
4635                 workstr = DatumGetCString(DirectFunctionCall1(textout,
4636                                                                                                           str_const->constvalue));
4637                 len = strlen(workstr);
4638         }
4639
4640         while (len > 0)
4641         {
4642                 unsigned char *lastchar = (unsigned char *) (workstr + len - 1);
4643                 unsigned char savelastchar = *lastchar;
4644
4645                 /*
4646                  * Try to generate a larger string by incrementing the last byte.
4647                  */
4648                 while (*lastchar < (unsigned char) 255)
4649                 {
4650                         Const      *workstr_const;
4651
4652                         (*lastchar)++;
4653
4654                         if (datatype != BYTEAOID)
4655                         {
4656                                 /* do not generate invalid encoding sequences */
4657                                 if (!pg_verifymbstr(workstr, len, true))
4658                                         continue;
4659                                 workstr_const = string_to_const(workstr, datatype);
4660                         }
4661                         else
4662                                 workstr_const = string_to_bytea_const(workstr, len);
4663
4664                         pfree(workstr);
4665                         return workstr_const;
4666                 }
4667
4668                 /* restore last byte so we don't confuse pg_mbcliplen */
4669                 *lastchar = savelastchar;
4670
4671                 /*
4672                  * Truncate off the last character, which might be more than 1 byte,
4673                  * depending on the character encoding.
4674                  */
4675                 if (datatype != BYTEAOID && pg_database_encoding_max_length() > 1)
4676                         len = pg_mbcliplen(workstr, len, len - 1);
4677                 else
4678                         len -= 1;
4679
4680                 if (datatype != BYTEAOID)
4681                         workstr[len] = '\0';
4682         }
4683
4684         /* Failed... */
4685         pfree(workstr);
4686
4687         return NULL;
4688 }
4689
4690 /*
4691  * Generate a Datum of the appropriate type from a C string.
4692  * Note that all of the supported types are pass-by-ref, so the
4693  * returned value should be pfree'd if no longer needed.
4694  */
4695 static Datum
4696 string_to_datum(const char *str, Oid datatype)
4697 {
4698         Assert(str != NULL);
4699
4700         /*
4701          * We cheat a little by assuming that textin() will do for bpchar and
4702          * varchar constants too...
4703          */
4704         if (datatype == NAMEOID)
4705                 return DirectFunctionCall1(namein, CStringGetDatum(str));
4706         else if (datatype == BYTEAOID)
4707                 return DirectFunctionCall1(byteain, CStringGetDatum(str));
4708         else
4709                 return DirectFunctionCall1(textin, CStringGetDatum(str));
4710 }
4711
4712 /*
4713  * Generate a Const node of the appropriate type from a C string.
4714  */
4715 static Const *
4716 string_to_const(const char *str, Oid datatype)
4717 {
4718         Datum           conval = string_to_datum(str, datatype);
4719
4720         return makeConst(datatype, ((datatype == NAMEOID) ? NAMEDATALEN : -1),
4721                                          conval, false, false);
4722 }
4723
4724 /*
4725  * Generate a Const node of bytea type from a binary C string and a length.
4726  */
4727 static Const *
4728 string_to_bytea_const(const char *str, size_t str_len)
4729 {
4730         bytea      *bstr = palloc(VARHDRSZ + str_len);
4731         Datum           conval;
4732
4733         memcpy(VARDATA(bstr), str, str_len);
4734         VARATT_SIZEP(bstr) = VARHDRSZ + str_len;
4735         conval = PointerGetDatum(bstr);
4736
4737         return makeConst(BYTEAOID, -1, conval, false, false);
4738 }
4739
4740 /*-------------------------------------------------------------------------
4741  *
4742  * Index cost estimation functions
4743  *
4744  * genericcostestimate is a general-purpose estimator for use when we
4745  * don't have any better idea about how to estimate.  Index-type-specific
4746  * knowledge can be incorporated in the type-specific routines.
4747  *
4748  * One bit of index-type-specific knowledge we can relatively easily use
4749  * in genericcostestimate is the estimate of the number of index tuples
4750  * visited.  If numIndexTuples is not 0 then it is used as the estimate,
4751  * otherwise we compute a generic estimate.
4752  *
4753  *-------------------------------------------------------------------------
4754  */
4755
4756 static void
4757 genericcostestimate(PlannerInfo *root,
4758                                         IndexOptInfo *index, List *indexQuals,
4759                                         RelOptInfo *outer_rel,
4760                                         double numIndexTuples,
4761                                         Cost *indexStartupCost,
4762                                         Cost *indexTotalCost,
4763                                         Selectivity *indexSelectivity,
4764                                         double *indexCorrelation)
4765 {
4766         double          numIndexPages;
4767         double          num_sa_scans;
4768         double          num_outer_scans;
4769         double          num_scans;
4770         QualCost        index_qual_cost;
4771         double          qual_op_cost;
4772         double          qual_arg_cost;
4773         List       *selectivityQuals;
4774         ListCell   *l;
4775
4776         /*
4777          * If the index is partial, AND the index predicate with the explicitly
4778          * given indexquals to produce a more accurate idea of the index
4779          * selectivity.  This may produce redundant clauses.  We get rid of exact
4780          * duplicates in the code below.  We expect that most cases of partial
4781          * redundancy (such as "x < 4" from the qual and "x < 5" from the
4782          * predicate) will be recognized and handled correctly by
4783          * clauselist_selectivity().  This assumption is somewhat fragile, since
4784          * it depends on predicate_implied_by() and clauselist_selectivity()
4785          * having similar capabilities, and there are certainly many cases where
4786          * we will end up with a too-low selectivity estimate.  This will bias the
4787          * system in favor of using partial indexes where possible, which is not
4788          * necessarily a bad thing. But it'd be nice to do better someday.
4789          *
4790          * Note that index->indpred and indexQuals are both in implicit-AND form,
4791          * so ANDing them together just takes merging the lists.  However,
4792          * eliminating duplicates is a bit trickier because indexQuals contains
4793          * RestrictInfo nodes and the indpred does not.  It is okay to pass a
4794          * mixed list to clauselist_selectivity, but we have to work a bit to
4795          * generate a list without logical duplicates.  (We could just list_union
4796          * indpred and strippedQuals, but then we'd not get caching of per-qual
4797          * selectivity estimates.)
4798          */
4799         if (index->indpred != NIL)
4800         {
4801                 List       *strippedQuals;
4802                 List       *predExtraQuals;
4803
4804                 strippedQuals = get_actual_clauses(indexQuals);
4805                 predExtraQuals = list_difference(index->indpred, strippedQuals);
4806                 selectivityQuals = list_concat(predExtraQuals, indexQuals);
4807         }
4808         else
4809                 selectivityQuals = indexQuals;
4810
4811         /*
4812          * Check for ScalarArrayOpExpr index quals, and estimate the number of
4813          * index scans that will be performed.
4814          */
4815         num_sa_scans = 1;
4816         foreach(l, indexQuals)
4817         {
4818                 RestrictInfo *rinfo = (RestrictInfo *) lfirst(l);
4819
4820                 if (IsA(rinfo->clause, ScalarArrayOpExpr))
4821                 {
4822                         ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) rinfo->clause;
4823                         int                     alength = estimate_array_length(lsecond(saop->args));
4824
4825                         if (alength > 1)
4826                                 num_sa_scans *= alength;
4827                 }
4828         }
4829
4830         /* Estimate the fraction of main-table tuples that will be visited */
4831         *indexSelectivity = clauselist_selectivity(root, selectivityQuals,
4832                                                                                            index->rel->relid,
4833                                                                                            JOIN_INNER);
4834
4835         /*
4836          * If caller didn't give us an estimate, estimate the number of index
4837          * tuples that will be visited.  We do it in this rather peculiar-looking
4838          * way in order to get the right answer for partial indexes.
4839          */
4840         if (numIndexTuples <= 0.0)
4841         {
4842                 numIndexTuples = *indexSelectivity * index->rel->tuples;
4843
4844                 /*
4845                  * The above calculation counts all the tuples visited across all
4846                  * scans induced by ScalarArrayOpExpr nodes.  We want to consider the
4847                  * average per-indexscan number, so adjust.  This is a handy place to
4848                  * round to integer, too.  (If caller supplied tuple estimate, it's
4849                  * responsible for handling these considerations.)
4850                  */
4851                 numIndexTuples = rint(numIndexTuples / num_sa_scans);
4852         }
4853
4854         /*
4855          * We can bound the number of tuples by the index size in any case. Also,
4856          * always estimate at least one tuple is touched, even when
4857          * indexSelectivity estimate is tiny.
4858          */
4859         if (numIndexTuples > index->tuples)
4860                 numIndexTuples = index->tuples;
4861         if (numIndexTuples < 1.0)
4862                 numIndexTuples = 1.0;
4863
4864         /*
4865          * Estimate the number of index pages that will be retrieved.
4866          *
4867          * We use the simplistic method of taking a pro-rata fraction of the total
4868          * number of index pages.  In effect, this counts only leaf pages and not
4869          * any overhead such as index metapage or upper tree levels. In practice
4870          * this seems a better approximation than charging for access to the upper
4871          * levels, perhaps because those tend to stay in cache under load.
4872          */
4873         if (index->pages > 1 && index->tuples > 1)
4874                 numIndexPages = ceil(numIndexTuples * index->pages / index->tuples);
4875         else
4876                 numIndexPages = 1.0;
4877
4878         /*
4879          * Now compute the disk access costs.
4880          *
4881          * The above calculations are all per-index-scan.  However, if we are in a
4882          * nestloop inner scan, we can expect the scan to be repeated (with
4883          * different search keys) for each row of the outer relation.  Likewise,
4884          * ScalarArrayOpExpr quals result in multiple index scans.      This creates
4885          * the potential for cache effects to reduce the number of disk page
4886          * fetches needed.      We want to estimate the average per-scan I/O cost in
4887          * the presence of caching.
4888          *
4889          * We use the Mackert-Lohman formula (see costsize.c for details) to
4890          * estimate the total number of page fetches that occur.  While this
4891          * wasn't what it was designed for, it seems a reasonable model anyway.
4892          * Note that we are counting pages not tuples anymore, so we take N = T =
4893          * index size, as if there were one "tuple" per page.
4894          */
4895         if (outer_rel != NULL && outer_rel->rows > 1)
4896         {
4897                 num_outer_scans = outer_rel->rows;
4898                 num_scans = num_sa_scans * num_outer_scans;
4899         }
4900         else
4901         {
4902                 num_outer_scans = 1;
4903                 num_scans = num_sa_scans;
4904         }
4905
4906         if (num_scans > 1)
4907         {
4908                 double          pages_fetched;
4909
4910                 /* total page fetches ignoring cache effects */
4911                 pages_fetched = numIndexPages * num_scans;
4912
4913                 /* use Mackert and Lohman formula to adjust for cache effects */
4914                 pages_fetched = index_pages_fetched(pages_fetched,
4915                                                                                         index->pages,
4916                                                                                         (double) index->pages,
4917                                                                                         root);
4918
4919                 /*
4920                  * Now compute the total disk access cost, and then report a pro-rated
4921                  * share for each outer scan.  (Don't pro-rate for ScalarArrayOpExpr,
4922                  * since that's internal to the indexscan.)
4923                  */
4924                 *indexTotalCost = (pages_fetched * random_page_cost) / num_outer_scans;
4925         }
4926         else
4927         {
4928                 /*
4929                  * For a single index scan, we just charge random_page_cost per page
4930                  * touched.
4931                  */
4932                 *indexTotalCost = numIndexPages * random_page_cost;
4933         }
4934
4935         /*
4936          * A difficulty with the leaf-pages-only cost approach is that for small
4937          * selectivities (eg, single index tuple fetched) all indexes will look
4938          * equally attractive because we will estimate exactly 1 leaf page to be
4939          * fetched.  All else being equal, we should prefer physically smaller
4940          * indexes over larger ones.  (An index might be smaller because it is
4941          * partial or because it contains fewer columns; presumably the other
4942          * columns in the larger index aren't useful to the query, or the larger
4943          * index would have better selectivity.)
4944          *
4945          * We can deal with this by adding a very small "fudge factor" that
4946          * depends on the index size.  The fudge factor used here is one
4947          * random_page_cost per 100000 index pages, which should be small enough
4948          * to not alter index-vs-seqscan decisions, but will prevent indexes of
4949          * different sizes from looking exactly equally attractive.
4950          */
4951         *indexTotalCost += index->pages * random_page_cost / 100000.0;
4952
4953         /*
4954          * CPU cost: any complex expressions in the indexquals will need to be
4955          * evaluated once at the start of the scan to reduce them to runtime keys
4956          * to pass to the index AM (see nodeIndexscan.c).  We model the per-tuple
4957          * CPU costs as cpu_index_tuple_cost plus one cpu_operator_cost per
4958          * indexqual operator.  Because we have numIndexTuples as a per-scan
4959          * number, we have to multiply by num_sa_scans to get the correct result
4960          * for ScalarArrayOpExpr cases.
4961          *
4962          * Note: this neglects the possible costs of rechecking lossy operators
4963          * and OR-clause expressions.  Detecting that that might be needed seems
4964          * more expensive than it's worth, though, considering all the other
4965          * inaccuracies here ...
4966          */
4967         cost_qual_eval(&index_qual_cost, indexQuals);
4968         qual_op_cost = cpu_operator_cost * list_length(indexQuals);
4969         qual_arg_cost = index_qual_cost.startup +
4970                 index_qual_cost.per_tuple - qual_op_cost;
4971         if (qual_arg_cost < 0)          /* just in case... */
4972                 qual_arg_cost = 0;
4973         *indexStartupCost = qual_arg_cost;
4974         *indexTotalCost += qual_arg_cost;
4975         *indexTotalCost += numIndexTuples * num_sa_scans * (cpu_index_tuple_cost + qual_op_cost);
4976
4977         /*
4978          * We also add a CPU-cost component to represent the general costs of
4979          * starting an indexscan, such as analysis of btree index keys and
4980          * initial tree descent.  This is estimated at 100x cpu_operator_cost,
4981          * which is a bit arbitrary but seems the right order of magnitude.
4982          * (As noted above, we don't charge any I/O for touching upper tree
4983          * levels, but charging nothing at all has been found too optimistic.)
4984          *
4985          * Although this is startup cost with respect to any one scan, we add
4986          * it to the "total" cost component because it's only very interesting
4987          * in the many-ScalarArrayOpExpr-scan case, and there it will be paid
4988          * over the life of the scan node.
4989          */
4990         *indexTotalCost += num_sa_scans * 100.0 * cpu_operator_cost;
4991
4992         /*
4993          * Generic assumption about index correlation: there isn't any.
4994          */
4995         *indexCorrelation = 0.0;
4996 }
4997
4998
4999 Datum
5000 btcostestimate(PG_FUNCTION_ARGS)
5001 {
5002         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
5003         IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
5004         List       *indexQuals = (List *) PG_GETARG_POINTER(2);
5005         RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(3);
5006         Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(4);
5007         Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(5);
5008         Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6);
5009         double     *indexCorrelation = (double *) PG_GETARG_POINTER(7);
5010         Oid                     relid;
5011         AttrNumber      colnum;
5012         HeapTuple       tuple;
5013         double          numIndexTuples;
5014         List       *indexBoundQuals;
5015         int                     indexcol;
5016         bool            eqQualHere;
5017         bool            found_saop;
5018         double          num_sa_scans;
5019         ListCell   *l;
5020
5021         /*
5022          * For a btree scan, only leading '=' quals plus inequality quals for the
5023          * immediately next attribute contribute to index selectivity (these are
5024          * the "boundary quals" that determine the starting and stopping points of
5025          * the index scan).  Additional quals can suppress visits to the heap, so
5026          * it's OK to count them in indexSelectivity, but they should not count
5027          * for estimating numIndexTuples.  So we must examine the given indexQuals
5028          * to find out which ones count as boundary quals.      We rely on the
5029          * knowledge that they are given in index column order.
5030          *
5031          * For a RowCompareExpr, we consider only the first column, just as
5032          * rowcomparesel() does.
5033          *
5034          * If there's a ScalarArrayOpExpr in the quals, we'll actually perform N
5035          * index scans not one, but the ScalarArrayOpExpr's operator can be
5036          * considered to act the same as it normally does.
5037          */
5038         indexBoundQuals = NIL;
5039         indexcol = 0;
5040         eqQualHere = false;
5041         found_saop = false;
5042         num_sa_scans = 1;
5043         foreach(l, indexQuals)
5044         {
5045                 RestrictInfo *rinfo = (RestrictInfo *) lfirst(l);
5046                 Expr       *clause;
5047                 Node       *leftop,
5048                                    *rightop;
5049                 Oid                     clause_op;
5050                 int                     op_strategy;
5051
5052                 Assert(IsA(rinfo, RestrictInfo));
5053                 clause = rinfo->clause;
5054                 if (IsA(clause, OpExpr))
5055                 {
5056                         leftop = get_leftop(clause);
5057                         rightop = get_rightop(clause);
5058                         clause_op = ((OpExpr *) clause)->opno;
5059                 }
5060                 else if (IsA(clause, RowCompareExpr))
5061                 {
5062                         RowCompareExpr *rc = (RowCompareExpr *) clause;
5063
5064                         leftop = (Node *) linitial(rc->largs);
5065                         rightop = (Node *) linitial(rc->rargs);
5066                         clause_op = linitial_oid(rc->opnos);
5067                 }
5068                 else if (IsA(clause, ScalarArrayOpExpr))
5069                 {
5070                         ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause;
5071
5072                         leftop = (Node *) linitial(saop->args);
5073                         rightop = (Node *) lsecond(saop->args);
5074                         clause_op = saop->opno;
5075                         found_saop = true;
5076                 }
5077                 else
5078                 {
5079                         elog(ERROR, "unsupported indexqual type: %d",
5080                                  (int) nodeTag(clause));
5081                         continue;                       /* keep compiler quiet */
5082                 }
5083                 if (match_index_to_operand(leftop, indexcol, index))
5084                 {
5085                         /* clause_op is correct */
5086                 }
5087                 else if (match_index_to_operand(rightop, indexcol, index))
5088                 {
5089                         /* Must flip operator to get the opfamily member */
5090                         clause_op = get_commutator(clause_op);
5091                 }
5092                 else
5093                 {
5094                         /* Must be past the end of quals for indexcol, try next */
5095                         if (!eqQualHere)
5096                                 break;                  /* done if no '=' qual for indexcol */
5097                         indexcol++;
5098                         eqQualHere = false;
5099                         if (match_index_to_operand(leftop, indexcol, index))
5100                         {
5101                                 /* clause_op is correct */
5102                         }
5103                         else if (match_index_to_operand(rightop, indexcol, index))
5104                         {
5105                                 /* Must flip operator to get the opfamily member */
5106                                 clause_op = get_commutator(clause_op);
5107                         }
5108                         else
5109                         {
5110                                 /* No quals for new indexcol, so we are done */
5111                                 break;
5112                         }
5113                 }
5114                 op_strategy = get_op_opfamily_strategy(clause_op,
5115                                                                                            index->opfamily[indexcol]);
5116                 Assert(op_strategy != 0);               /* not a member of opfamily?? */
5117                 if (op_strategy == BTEqualStrategyNumber)
5118                         eqQualHere = true;
5119                 /* count up number of SA scans induced by indexBoundQuals only */
5120                 if (IsA(clause, ScalarArrayOpExpr))
5121                 {
5122                         ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause;
5123                         int                     alength = estimate_array_length(lsecond(saop->args));
5124
5125                         if (alength > 1)
5126                                 num_sa_scans *= alength;
5127                 }
5128                 indexBoundQuals = lappend(indexBoundQuals, rinfo);
5129         }
5130
5131         /*
5132          * If index is unique and we found an '=' clause for each column, we can
5133          * just assume numIndexTuples = 1 and skip the expensive
5134          * clauselist_selectivity calculations.
5135          */
5136         if (index->unique &&
5137                 indexcol == index->ncolumns - 1 &&
5138                 eqQualHere &&
5139                 !found_saop)
5140                 numIndexTuples = 1.0;
5141         else
5142         {
5143                 Selectivity btreeSelectivity;
5144
5145                 btreeSelectivity = clauselist_selectivity(root, indexBoundQuals,
5146                                                                                                   index->rel->relid,
5147                                                                                                   JOIN_INNER);
5148                 numIndexTuples = btreeSelectivity * index->rel->tuples;
5149                 /*
5150                  * As in genericcostestimate(), we have to adjust for any
5151                  * ScalarArrayOpExpr quals included in indexBoundQuals, and then
5152                  * round to integer.
5153                  */
5154                 numIndexTuples = rint(numIndexTuples / num_sa_scans);
5155         }
5156
5157         genericcostestimate(root, index, indexQuals, outer_rel, numIndexTuples,
5158                                                 indexStartupCost, indexTotalCost,
5159                                                 indexSelectivity, indexCorrelation);
5160
5161         /*
5162          * If we can get an estimate of the first column's ordering correlation C
5163          * from pg_statistic, estimate the index correlation as C for a
5164          * single-column index, or C * 0.75 for multiple columns. (The idea here
5165          * is that multiple columns dilute the importance of the first column's
5166          * ordering, but don't negate it entirely.  Before 8.0 we divided the
5167          * correlation by the number of columns, but that seems too strong.)
5168          *
5169          * We can skip all this if we found a ScalarArrayOpExpr, because then the
5170          * call must be for a bitmap index scan, and the caller isn't going to
5171          * care what the index correlation is.
5172          */
5173         if (found_saop)
5174                 PG_RETURN_VOID();
5175
5176         if (index->indexkeys[0] != 0)
5177         {
5178                 /* Simple variable --- look to stats for the underlying table */
5179                 relid = getrelid(index->rel->relid, root->parse->rtable);
5180                 Assert(relid != InvalidOid);
5181                 colnum = index->indexkeys[0];
5182         }
5183         else
5184         {
5185                 /* Expression --- maybe there are stats for the index itself */
5186                 relid = index->indexoid;
5187                 colnum = 1;
5188         }
5189
5190         tuple = SearchSysCache(STATRELATT,
5191                                                    ObjectIdGetDatum(relid),
5192                                                    Int16GetDatum(colnum),
5193                                                    0, 0);
5194
5195         if (HeapTupleIsValid(tuple))
5196         {
5197                 float4     *numbers;
5198                 int                     nnumbers;
5199
5200                 if (get_attstatsslot(tuple, InvalidOid, 0,
5201                                                          STATISTIC_KIND_CORRELATION,
5202                                                          index->fwdsortop[0],
5203                                                          NULL, NULL, &numbers, &nnumbers))
5204                 {
5205                         double          varCorrelation;
5206
5207                         Assert(nnumbers == 1);
5208                         varCorrelation = numbers[0];
5209
5210                         if (index->ncolumns > 1)
5211                                 *indexCorrelation = varCorrelation * 0.75;
5212                         else
5213                                 *indexCorrelation = varCorrelation;
5214
5215                         free_attstatsslot(InvalidOid, NULL, 0, numbers, nnumbers);
5216                 }
5217                 else if (get_attstatsslot(tuple, InvalidOid, 0,
5218                                                                   STATISTIC_KIND_CORRELATION,
5219                                                                   index->revsortop[0],
5220                                                                   NULL, NULL, &numbers, &nnumbers))
5221                 {
5222                         double          varCorrelation;
5223
5224                         Assert(nnumbers == 1);
5225                         varCorrelation = numbers[0];
5226
5227                         if (index->ncolumns > 1)
5228                                 *indexCorrelation = - varCorrelation * 0.75;
5229                         else
5230                                 *indexCorrelation = - varCorrelation;
5231
5232                         free_attstatsslot(InvalidOid, NULL, 0, numbers, nnumbers);
5233                 }
5234                 ReleaseSysCache(tuple);
5235         }
5236
5237         PG_RETURN_VOID();
5238 }
5239
5240 Datum
5241 hashcostestimate(PG_FUNCTION_ARGS)
5242 {
5243         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
5244         IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
5245         List       *indexQuals = (List *) PG_GETARG_POINTER(2);
5246         RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(3);
5247         Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(4);
5248         Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(5);
5249         Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6);
5250         double     *indexCorrelation = (double *) PG_GETARG_POINTER(7);
5251
5252         genericcostestimate(root, index, indexQuals, outer_rel, 0.0,
5253                                                 indexStartupCost, indexTotalCost,
5254                                                 indexSelectivity, indexCorrelation);
5255
5256         PG_RETURN_VOID();
5257 }
5258
5259 Datum
5260 gistcostestimate(PG_FUNCTION_ARGS)
5261 {
5262         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
5263         IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
5264         List       *indexQuals = (List *) PG_GETARG_POINTER(2);
5265         RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(3);
5266         Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(4);
5267         Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(5);
5268         Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6);
5269         double     *indexCorrelation = (double *) PG_GETARG_POINTER(7);
5270
5271         genericcostestimate(root, index, indexQuals, outer_rel, 0.0,
5272                                                 indexStartupCost, indexTotalCost,
5273                                                 indexSelectivity, indexCorrelation);
5274
5275         PG_RETURN_VOID();
5276 }
5277
5278 Datum
5279 gincostestimate(PG_FUNCTION_ARGS)
5280 {
5281         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
5282         IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
5283         List       *indexQuals = (List *) PG_GETARG_POINTER(2);
5284         RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(3);
5285         Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(4);
5286         Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(5);
5287         Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6);
5288         double     *indexCorrelation = (double *) PG_GETARG_POINTER(7);
5289
5290         genericcostestimate(root, index, indexQuals, outer_rel, 0.0,
5291                                                 indexStartupCost, indexTotalCost,
5292                                                 indexSelectivity, indexCorrelation);
5293
5294         PG_RETURN_VOID();
5295 }