granicus.if.org Git - postgresql/blob - src/backend/utils/adt/selfuncs.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * selfuncs.c
   4  *        Selectivity functions and index cost estimation functions for
   5  *        standard operators and index access methods.
   6  *
   7  *        Selectivity routines are registered in the pg_operator catalog
   8  *        in the "oprrest" and "oprjoin" attributes.
   9  *
  10  *        Index cost functions are registered in the pg_am catalog
  11  *        in the "amcostestimate" attribute.
  12  *
  13  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  14  * Portions Copyright (c) 1994, Regents of the University of California
  15  *
  16  *
  17  * IDENTIFICATION
  18  *        $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.219 2007/01/09 02:14:14 tgl Exp $
  19  *
  20  *-------------------------------------------------------------------------
  21  */
  22
  23 /*----------
  24  * Operator selectivity estimation functions are called to estimate the
  25  * selectivity of WHERE clauses whose top-level operator is their operator.
  26  * We divide the problem into two cases:
  27  *              Restriction clause estimation: the clause involves vars of just
  28  *                      one relation.
  29  *              Join clause estimation: the clause involves vars of multiple rels.
  30  * Join selectivity estimation is far more difficult and usually less accurate
  31  * than restriction estimation.
  32  *
  33  * When dealing with the inner scan of a nestloop join, we consider the
  34  * join's joinclauses as restriction clauses for the inner relation, and
  35  * treat vars of the outer relation as parameters (a/k/a constants of unknown
  36  * values).  So, restriction estimators need to be able to accept an argument
  37  * telling which relation is to be treated as the variable.
  38  *
  39  * The call convention for a restriction estimator (oprrest function) is
  40  *
  41  *              Selectivity oprrest (PlannerInfo *root,
  42  *                                                       Oid operator,
  43  *                                                       List *args,
  44  *                                                       int varRelid);
  45  *
  46  * root: general information about the query (rtable and RelOptInfo lists
  47  * are particularly important for the estimator).
  48  * operator: OID of the specific operator in question.
  49  * args: argument list from the operator clause.
  50  * varRelid: if not zero, the relid (rtable index) of the relation to
  51  * be treated as the variable relation.  May be zero if the args list
  52  * is known to contain vars of only one relation.
  53  *
  54  * This is represented at the SQL level (in pg_proc) as
  55  *
  56  *              float8 oprrest (internal, oid, internal, int4);
  57  *
  58  * The call convention for a join estimator (oprjoin function) is similar
  59  * except that varRelid is not needed, and instead the join type is
  60  * supplied:
  61  *
  62  *              Selectivity oprjoin (PlannerInfo *root,
  63  *                                                       Oid operator,
  64  *                                                       List *args,
  65  *                                                       JoinType jointype);
  66  *
  67  *              float8 oprjoin (internal, oid, internal, int2);
  68  *
  69  * (We deliberately make the SQL signature different to facilitate
  70  * catching errors.)
  71  *----------
  72  */
  73
  74 #include "postgres.h"
  75
  76 #include <ctype.h>
  77 #include <math.h>
  78
  79 #include "catalog/pg_opfamily.h"
  80 #include "catalog/pg_statistic.h"
  81 #include "catalog/pg_type.h"
  82 #include "mb/pg_wchar.h"
  83 #include "nodes/makefuncs.h"
  84 #include "optimizer/clauses.h"
  85 #include "optimizer/cost.h"
  86 #include "optimizer/pathnode.h"
  87 #include "optimizer/paths.h"
  88 #include "optimizer/plancat.h"
  89 #include "optimizer/restrictinfo.h"
  90 #include "optimizer/var.h"
  91 #include "parser/parse_expr.h"
  92 #include "parser/parsetree.h"
  93 #include "utils/builtins.h"
  94 #include "utils/date.h"
  95 #include "utils/datum.h"
  96 #include "utils/lsyscache.h"
  97 #include "utils/nabstime.h"
  98 #include "utils/pg_locale.h"
  99 #include "utils/selfuncs.h"
 100 #include "utils/syscache.h"
 101
 102
 103 static double ineq_histogram_selectivity(VariableStatData *vardata,
 104                                                    FmgrInfo *opproc, bool isgt,
 105                                                    Datum constval, Oid consttype);
 106 static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
 107                                   Datum lobound, Datum hibound, Oid boundstypid,
 108                                   double *scaledlobound, double *scaledhibound);
 109 static double convert_numeric_to_scalar(Datum value, Oid typid);
 110 static void convert_string_to_scalar(char *value,
 111                                                  double *scaledvalue,
 112                                                  char *lobound,
 113                                                  double *scaledlobound,
 114                                                  char *hibound,
 115                                                  double *scaledhibound);
 116 static void convert_bytea_to_scalar(Datum value,
 117                                                 double *scaledvalue,
 118                                                 Datum lobound,
 119                                                 double *scaledlobound,
 120                                                 Datum hibound,
 121                                                 double *scaledhibound);
 122 static double convert_one_string_to_scalar(char *value,
 123                                                          int rangelo, int rangehi);
 124 static double convert_one_bytea_to_scalar(unsigned char *value, int valuelen,
 125                                                         int rangelo, int rangehi);
 126 static char *convert_string_datum(Datum value, Oid typid);
 127 static double convert_timevalue_to_scalar(Datum value, Oid typid);
 128 static bool get_variable_maximum(PlannerInfo *root, VariableStatData *vardata,
 129                                          Oid sortop, Datum *max);
 130 static Selectivity prefix_selectivity(VariableStatData *vardata,
 131                                    Oid vartype, Oid opfamily, Const *prefixcon);
 132 static Selectivity pattern_selectivity(Const *patt, Pattern_Type ptype);
 133 static Datum string_to_datum(const char *str, Oid datatype);
 134 static Const *string_to_const(const char *str, Oid datatype);
 135 static Const *string_to_bytea_const(const char *str, size_t str_len);
 136
 137
 138 /*
 139  *              eqsel                   - Selectivity of "=" for any data types.
 140  *
 141  * Note: this routine is also used to estimate selectivity for some
 142  * operators that are not "=" but have comparable selectivity behavior,
 143  * such as "~=" (geometric approximate-match).  Even for "=", we must
 144  * keep in mind that the left and right datatypes may differ.
 145  */
 146 Datum
 147 eqsel(PG_FUNCTION_ARGS)
 148 {
 149         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 150         Oid                     operator = PG_GETARG_OID(1);
 151         List       *args = (List *) PG_GETARG_POINTER(2);
 152         int                     varRelid = PG_GETARG_INT32(3);
 153         VariableStatData vardata;
 154         Node       *other;
 155         bool            varonleft;
 156         Datum      *values;
 157         int                     nvalues;
 158         float4     *numbers;
 159         int                     nnumbers;
 160         double          selec;
 161
 162         /*
 163          * If expression is not variable = something or something = variable, then
 164          * punt and return a default estimate.
 165          */
 166         if (!get_restriction_variable(root, args, varRelid,
 167                                                                   &vardata, &other, &varonleft))
 168                 PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
 169
 170         /*
 171          * If the something is a NULL constant, assume operator is strict and
 172          * return zero, ie, operator will never return TRUE.
 173          */
 174         if (IsA(other, Const) &&
 175                 ((Const *) other)->constisnull)
 176         {
 177                 ReleaseVariableStats(vardata);
 178                 PG_RETURN_FLOAT8(0.0);
 179         }
 180
 181         if (HeapTupleIsValid(vardata.statsTuple))
 182         {
 183                 Form_pg_statistic stats;
 184
 185                 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
 186
 187                 if (IsA(other, Const))
 188                 {
 189                         /* Variable is being compared to a known non-null constant */
 190                         Datum           constval = ((Const *) other)->constvalue;
 191                         bool            match = false;
 192                         int                     i;
 193
 194                         /*
 195                          * Is the constant "=" to any of the column's most common values?
 196                          * (Although the given operator may not really be "=", we will
 197                          * assume that seeing whether it returns TRUE is an appropriate
 198                          * test.  If you don't like this, maybe you shouldn't be using
 199                          * eqsel for your operator...)
 200                          */
 201                         if (get_attstatsslot(vardata.statsTuple,
 202                                                                  vardata.atttype, vardata.atttypmod,
 203                                                                  STATISTIC_KIND_MCV, InvalidOid,
 204                                                                  &values, &nvalues,
 205                                                                  &numbers, &nnumbers))
 206                         {
 207                                 FmgrInfo        eqproc;
 208
 209                                 fmgr_info(get_opcode(operator), &eqproc);
 210
 211                                 for (i = 0; i < nvalues; i++)
 212                                 {
 213                                         /* be careful to apply operator right way 'round */
 214                                         if (varonleft)
 215                                                 match = DatumGetBool(FunctionCall2(&eqproc,
 216                                                                                                                    values[i],
 217                                                                                                                    constval));
 218                                         else
 219                                                 match = DatumGetBool(FunctionCall2(&eqproc,
 220                                                                                                                    constval,
 221                                                                                                                    values[i]));
 222                                         if (match)
 223                                                 break;
 224                                 }
 225                         }
 226                         else
 227                         {
 228                                 /* no most-common-value info available */
 229                                 values = NULL;
 230                                 numbers = NULL;
 231                                 i = nvalues = nnumbers = 0;
 232                         }
 233
 234                         if (match)
 235                         {
 236                                 /*
 237                                  * Constant is "=" to this common value.  We know selectivity
 238                                  * exactly (or as exactly as ANALYZE could calculate it,
 239                                  * anyway).
 240                                  */
 241                                 selec = numbers[i];
 242                         }
 243                         else
 244                         {
 245                                 /*
 246                                  * Comparison is against a constant that is neither NULL nor
 247                                  * any of the common values.  Its selectivity cannot be more
 248                                  * than this:
 249                                  */
 250                                 double          sumcommon = 0.0;
 251                                 double          otherdistinct;
 252
 253                                 for (i = 0; i < nnumbers; i++)
 254                                         sumcommon += numbers[i];
 255                                 selec = 1.0 - sumcommon - stats->stanullfrac;
 256                                 CLAMP_PROBABILITY(selec);
 257
 258                                 /*
 259                                  * and in fact it's probably a good deal less. We approximate
 260                                  * that all the not-common values share this remaining
 261                                  * fraction equally, so we divide by the number of other
 262                                  * distinct values.
 263                                  */
 264                                 otherdistinct = get_variable_numdistinct(&vardata)
 265                                         - nnumbers;
 266                                 if (otherdistinct > 1)
 267                                         selec /= otherdistinct;
 268
 269                                 /*
 270                                  * Another cross-check: selectivity shouldn't be estimated as
 271                                  * more than the least common "most common value".
 272                                  */
 273                                 if (nnumbers > 0 && selec > numbers[nnumbers - 1])
 274                                         selec = numbers[nnumbers - 1];
 275                         }
 276
 277                         free_attstatsslot(vardata.atttype, values, nvalues,
 278                                                           numbers, nnumbers);
 279                 }
 280                 else
 281                 {
 282                         double          ndistinct;
 283
 284                         /*
 285                          * Search is for a value that we do not know a priori, but we will
 286                          * assume it is not NULL.  Estimate the selectivity as non-null
 287                          * fraction divided by number of distinct values, so that we get a
 288                          * result averaged over all possible values whether common or
 289                          * uncommon.  (Essentially, we are assuming that the not-yet-known
 290                          * comparison value is equally likely to be any of the possible
 291                          * values, regardless of their frequency in the table.  Is that a
 292                          * good idea?)
 293                          */
 294                         selec = 1.0 - stats->stanullfrac;
 295                         ndistinct = get_variable_numdistinct(&vardata);
 296                         if (ndistinct > 1)
 297                                 selec /= ndistinct;
 298
 299                         /*
 300                          * Cross-check: selectivity should never be estimated as more than
 301                          * the most common value's.
 302                          */
 303                         if (get_attstatsslot(vardata.statsTuple,
 304                                                                  vardata.atttype, vardata.atttypmod,
 305                                                                  STATISTIC_KIND_MCV, InvalidOid,
 306                                                                  NULL, NULL,
 307                                                                  &numbers, &nnumbers))
 308                         {
 309                                 if (nnumbers > 0 && selec > numbers[0])
 310                                         selec = numbers[0];
 311                                 free_attstatsslot(vardata.atttype, NULL, 0, numbers, nnumbers);
 312                         }
 313                 }
 314         }
 315         else
 316         {
 317                 /*
 318                  * No ANALYZE stats available, so make a guess using estimated number
 319                  * of distinct values and assuming they are equally common. (The guess
 320                  * is unlikely to be very good, but we do know a few special cases.)
 321                  */
 322                 selec = 1.0 / get_variable_numdistinct(&vardata);
 323         }
 324
 325         ReleaseVariableStats(vardata);
 326
 327         /* result should be in range, but make sure... */
 328         CLAMP_PROBABILITY(selec);
 329
 330         PG_RETURN_FLOAT8((float8) selec);
 331 }
 332
 333 /*
 334  *              neqsel                  - Selectivity of "!=" for any data types.
 335  *
 336  * This routine is also used for some operators that are not "!="
 337  * but have comparable selectivity behavior.  See above comments
 338  * for eqsel().
 339  */
 340 Datum
 341 neqsel(PG_FUNCTION_ARGS)
 342 {
 343         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 344         Oid                     operator = PG_GETARG_OID(1);
 345         List       *args = (List *) PG_GETARG_POINTER(2);
 346         int                     varRelid = PG_GETARG_INT32(3);
 347         Oid                     eqop;
 348         float8          result;
 349
 350         /*
 351          * We want 1 - eqsel() where the equality operator is the one associated
 352          * with this != operator, that is, its negator.
 353          */
 354         eqop = get_negator(operator);
 355         if (eqop)
 356         {
 357                 result = DatumGetFloat8(DirectFunctionCall4(eqsel,
 358                                                                                                         PointerGetDatum(root),
 359                                                                                                         ObjectIdGetDatum(eqop),
 360                                                                                                         PointerGetDatum(args),
 361                                                                                                         Int32GetDatum(varRelid)));
 362         }
 363         else
 364         {
 365                 /* Use default selectivity (should we raise an error instead?) */
 366                 result = DEFAULT_EQ_SEL;
 367         }
 368         result = 1.0 - result;
 369         PG_RETURN_FLOAT8(result);
 370 }
 371
 372 /*
 373  *      scalarineqsel           - Selectivity of "<", "<=", ">", ">=" for scalars.
 374  *
 375  * This is the guts of both scalarltsel and scalargtsel.  The caller has
 376  * commuted the clause, if necessary, so that we can treat the variable as
 377  * being on the left.  The caller must also make sure that the other side
 378  * of the clause is a non-null Const, and dissect same into a value and
 379  * datatype.
 380  *
 381  * This routine works for any datatype (or pair of datatypes) known to
 382  * convert_to_scalar().  If it is applied to some other datatype,
 383  * it will return a default estimate.
 384  */
 385 static double
 386 scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
 387                           VariableStatData *vardata, Datum constval, Oid consttype)
 388 {
 389         Form_pg_statistic stats;
 390         FmgrInfo        opproc;
 391         double          mcv_selec,
 392                                 hist_selec,
 393                                 sumcommon;
 394         double          selec;
 395
 396         if (!HeapTupleIsValid(vardata->statsTuple))
 397         {
 398                 /* no stats available, so default result */
 399                 return DEFAULT_INEQ_SEL;
 400         }
 401         stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
 402
 403         fmgr_info(get_opcode(operator), &opproc);
 404
 405         /*
 406          * If we have most-common-values info, add up the fractions of the MCV
 407          * entries that satisfy MCV OP CONST.  These fractions contribute directly
 408          * to the result selectivity.  Also add up the total fraction represented
 409          * by MCV entries.
 410          */
 411         mcv_selec = mcv_selectivity(vardata, &opproc, constval, true,
 412                                                                 &sumcommon);
 413
 414         /*
 415          * If there is a histogram, determine which bin the constant falls in, and
 416          * compute the resulting contribution to selectivity.
 417          */
 418         hist_selec = ineq_histogram_selectivity(vardata, &opproc, isgt,
 419                                                                                         constval, consttype);
 420
 421         /*
 422          * Now merge the results from the MCV and histogram calculations,
 423          * realizing that the histogram covers only the non-null values that are
 424          * not listed in MCV.
 425          */
 426         selec = 1.0 - stats->stanullfrac - sumcommon;
 427
 428         if (hist_selec > 0.0)
 429                 selec *= hist_selec;
 430         else
 431         {
 432                 /*
 433                  * If no histogram but there are values not accounted for by MCV,
 434                  * arbitrarily assume half of them will match.
 435                  */
 436                 selec *= 0.5;
 437         }
 438
 439         selec += mcv_selec;
 440
 441         /* result should be in range, but make sure... */
 442         CLAMP_PROBABILITY(selec);
 443
 444         return selec;
 445 }
 446
 447 /*
 448  *      mcv_selectivity                 - Examine the MCV list for selectivity estimates
 449  *
 450  * Determine the fraction of the variable's MCV population that satisfies
 451  * the predicate (VAR OP CONST), or (CONST OP VAR) if !varonleft.  Also
 452  * compute the fraction of the total column population represented by the MCV
 453  * list.  This code will work for any boolean-returning predicate operator.
 454  *
 455  * The function result is the MCV selectivity, and the fraction of the
 456  * total population is returned into *sumcommonp.  Zeroes are returned
 457  * if there is no MCV list.
 458  */
 459 double
 460 mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
 461                                 Datum constval, bool varonleft,
 462                                 double *sumcommonp)
 463 {
 464         double          mcv_selec,
 465                                 sumcommon;
 466         Datum      *values;
 467         int                     nvalues;
 468         float4     *numbers;
 469         int                     nnumbers;
 470         int                     i;
 471
 472         mcv_selec = 0.0;
 473         sumcommon = 0.0;
 474
 475         if (HeapTupleIsValid(vardata->statsTuple) &&
 476                 get_attstatsslot(vardata->statsTuple,
 477                                                  vardata->atttype, vardata->atttypmod,
 478                                                  STATISTIC_KIND_MCV, InvalidOid,
 479                                                  &values, &nvalues,
 480                                                  &numbers, &nnumbers))
 481         {
 482                 for (i = 0; i < nvalues; i++)
 483                 {
 484                         if (varonleft ?
 485                                 DatumGetBool(FunctionCall2(opproc,
 486                                                                                    values[i],
 487                                                                                    constval)) :
 488                                 DatumGetBool(FunctionCall2(opproc,
 489                                                                                    constval,
 490                                                                                    values[i])))
 491                                 mcv_selec += numbers[i];
 492                         sumcommon += numbers[i];
 493                 }
 494                 free_attstatsslot(vardata->atttype, values, nvalues,
 495                                                   numbers, nnumbers);
 496         }
 497
 498         *sumcommonp = sumcommon;
 499         return mcv_selec;
 500 }
 501
 502 /*
 503  *      histogram_selectivity   - Examine the histogram for selectivity estimates
 504  *
 505  * Determine the fraction of the variable's histogram entries that satisfy
 506  * the predicate (VAR OP CONST), or (CONST OP VAR) if !varonleft.
 507  *
 508  * This code will work for any boolean-returning predicate operator, whether
 509  * or not it has anything to do with the histogram sort operator.  We are
 510  * essentially using the histogram just as a representative sample.  However,
 511  * small histograms are unlikely to be all that representative, so the caller
 512  * should specify a minimum histogram size to use, and fall back on some
 513  * other approach if this routine fails.
 514  *
 515  * The caller also specifies n_skip, which causes us to ignore the first and
 516  * last n_skip histogram elements, on the grounds that they are outliers and
 517  * hence not very representative.  If in doubt, min_hist_size = 100 and
 518  * n_skip = 1 are reasonable values.
 519  *
 520  * The function result is the selectivity, or -1 if there is no histogram
 521  * or it's smaller than min_hist_size.
 522  *
 523  * Note that the result disregards both the most-common-values (if any) and
 524  * null entries.  The caller is expected to combine this result with
 525  * statistics for those portions of the column population.      It may also be
 526  * prudent to clamp the result range, ie, disbelieve exact 0 or 1 outputs.
 527  */
 528 double
 529 histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
 530                                           Datum constval, bool varonleft,
 531                                           int min_hist_size, int n_skip)
 532 {
 533         double          result;
 534         Datum      *values;
 535         int                     nvalues;
 536
 537         /* check sanity of parameters */
 538         Assert(n_skip >= 0);
 539         Assert(min_hist_size > 2 * n_skip);
 540
 541         if (HeapTupleIsValid(vardata->statsTuple) &&
 542                 get_attstatsslot(vardata->statsTuple,
 543                                                  vardata->atttype, vardata->atttypmod,
 544                                                  STATISTIC_KIND_HISTOGRAM, InvalidOid,
 545                                                  &values, &nvalues,
 546                                                  NULL, NULL))
 547         {
 548                 if (nvalues >= min_hist_size)
 549                 {
 550                         int                     nmatch = 0;
 551                         int                     i;
 552
 553                         for (i = n_skip; i < nvalues - n_skip; i++)
 554                         {
 555                                 if (varonleft ?
 556                                         DatumGetBool(FunctionCall2(opproc,
 557                                                                                            values[i],
 558                                                                                            constval)) :
 559                                         DatumGetBool(FunctionCall2(opproc,
 560                                                                                            constval,
 561                                                                                            values[i])))
 562                                         nmatch++;
 563                         }
 564                         result = ((double) nmatch) / ((double) (nvalues - 2 * n_skip));
 565                 }
 566                 else
 567                         result = -1;
 568                 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
 569         }
 570         else
 571                 result = -1;
 572
 573         return result;
 574 }
 575
 576 /*
 577  *      ineq_histogram_selectivity      - Examine the histogram for scalarineqsel
 578  *
 579  * Determine the fraction of the variable's histogram population that
 580  * satisfies the inequality condition, ie, VAR < CONST or VAR > CONST.
 581  *
 582  * Returns zero if there is no histogram (valid results will always be
 583  * greater than zero).
 584  *
 585  * Note that the result disregards both the most-common-values (if any) and
 586  * null entries.  The caller is expected to combine this result with
 587  * statistics for those portions of the column population.
 588  */
 589 static double
 590 ineq_histogram_selectivity(VariableStatData *vardata,
 591                                                    FmgrInfo *opproc, bool isgt,
 592                                                    Datum constval, Oid consttype)
 593 {
 594         double          hist_selec;
 595         Datum      *values;
 596         int                     nvalues;
 597
 598         hist_selec = 0.0;
 599
 600         /*
 601          * Someday, ANALYZE might store more than one histogram per rel/att,
 602          * corresponding to more than one possible sort ordering defined for the
 603          * column type.  However, to make that work we will need to figure out
 604          * which staop to search for --- it's not necessarily the one we have at
 605          * hand!  (For example, we might have a '<=' operator rather than the '<'
 606          * operator that will appear in staop.)  For now, assume that whatever
 607          * appears in pg_statistic is sorted the same way our operator sorts, or
 608          * the reverse way if isgt is TRUE.
 609          */
 610         if (HeapTupleIsValid(vardata->statsTuple) &&
 611                 get_attstatsslot(vardata->statsTuple,
 612                                                  vardata->atttype, vardata->atttypmod,
 613                                                  STATISTIC_KIND_HISTOGRAM, InvalidOid,
 614                                                  &values, &nvalues,
 615                                                  NULL, NULL))
 616         {
 617                 if (nvalues > 1)
 618                 {
 619                         /*
 620                          * Use binary search to find proper location, ie, the first slot
 621                          * at which the comparison fails.  (If the given operator isn't
 622                          * actually sort-compatible with the histogram, you'll get garbage
 623                          * results ... but probably not any more garbage-y than you would
 624                          * from the old linear search.)
 625                          */
 626                         double          histfrac;
 627                         int                     lobound = 0;    /* first possible slot to search */
 628                         int                     hibound = nvalues;              /* last+1 slot to search */
 629
 630                         while (lobound < hibound)
 631                         {
 632                                 int                     probe = (lobound + hibound) / 2;
 633                                 bool            ltcmp;
 634
 635                                 ltcmp = DatumGetBool(FunctionCall2(opproc,
 636                                                                                                    values[probe],
 637                                                                                                    constval));
 638                                 if (isgt)
 639                                         ltcmp = !ltcmp;
 640                                 if (ltcmp)
 641                                         lobound = probe + 1;
 642                                 else
 643                                         hibound = probe;
 644                         }
 645
 646                         if (lobound <= 0)
 647                         {
 648                                 /* Constant is below lower histogram boundary. */
 649                                 histfrac = 0.0;
 650                         }
 651                         else if (lobound >= nvalues)
 652                         {
 653                                 /* Constant is above upper histogram boundary. */
 654                                 histfrac = 1.0;
 655                         }
 656                         else
 657                         {
 658                                 int                     i = lobound;
 659                                 double          val,
 660                                                         high,
 661                                                         low;
 662                                 double          binfrac;
 663
 664                                 /*
 665                                  * We have values[i-1] < constant < values[i].
 666                                  *
 667                                  * Convert the constant and the two nearest bin boundary
 668                                  * values to a uniform comparison scale, and do a linear
 669                                  * interpolation within this bin.
 670                                  */
 671                                 if (convert_to_scalar(constval, consttype, &val,
 672                                                                           values[i - 1], values[i],
 673                                                                           vardata->vartype,
 674                                                                           &low, &high))
 675                                 {
 676                                         if (high <= low)
 677                                         {
 678                                                 /* cope if bin boundaries appear identical */
 679                                                 binfrac = 0.5;
 680                                         }
 681                                         else if (val <= low)
 682                                                 binfrac = 0.0;
 683                                         else if (val >= high)
 684                                                 binfrac = 1.0;
 685                                         else
 686                                         {
 687                                                 binfrac = (val - low) / (high - low);
 688
 689                                                 /*
 690                                                  * Watch out for the possibility that we got a NaN or
 691                                                  * Infinity from the division.  This can happen
 692                                                  * despite the previous checks, if for example "low"
 693                                                  * is -Infinity.
 694                                                  */
 695                                                 if (isnan(binfrac) ||
 696                                                         binfrac < 0.0 || binfrac > 1.0)
 697                                                         binfrac = 0.5;
 698                                         }
 699                                 }
 700                                 else
 701                                 {
 702                                         /*
 703                                          * Ideally we'd produce an error here, on the grounds that
 704                                          * the given operator shouldn't have scalarXXsel
 705                                          * registered as its selectivity func unless we can deal
 706                                          * with its operand types.      But currently, all manner of
 707                                          * stuff is invoking scalarXXsel, so give a default
 708                                          * estimate until that can be fixed.
 709                                          */
 710                                         binfrac = 0.5;
 711                                 }
 712
 713                                 /*
 714                                  * Now, compute the overall selectivity across the values
 715                                  * represented by the histogram.  We have i-1 full bins and
 716                                  * binfrac partial bin below the constant.
 717                                  */
 718                                 histfrac = (double) (i - 1) + binfrac;
 719                                 histfrac /= (double) (nvalues - 1);
 720                         }
 721
 722                         /*
 723                          * Now histfrac = fraction of histogram entries below the
 724                          * constant.
 725                          *
 726                          * Account for "<" vs ">"
 727                          */
 728                         hist_selec = isgt ? (1.0 - histfrac) : histfrac;
 729
 730                         /*
 731                          * The histogram boundaries are only approximate to begin with,
 732                          * and may well be out of date anyway.  Therefore, don't believe
 733                          * extremely small or large selectivity estimates.
 734                          */
 735                         if (hist_selec < 0.0001)
 736                                 hist_selec = 0.0001;
 737                         else if (hist_selec > 0.9999)
 738                                 hist_selec = 0.9999;
 739                 }
 740
 741                 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
 742         }
 743
 744         return hist_selec;
 745 }
 746
 747 /*
 748  *              scalarltsel             - Selectivity of "<" (also "<=") for scalars.
 749  */
 750 Datum
 751 scalarltsel(PG_FUNCTION_ARGS)
 752 {
 753         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 754         Oid                     operator = PG_GETARG_OID(1);
 755         List       *args = (List *) PG_GETARG_POINTER(2);
 756         int                     varRelid = PG_GETARG_INT32(3);
 757         VariableStatData vardata;
 758         Node       *other;
 759         bool            varonleft;
 760         Datum           constval;
 761         Oid                     consttype;
 762         bool            isgt;
 763         double          selec;
 764
 765         /*
 766          * If expression is not variable op something or something op variable,
 767          * then punt and return a default estimate.
 768          */
 769         if (!get_restriction_variable(root, args, varRelid,
 770                                                                   &vardata, &other, &varonleft))
 771                 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 772
 773         /*
 774          * Can't do anything useful if the something is not a constant, either.
 775          */
 776         if (!IsA(other, Const))
 777         {
 778                 ReleaseVariableStats(vardata);
 779                 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 780         }
 781
 782         /*
 783          * If the constant is NULL, assume operator is strict and return zero, ie,
 784          * operator will never return TRUE.
 785          */
 786         if (((Const *) other)->constisnull)
 787         {
 788                 ReleaseVariableStats(vardata);
 789                 PG_RETURN_FLOAT8(0.0);
 790         }
 791         constval = ((Const *) other)->constvalue;
 792         consttype = ((Const *) other)->consttype;
 793
 794         /*
 795          * Force the var to be on the left to simplify logic in scalarineqsel.
 796          */
 797         if (varonleft)
 798         {
 799                 /* we have var < other */
 800                 isgt = false;
 801         }
 802         else
 803         {
 804                 /* we have other < var, commute to make var > other */
 805                 operator = get_commutator(operator);
 806                 if (!operator)
 807                 {
 808                         /* Use default selectivity (should we raise an error instead?) */
 809                         ReleaseVariableStats(vardata);
 810                         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 811                 }
 812                 isgt = true;
 813         }
 814
 815         selec = scalarineqsel(root, operator, isgt, &vardata, constval, consttype);
 816
 817         ReleaseVariableStats(vardata);
 818
 819         PG_RETURN_FLOAT8((float8) selec);
 820 }
 821
 822 /*
 823  *              scalargtsel             - Selectivity of ">" (also ">=") for integers.
 824  */
 825 Datum
 826 scalargtsel(PG_FUNCTION_ARGS)
 827 {
 828         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 829         Oid                     operator = PG_GETARG_OID(1);
 830         List       *args = (List *) PG_GETARG_POINTER(2);
 831         int                     varRelid = PG_GETARG_INT32(3);
 832         VariableStatData vardata;
 833         Node       *other;
 834         bool            varonleft;
 835         Datum           constval;
 836         Oid                     consttype;
 837         bool            isgt;
 838         double          selec;
 839
 840         /*
 841          * If expression is not variable op something or something op variable,
 842          * then punt and return a default estimate.
 843          */
 844         if (!get_restriction_variable(root, args, varRelid,
 845                                                                   &vardata, &other, &varonleft))
 846                 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 847
 848         /*
 849          * Can't do anything useful if the something is not a constant, either.
 850          */
 851         if (!IsA(other, Const))
 852         {
 853                 ReleaseVariableStats(vardata);
 854                 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 855         }
 856
 857         /*
 858          * If the constant is NULL, assume operator is strict and return zero, ie,
 859          * operator will never return TRUE.
 860          */
 861         if (((Const *) other)->constisnull)
 862         {
 863                 ReleaseVariableStats(vardata);
 864                 PG_RETURN_FLOAT8(0.0);
 865         }
 866         constval = ((Const *) other)->constvalue;
 867         consttype = ((Const *) other)->consttype;
 868
 869         /*
 870          * Force the var to be on the left to simplify logic in scalarineqsel.
 871          */
 872         if (varonleft)
 873         {
 874                 /* we have var > other */
 875                 isgt = true;
 876         }
 877         else
 878         {
 879                 /* we have other > var, commute to make var < other */
 880                 operator = get_commutator(operator);
 881                 if (!operator)
 882                 {
 883                         /* Use default selectivity (should we raise an error instead?) */
 884                         ReleaseVariableStats(vardata);
 885                         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 886                 }
 887                 isgt = false;
 888         }
 889
 890         selec = scalarineqsel(root, operator, isgt, &vardata, constval, consttype);
 891
 892         ReleaseVariableStats(vardata);
 893
 894         PG_RETURN_FLOAT8((float8) selec);
 895 }
 896
 897 /*
 898  * patternsel                   - Generic code for pattern-match selectivity.
 899  */
 900 static double
 901 patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
 902 {
 903         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 904         Oid                     operator = PG_GETARG_OID(1);
 905         List       *args = (List *) PG_GETARG_POINTER(2);
 906         int                     varRelid = PG_GETARG_INT32(3);
 907         VariableStatData vardata;
 908         Node       *variable;
 909         Node       *other;
 910         bool            varonleft;
 911         Datum           constval;
 912         Oid                     consttype;
 913         Oid                     vartype;
 914         Oid                     opfamily;
 915         Pattern_Prefix_Status pstatus;
 916         Const      *patt = NULL;
 917         Const      *prefix = NULL;
 918         Const      *rest = NULL;
 919         double          result;
 920
 921         /*
 922          * If expression is not variable op constant, then punt and return a
 923          * default estimate.
 924          */
 925         if (!get_restriction_variable(root, args, varRelid,
 926                                                                   &vardata, &other, &varonleft))
 927                 return DEFAULT_MATCH_SEL;
 928         if (!varonleft || !IsA(other, Const))
 929         {
 930                 ReleaseVariableStats(vardata);
 931                 return DEFAULT_MATCH_SEL;
 932         }
 933         variable = (Node *) linitial(args);
 934
 935         /*
 936          * If the constant is NULL, assume operator is strict and return zero, ie,
 937          * operator will never return TRUE.
 938          */
 939         if (((Const *) other)->constisnull)
 940         {
 941                 ReleaseVariableStats(vardata);
 942                 return 0.0;
 943         }
 944         constval = ((Const *) other)->constvalue;
 945         consttype = ((Const *) other)->consttype;
 946
 947         /*
 948          * The right-hand const is type text or bytea for all supported operators.
 949          * We do not expect to see binary-compatible types here, since
 950          * const-folding should have relabeled the const to exactly match the
 951          * operator's declared type.
 952          */
 953         if (consttype != TEXTOID && consttype != BYTEAOID)
 954         {
 955                 ReleaseVariableStats(vardata);
 956                 return DEFAULT_MATCH_SEL;
 957         }
 958
 959         /*
 960          * Similarly, the exposed type of the left-hand side should be one of
 961          * those we know.  (Do not look at vardata.atttype, which might be
 962          * something binary-compatible but different.)  We can use it to choose
 963          * the index opfamily from which we must draw the comparison operators.
 964          *
 965          * NOTE: It would be more correct to use the PATTERN opfamilies than the
 966          * simple ones, but at the moment ANALYZE will not generate statistics for
 967          * the PATTERN operators.  But our results are so approximate anyway that
 968          * it probably hardly matters.
 969          */
 970         vartype = vardata.vartype;
 971
 972         switch (vartype)
 973         {
 974                 case TEXTOID:
 975                         opfamily = TEXT_BTREE_FAM_OID;
 976                         break;
 977                 case BPCHAROID:
 978                         opfamily = BPCHAR_BTREE_FAM_OID;
 979                         break;
 980                 case NAMEOID:
 981                         opfamily = NAME_BTREE_FAM_OID;
 982                         break;
 983                 case BYTEAOID:
 984                         opfamily = BYTEA_BTREE_FAM_OID;
 985                         break;
 986                 default:
 987                         ReleaseVariableStats(vardata);
 988                         return DEFAULT_MATCH_SEL;
 989         }
 990
 991         /* divide pattern into fixed prefix and remainder */
 992         patt = (Const *) other;
 993         pstatus = pattern_fixed_prefix(patt, ptype, &prefix, &rest);
 994
 995         /*
 996          * If necessary, coerce the prefix constant to the right type. (The "rest"
 997          * constant need not be changed.)
 998          */
 999         if (prefix && prefix->consttype != vartype)
1000         {
1001                 char       *prefixstr;
1002
1003                 switch (prefix->consttype)
1004                 {
1005                         case TEXTOID:
1006                                 prefixstr = DatumGetCString(DirectFunctionCall1(textout,
1007                                                                                                                 prefix->constvalue));
1008                                 break;
1009                         case BYTEAOID:
1010                                 prefixstr = DatumGetCString(DirectFunctionCall1(byteaout,
1011                                                                                                                 prefix->constvalue));
1012                                 break;
1013                         default:
1014                                 elog(ERROR, "unrecognized consttype: %u",
1015                                          prefix->consttype);
1016                                 ReleaseVariableStats(vardata);
1017                                 return DEFAULT_MATCH_SEL;
1018                 }
1019                 prefix = string_to_const(prefixstr, vartype);
1020                 pfree(prefixstr);
1021         }
1022
1023         if (pstatus == Pattern_Prefix_Exact)
1024         {
1025                 /*
1026                  * Pattern specifies an exact match, so pretend operator is '='
1027                  */
1028                 Oid                     eqopr = get_opfamily_member(opfamily, vartype, vartype,
1029                                                                                                 BTEqualStrategyNumber);
1030                 List       *eqargs;
1031
1032                 if (eqopr == InvalidOid)
1033                         elog(ERROR, "no = operator for opfamily %u", opfamily);
1034                 eqargs = list_make2(variable, prefix);
1035                 result = DatumGetFloat8(DirectFunctionCall4(eqsel,
1036                                                                                                         PointerGetDatum(root),
1037                                                                                                         ObjectIdGetDatum(eqopr),
1038                                                                                                         PointerGetDatum(eqargs),
1039                                                                                                         Int32GetDatum(varRelid)));
1040         }
1041         else
1042         {
1043                 /*
1044                  * Not exact-match pattern.  If we have a sufficiently large
1045                  * histogram, estimate selectivity for the histogram part of the
1046                  * population by counting matches in the histogram.  If not, estimate
1047                  * selectivity of the fixed prefix and remainder of pattern
1048                  * separately, then combine the two to get an estimate of the
1049                  * selectivity for the part of the column population represented by
1050                  * the histogram.  We then add up data for any most-common-values
1051                  * values; these are not in the histogram population, and we can get
1052                  * exact answers for them by applying the pattern operator, so there's
1053                  * no reason to approximate.  (If the MCVs cover a significant part of
1054                  * the total population, this gives us a big leg up in accuracy.)
1055                  */
1056                 Selectivity selec;
1057                 FmgrInfo        opproc;
1058                 double          nullfrac,
1059                                         mcv_selec,
1060                                         sumcommon;
1061
1062                 /* Try to use the histogram entries to get selectivity */
1063                 fmgr_info(get_opcode(operator), &opproc);
1064
1065                 selec = histogram_selectivity(&vardata, &opproc, constval, true,
1066                                                                           100, 1);
1067                 if (selec < 0)
1068                 {
1069                         /* Nope, so fake it with the heuristic method */
1070                         Selectivity prefixsel;
1071                         Selectivity restsel;
1072
1073                         if (pstatus == Pattern_Prefix_Partial)
1074                                 prefixsel = prefix_selectivity(&vardata, vartype,
1075                                                                                            opfamily, prefix);
1076                         else
1077                                 prefixsel = 1.0;
1078                         restsel = pattern_selectivity(rest, ptype);
1079                         selec = prefixsel * restsel;
1080                 }
1081                 else
1082                 {
1083                         /* Yes, but don't believe extremely small or large estimates. */
1084                         if (selec < 0.0001)
1085                                 selec = 0.0001;
1086                         else if (selec > 0.9999)
1087                                 selec = 0.9999;
1088                 }
1089
1090                 /*
1091                  * If we have most-common-values info, add up the fractions of the MCV
1092                  * entries that satisfy MCV OP PATTERN.  These fractions contribute
1093                  * directly to the result selectivity.  Also add up the total fraction
1094                  * represented by MCV entries.
1095                  */
1096                 mcv_selec = mcv_selectivity(&vardata, &opproc, constval, true,
1097                                                                         &sumcommon);
1098
1099                 if (HeapTupleIsValid(vardata.statsTuple))
1100                         nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac;
1101                 else
1102                         nullfrac = 0.0;
1103
1104                 /*
1105                  * Now merge the results from the MCV and histogram calculations,
1106                  * realizing that the histogram covers only the non-null values that
1107                  * are not listed in MCV.
1108                  */
1109                 selec *= 1.0 - nullfrac - sumcommon;
1110                 selec += mcv_selec;
1111
1112                 /* result should be in range, but make sure... */
1113                 CLAMP_PROBABILITY(selec);
1114                 result = selec;
1115         }
1116
1117         if (prefix)
1118         {
1119                 pfree(DatumGetPointer(prefix->constvalue));
1120                 pfree(prefix);
1121         }
1122
1123         ReleaseVariableStats(vardata);
1124
1125         return result;
1126 }
1127
1128 /*
1129  *              regexeqsel              - Selectivity of regular-expression pattern match.
1130  */
1131 Datum
1132 regexeqsel(PG_FUNCTION_ARGS)
1133 {
1134         PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex));
1135 }
1136
1137 /*
1138  *              icregexeqsel    - Selectivity of case-insensitive regex match.
1139  */
1140 Datum
1141 icregexeqsel(PG_FUNCTION_ARGS)
1142 {
1143         PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC));
1144 }
1145
1146 /*
1147  *              likesel                 - Selectivity of LIKE pattern match.
1148  */
1149 Datum
1150 likesel(PG_FUNCTION_ARGS)
1151 {
1152         PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like));
1153 }
1154
1155 /*
1156  *              iclikesel                       - Selectivity of ILIKE pattern match.
1157  */
1158 Datum
1159 iclikesel(PG_FUNCTION_ARGS)
1160 {
1161         PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC));
1162 }
1163
1164 /*
1165  *              regexnesel              - Selectivity of regular-expression pattern non-match.
1166  */
1167 Datum
1168 regexnesel(PG_FUNCTION_ARGS)
1169 {
1170         double          result;
1171
1172         result = patternsel(fcinfo, Pattern_Type_Regex);
1173         result = 1.0 - result;
1174         PG_RETURN_FLOAT8(result);
1175 }
1176
1177 /*
1178  *              icregexnesel    - Selectivity of case-insensitive regex non-match.
1179  */
1180 Datum
1181 icregexnesel(PG_FUNCTION_ARGS)
1182 {
1183         double          result;
1184
1185         result = patternsel(fcinfo, Pattern_Type_Regex_IC);
1186         result = 1.0 - result;
1187         PG_RETURN_FLOAT8(result);
1188 }
1189
1190 /*
1191  *              nlikesel                - Selectivity of LIKE pattern non-match.
1192  */
1193 Datum
1194 nlikesel(PG_FUNCTION_ARGS)
1195 {
1196         double          result;
1197
1198         result = patternsel(fcinfo, Pattern_Type_Like);
1199         result = 1.0 - result;
1200         PG_RETURN_FLOAT8(result);
1201 }
1202
1203 /*
1204  *              icnlikesel              - Selectivity of ILIKE pattern non-match.
1205  */
1206 Datum
1207 icnlikesel(PG_FUNCTION_ARGS)
1208 {
1209         double          result;
1210
1211         result = patternsel(fcinfo, Pattern_Type_Like_IC);
1212         result = 1.0 - result;
1213         PG_RETURN_FLOAT8(result);
1214 }
1215
1216 /*
1217  *              booltestsel             - Selectivity of BooleanTest Node.
1218  */
1219 Selectivity
1220 booltestsel(PlannerInfo *root, BoolTestType booltesttype, Node *arg,
1221                         int varRelid, JoinType jointype)
1222 {
1223         VariableStatData vardata;
1224         double          selec;
1225
1226         examine_variable(root, arg, varRelid, &vardata);
1227
1228         if (HeapTupleIsValid(vardata.statsTuple))
1229         {
1230                 Form_pg_statistic stats;
1231                 double          freq_null;
1232                 Datum      *values;
1233                 int                     nvalues;
1234                 float4     *numbers;
1235                 int                     nnumbers;
1236
1237                 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
1238                 freq_null = stats->stanullfrac;
1239
1240                 if (get_attstatsslot(vardata.statsTuple,
1241                                                          vardata.atttype, vardata.atttypmod,
1242                                                          STATISTIC_KIND_MCV, InvalidOid,
1243                                                          &values, &nvalues,
1244                                                          &numbers, &nnumbers)
1245                         && nnumbers > 0)
1246                 {
1247                         double          freq_true;
1248                         double          freq_false;
1249
1250                         /*
1251                          * Get first MCV frequency and derive frequency for true.
1252                          */
1253                         if (DatumGetBool(values[0]))
1254                                 freq_true = numbers[0];
1255                         else
1256                                 freq_true = 1.0 - numbers[0] - freq_null;
1257
1258                         /*
1259                          * Next derive frequency for false. Then use these as appropriate
1260                          * to derive frequency for each case.
1261                          */
1262                         freq_false = 1.0 - freq_true - freq_null;
1263
1264                         switch (booltesttype)
1265                         {
1266                                 case IS_UNKNOWN:
1267                                         /* select only NULL values */
1268                                         selec = freq_null;
1269                                         break;
1270                                 case IS_NOT_UNKNOWN:
1271                                         /* select non-NULL values */
1272                                         selec = 1.0 - freq_null;
1273                                         break;
1274                                 case IS_TRUE:
1275                                         /* select only TRUE values */
1276                                         selec = freq_true;
1277                                         break;
1278                                 case IS_NOT_TRUE:
1279                                         /* select non-TRUE values */
1280                                         selec = 1.0 - freq_true;
1281                                         break;
1282                                 case IS_FALSE:
1283                                         /* select only FALSE values */
1284                                         selec = freq_false;
1285                                         break;
1286                                 case IS_NOT_FALSE:
1287                                         /* select non-FALSE values */
1288                                         selec = 1.0 - freq_false;
1289                                         break;
1290                                 default:
1291                                         elog(ERROR, "unrecognized booltesttype: %d",
1292                                                  (int) booltesttype);
1293                                         selec = 0.0;    /* Keep compiler quiet */
1294                                         break;
1295                         }
1296
1297                         free_attstatsslot(vardata.atttype, values, nvalues,
1298                                                           numbers, nnumbers);
1299                 }
1300                 else
1301                 {
1302                         /*
1303                          * No most-common-value info available. Still have null fraction
1304                          * information, so use it for IS [NOT] UNKNOWN. Otherwise adjust
1305                          * for null fraction and assume an even split for boolean tests.
1306                          */
1307                         switch (booltesttype)
1308                         {
1309                                 case IS_UNKNOWN:
1310
1311                                         /*
1312                                          * Use freq_null directly.
1313                                          */
1314                                         selec = freq_null;
1315                                         break;
1316                                 case IS_NOT_UNKNOWN:
1317
1318                                         /*
1319                                          * Select not unknown (not null) values. Calculate from
1320                                          * freq_null.
1321                                          */
1322                                         selec = 1.0 - freq_null;
1323                                         break;
1324                                 case IS_TRUE:
1325                                 case IS_NOT_TRUE:
1326                                 case IS_FALSE:
1327                                 case IS_NOT_FALSE:
1328                                         selec = (1.0 - freq_null) / 2.0;
1329                                         break;
1330                                 default:
1331                                         elog(ERROR, "unrecognized booltesttype: %d",
1332                                                  (int) booltesttype);
1333                                         selec = 0.0;    /* Keep compiler quiet */
1334                                         break;
1335                         }
1336                 }
1337         }
1338         else
1339         {
1340                 /*
1341                  * If we can't get variable statistics for the argument, perhaps
1342                  * clause_selectivity can do something with it.  We ignore the
1343                  * possibility of a NULL value when using clause_selectivity, and just
1344                  * assume the value is either TRUE or FALSE.
1345                  */
1346                 switch (booltesttype)
1347                 {
1348                         case IS_UNKNOWN:
1349                                 selec = DEFAULT_UNK_SEL;
1350                                 break;
1351                         case IS_NOT_UNKNOWN:
1352                                 selec = DEFAULT_NOT_UNK_SEL;
1353                                 break;
1354                         case IS_TRUE:
1355                         case IS_NOT_FALSE:
1356                                 selec = (double) clause_selectivity(root, arg,
1357                                                                                                         varRelid, jointype);
1358                                 break;
1359                         case IS_FALSE:
1360                         case IS_NOT_TRUE:
1361                                 selec = 1.0 - (double) clause_selectivity(root, arg,
1362                                                                                                                   varRelid, jointype);
1363                                 break;
1364                         default:
1365                                 elog(ERROR, "unrecognized booltesttype: %d",
1366                                          (int) booltesttype);
1367                                 selec = 0.0;    /* Keep compiler quiet */
1368                                 break;
1369                 }
1370         }
1371
1372         ReleaseVariableStats(vardata);
1373
1374         /* result should be in range, but make sure... */
1375         CLAMP_PROBABILITY(selec);
1376
1377         return (Selectivity) selec;
1378 }
1379
1380 /*
1381  *              nulltestsel             - Selectivity of NullTest Node.
1382  */
1383 Selectivity
1384 nulltestsel(PlannerInfo *root, NullTestType nulltesttype,
1385                         Node *arg, int varRelid)
1386 {
1387         VariableStatData vardata;
1388         double          selec;
1389
1390         examine_variable(root, arg, varRelid, &vardata);
1391
1392         if (HeapTupleIsValid(vardata.statsTuple))
1393         {
1394                 Form_pg_statistic stats;
1395                 double          freq_null;
1396
1397                 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
1398                 freq_null = stats->stanullfrac;
1399
1400                 switch (nulltesttype)
1401                 {
1402                         case IS_NULL:
1403
1404                                 /*
1405                                  * Use freq_null directly.
1406                                  */
1407                                 selec = freq_null;
1408                                 break;
1409                         case IS_NOT_NULL:
1410
1411                                 /*
1412                                  * Select not unknown (not null) values. Calculate from
1413                                  * freq_null.
1414                                  */
1415                                 selec = 1.0 - freq_null;
1416                                 break;
1417                         default:
1418                                 elog(ERROR, "unrecognized nulltesttype: %d",
1419                                          (int) nulltesttype);
1420                                 return (Selectivity) 0; /* keep compiler quiet */
1421                 }
1422         }
1423         else
1424         {
1425                 /*
1426                  * No ANALYZE stats available, so make a guess
1427                  */
1428                 switch (nulltesttype)
1429                 {
1430                         case IS_NULL:
1431                                 selec = DEFAULT_UNK_SEL;
1432                                 break;
1433                         case IS_NOT_NULL:
1434                                 selec = DEFAULT_NOT_UNK_SEL;
1435                                 break;
1436                         default:
1437                                 elog(ERROR, "unrecognized nulltesttype: %d",
1438                                          (int) nulltesttype);
1439                                 return (Selectivity) 0; /* keep compiler quiet */
1440                 }
1441         }
1442
1443         ReleaseVariableStats(vardata);
1444
1445         /* result should be in range, but make sure... */
1446         CLAMP_PROBABILITY(selec);
1447
1448         return (Selectivity) selec;
1449 }
1450
1451 /*
1452  *              scalararraysel          - Selectivity of ScalarArrayOpExpr Node.
1453  */
1454 Selectivity
1455 scalararraysel(PlannerInfo *root,
1456                            ScalarArrayOpExpr *clause,
1457                            bool is_join_clause,
1458                            int varRelid, JoinType jointype)
1459 {
1460         Oid                     operator = clause->opno;
1461         bool            useOr = clause->useOr;
1462         Node       *leftop;
1463         Node       *rightop;
1464         RegProcedure oprsel;
1465         FmgrInfo        oprselproc;
1466         Datum           selarg4;
1467         Selectivity s1;
1468
1469         /*
1470          * First, look up the underlying operator's selectivity estimator. Punt if
1471          * it hasn't got one.
1472          */
1473         if (is_join_clause)
1474         {
1475                 oprsel = get_oprjoin(operator);
1476                 selarg4 = Int16GetDatum(jointype);
1477         }
1478         else
1479         {
1480                 oprsel = get_oprrest(operator);
1481                 selarg4 = Int32GetDatum(varRelid);
1482         }
1483         if (!oprsel)
1484                 return (Selectivity) 0.5;
1485         fmgr_info(oprsel, &oprselproc);
1486
1487         /*
1488          * We consider three cases:
1489          *
1490          * 1. rightop is an Array constant: deconstruct the array, apply the
1491          * operator's selectivity function for each array element, and merge the
1492          * results in the same way that clausesel.c does for AND/OR combinations.
1493          *
1494          * 2. rightop is an ARRAY[] construct: apply the operator's selectivity
1495          * function for each element of the ARRAY[] construct, and merge.
1496          *
1497          * 3. otherwise, make a guess ...
1498          */
1499         Assert(list_length(clause->args) == 2);
1500         leftop = (Node *) linitial(clause->args);
1501         rightop = (Node *) lsecond(clause->args);
1502
1503         if (rightop && IsA(rightop, Const))
1504         {
1505                 Datum           arraydatum = ((Const *) rightop)->constvalue;
1506                 bool            arrayisnull = ((Const *) rightop)->constisnull;
1507                 ArrayType  *arrayval;
1508                 int16           elmlen;
1509                 bool            elmbyval;
1510                 char            elmalign;
1511                 int                     num_elems;
1512                 Datum      *elem_values;
1513                 bool       *elem_nulls;
1514                 int                     i;
1515
1516                 if (arrayisnull)                /* qual can't succeed if null array */
1517                         return (Selectivity) 0.0;
1518                 arrayval = DatumGetArrayTypeP(arraydatum);
1519                 get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
1520                                                          &elmlen, &elmbyval, &elmalign);
1521                 deconstruct_array(arrayval,
1522                                                   ARR_ELEMTYPE(arrayval),
1523                                                   elmlen, elmbyval, elmalign,
1524                                                   &elem_values, &elem_nulls, &num_elems);
1525                 s1 = useOr ? 0.0 : 1.0;
1526                 for (i = 0; i < num_elems; i++)
1527                 {
1528                         List       *args;
1529                         Selectivity s2;
1530
1531                         args = list_make2(leftop,
1532                                                           makeConst(ARR_ELEMTYPE(arrayval),
1533                                                                                 elmlen,
1534                                                                                 elem_values[i],
1535                                                                                 elem_nulls[i],
1536                                                                                 elmbyval));
1537                         s2 = DatumGetFloat8(FunctionCall4(&oprselproc,
1538                                                                                           PointerGetDatum(root),
1539                                                                                           ObjectIdGetDatum(operator),
1540                                                                                           PointerGetDatum(args),
1541                                                                                           selarg4));
1542                         if (useOr)
1543                                 s1 = s1 + s2 - s1 * s2;
1544                         else
1545                                 s1 = s1 * s2;
1546                 }
1547         }
1548         else if (rightop && IsA(rightop, ArrayExpr) &&
1549                          !((ArrayExpr *) rightop)->multidims)
1550         {
1551                 ArrayExpr  *arrayexpr = (ArrayExpr *) rightop;
1552                 int16           elmlen;
1553                 bool            elmbyval;
1554                 ListCell   *l;
1555
1556                 get_typlenbyval(arrayexpr->element_typeid,
1557                                                 &elmlen, &elmbyval);
1558                 s1 = useOr ? 0.0 : 1.0;
1559                 foreach(l, arrayexpr->elements)
1560                 {
1561                         List       *args;
1562                         Selectivity s2;
1563
1564                         args = list_make2(leftop, lfirst(l));
1565                         s2 = DatumGetFloat8(FunctionCall4(&oprselproc,
1566                                                                                           PointerGetDatum(root),
1567                                                                                           ObjectIdGetDatum(operator),
1568                                                                                           PointerGetDatum(args),
1569                                                                                           selarg4));
1570                         if (useOr)
1571                                 s1 = s1 + s2 - s1 * s2;
1572                         else
1573                                 s1 = s1 * s2;
1574                 }
1575         }
1576         else
1577         {
1578                 CaseTestExpr *dummyexpr;
1579                 List       *args;
1580                 Selectivity s2;
1581                 int                     i;
1582
1583                 /*
1584                  * We need a dummy rightop to pass to the operator selectivity
1585                  * routine.  It can be pretty much anything that doesn't look like a
1586                  * constant; CaseTestExpr is a convenient choice.
1587                  */
1588                 dummyexpr = makeNode(CaseTestExpr);
1589                 dummyexpr->typeId = get_element_type(exprType(rightop));
1590                 dummyexpr->typeMod = -1;
1591                 args = list_make2(leftop, dummyexpr);
1592                 s2 = DatumGetFloat8(FunctionCall4(&oprselproc,
1593                                                                                   PointerGetDatum(root),
1594                                                                                   ObjectIdGetDatum(operator),
1595                                                                                   PointerGetDatum(args),
1596                                                                                   selarg4));
1597                 s1 = useOr ? 0.0 : 1.0;
1598
1599                 /*
1600                  * Arbitrarily assume 10 elements in the eventual array value (see
1601                  * also estimate_array_length)
1602                  */
1603                 for (i = 0; i < 10; i++)
1604                 {
1605                         if (useOr)
1606                                 s1 = s1 + s2 - s1 * s2;
1607                         else
1608                                 s1 = s1 * s2;
1609                 }
1610         }
1611
1612         /* result should be in range, but make sure... */
1613         CLAMP_PROBABILITY(s1);
1614
1615         return s1;
1616 }
1617
1618 /*
1619  * Estimate number of elements in the array yielded by an expression.
1620  *
1621  * It's important that this agree with scalararraysel.
1622  */
1623 int
1624 estimate_array_length(Node *arrayexpr)
1625 {
1626         if (arrayexpr && IsA(arrayexpr, Const))
1627         {
1628                 Datum           arraydatum = ((Const *) arrayexpr)->constvalue;
1629                 bool            arrayisnull = ((Const *) arrayexpr)->constisnull;
1630                 ArrayType  *arrayval;
1631
1632                 if (arrayisnull)
1633                         return 0;
1634                 arrayval = DatumGetArrayTypeP(arraydatum);
1635                 return ArrayGetNItems(ARR_NDIM(arrayval), ARR_DIMS(arrayval));
1636         }
1637         else if (arrayexpr && IsA(arrayexpr, ArrayExpr) &&
1638                          !((ArrayExpr *) arrayexpr)->multidims)
1639         {
1640                 return list_length(((ArrayExpr *) arrayexpr)->elements);
1641         }
1642         else
1643         {
1644                 /* default guess --- see also scalararraysel */
1645                 return 10;
1646         }
1647 }
1648
1649 /*
1650  *              rowcomparesel           - Selectivity of RowCompareExpr Node.
1651  *
1652  * We estimate RowCompare selectivity by considering just the first (high
1653  * order) columns, which makes it equivalent to an ordinary OpExpr.  While
1654  * this estimate could be refined by considering additional columns, it
1655  * seems unlikely that we could do a lot better without multi-column
1656  * statistics.
1657  */
1658 Selectivity
1659 rowcomparesel(PlannerInfo *root,
1660                           RowCompareExpr *clause,
1661                           int varRelid, JoinType jointype)
1662 {
1663         Selectivity s1;
1664         Oid                     opno = linitial_oid(clause->opnos);
1665         List       *opargs;
1666         bool            is_join_clause;
1667
1668         /* Build equivalent arg list for single operator */
1669         opargs = list_make2(linitial(clause->largs), linitial(clause->rargs));
1670
1671         /* Decide if it's a join clause, same as for OpExpr */
1672         if (varRelid != 0)
1673         {
1674                 /*
1675                  * If we are considering a nestloop join then all clauses are
1676                  * restriction clauses, since we are only interested in the one
1677                  * relation.
1678                  */
1679                 is_join_clause = false;
1680         }
1681         else
1682         {
1683                 /*
1684                  * Otherwise, it's a join if there's more than one relation used.
1685                  * Notice we ignore the low-order columns here.
1686                  */
1687                 is_join_clause = (NumRelids((Node *) opargs) > 1);
1688         }
1689
1690         if (is_join_clause)
1691         {
1692                 /* Estimate selectivity for a join clause. */
1693                 s1 = join_selectivity(root, opno,
1694                                                           opargs,
1695                                                           jointype);
1696         }
1697         else
1698         {
1699                 /* Estimate selectivity for a restriction clause. */
1700                 s1 = restriction_selectivity(root, opno,
1701                                                                          opargs,
1702                                                                          varRelid);
1703         }
1704
1705         return s1;
1706 }
1707
1708 /*
1709  *              eqjoinsel               - Join selectivity of "="
1710  */
1711 Datum
1712 eqjoinsel(PG_FUNCTION_ARGS)
1713 {
1714         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
1715         Oid                     operator = PG_GETARG_OID(1);
1716         List       *args = (List *) PG_GETARG_POINTER(2);
1717         JoinType        jointype = (JoinType) PG_GETARG_INT16(3);
1718         double          selec;
1719         VariableStatData vardata1;
1720         VariableStatData vardata2;
1721         double          nd1;
1722         double          nd2;
1723         Form_pg_statistic stats1 = NULL;
1724         Form_pg_statistic stats2 = NULL;
1725         bool            have_mcvs1 = false;
1726         Datum      *values1 = NULL;
1727         int                     nvalues1 = 0;
1728         float4     *numbers1 = NULL;
1729         int                     nnumbers1 = 0;
1730         bool            have_mcvs2 = false;
1731         Datum      *values2 = NULL;
1732         int                     nvalues2 = 0;
1733         float4     *numbers2 = NULL;
1734         int                     nnumbers2 = 0;
1735
1736         get_join_variables(root, args, &vardata1, &vardata2);
1737
1738         nd1 = get_variable_numdistinct(&vardata1);
1739         nd2 = get_variable_numdistinct(&vardata2);
1740
1741         if (HeapTupleIsValid(vardata1.statsTuple))
1742         {
1743                 stats1 = (Form_pg_statistic) GETSTRUCT(vardata1.statsTuple);
1744                 have_mcvs1 = get_attstatsslot(vardata1.statsTuple,
1745                                                                           vardata1.atttype,
1746                                                                           vardata1.atttypmod,
1747                                                                           STATISTIC_KIND_MCV,
1748                                                                           InvalidOid,
1749                                                                           &values1, &nvalues1,
1750                                                                           &numbers1, &nnumbers1);
1751         }
1752
1753         if (HeapTupleIsValid(vardata2.statsTuple))
1754         {
1755                 stats2 = (Form_pg_statistic) GETSTRUCT(vardata2.statsTuple);
1756                 have_mcvs2 = get_attstatsslot(vardata2.statsTuple,
1757                                                                           vardata2.atttype,
1758                                                                           vardata2.atttypmod,
1759                                                                           STATISTIC_KIND_MCV,
1760                                                                           InvalidOid,
1761                                                                           &values2, &nvalues2,
1762                                                                           &numbers2, &nnumbers2);
1763         }
1764
1765         if (have_mcvs1 && have_mcvs2)
1766         {
1767                 /*
1768                  * We have most-common-value lists for both relations.  Run through
1769                  * the lists to see which MCVs actually join to each other with the
1770                  * given operator.      This allows us to determine the exact join
1771                  * selectivity for the portion of the relations represented by the MCV
1772                  * lists.  We still have to estimate for the remaining population, but
1773                  * in a skewed distribution this gives us a big leg up in accuracy.
1774                  * For motivation see the analysis in Y. Ioannidis and S.
1775                  * Christodoulakis, "On the propagation of errors in the size of join
1776                  * results", Technical Report 1018, Computer Science Dept., University
1777                  * of Wisconsin, Madison, March 1991 (available from ftp.cs.wisc.edu).
1778                  */
1779                 FmgrInfo        eqproc;
1780                 bool       *hasmatch1;
1781                 bool       *hasmatch2;
1782                 double          nullfrac1 = stats1->stanullfrac;
1783                 double          nullfrac2 = stats2->stanullfrac;
1784                 double          matchprodfreq,
1785                                         matchfreq1,
1786                                         matchfreq2,
1787                                         unmatchfreq1,
1788                                         unmatchfreq2,
1789                                         otherfreq1,
1790                                         otherfreq2,
1791                                         totalsel1,
1792                                         totalsel2;
1793                 int                     i,
1794                                         nmatches;
1795
1796                 fmgr_info(get_opcode(operator), &eqproc);
1797                 hasmatch1 = (bool *) palloc0(nvalues1 * sizeof(bool));
1798                 hasmatch2 = (bool *) palloc0(nvalues2 * sizeof(bool));
1799
1800                 /*
1801                  * If we are doing any variant of JOIN_IN, pretend all the values of
1802                  * the righthand relation are unique (ie, act as if it's been
1803                  * DISTINCT'd).
1804                  *
1805                  * NOTE: it might seem that we should unique-ify the lefthand input
1806                  * when considering JOIN_REVERSE_IN.  But this is not so, because the
1807                  * join clause we've been handed has not been commuted from the way
1808                  * the parser originally wrote it.      We know that the unique side of
1809                  * the IN clause is *always* on the right.
1810                  *
1811                  * NOTE: it would be dangerous to try to be smart about JOIN_LEFT or
1812                  * JOIN_RIGHT here, because we do not have enough information to
1813                  * determine which var is really on which side of the join. Perhaps
1814                  * someday we should pass in more information.
1815                  */
1816                 if (jointype == JOIN_IN ||
1817                         jointype == JOIN_REVERSE_IN ||
1818                         jointype == JOIN_UNIQUE_INNER ||
1819                         jointype == JOIN_UNIQUE_OUTER)
1820                 {
1821                         float4          oneovern = 1.0 / nd2;
1822
1823                         for (i = 0; i < nvalues2; i++)
1824                                 numbers2[i] = oneovern;
1825                         nullfrac2 = oneovern;
1826                 }
1827
1828                 /*
1829                  * Note we assume that each MCV will match at most one member of the
1830                  * other MCV list.      If the operator isn't really equality, there could
1831                  * be multiple matches --- but we don't look for them, both for speed
1832                  * and because the math wouldn't add up...
1833                  */
1834                 matchprodfreq = 0.0;
1835                 nmatches = 0;
1836                 for (i = 0; i < nvalues1; i++)
1837                 {
1838                         int                     j;
1839
1840                         for (j = 0; j < nvalues2; j++)
1841                         {
1842                                 if (hasmatch2[j])
1843                                         continue;
1844                                 if (DatumGetBool(FunctionCall2(&eqproc,
1845                                                                                            values1[i],
1846                                                                                            values2[j])))
1847                                 {
1848                                         hasmatch1[i] = hasmatch2[j] = true;
1849                                         matchprodfreq += numbers1[i] * numbers2[j];
1850                                         nmatches++;
1851                                         break;
1852                                 }
1853                         }
1854                 }
1855                 CLAMP_PROBABILITY(matchprodfreq);
1856                 /* Sum up frequencies of matched and unmatched MCVs */
1857                 matchfreq1 = unmatchfreq1 = 0.0;
1858                 for (i = 0; i < nvalues1; i++)
1859                 {
1860                         if (hasmatch1[i])
1861                                 matchfreq1 += numbers1[i];
1862                         else
1863                                 unmatchfreq1 += numbers1[i];
1864                 }
1865                 CLAMP_PROBABILITY(matchfreq1);
1866                 CLAMP_PROBABILITY(unmatchfreq1);
1867                 matchfreq2 = unmatchfreq2 = 0.0;
1868                 for (i = 0; i < nvalues2; i++)
1869                 {
1870                         if (hasmatch2[i])
1871                                 matchfreq2 += numbers2[i];
1872                         else
1873                                 unmatchfreq2 += numbers2[i];
1874                 }
1875                 CLAMP_PROBABILITY(matchfreq2);
1876                 CLAMP_PROBABILITY(unmatchfreq2);
1877                 pfree(hasmatch1);
1878                 pfree(hasmatch2);
1879
1880                 /*
1881                  * Compute total frequency of non-null values that are not in the MCV
1882                  * lists.
1883                  */
1884                 otherfreq1 = 1.0 - nullfrac1 - matchfreq1 - unmatchfreq1;
1885                 otherfreq2 = 1.0 - nullfrac2 - matchfreq2 - unmatchfreq2;
1886                 CLAMP_PROBABILITY(otherfreq1);
1887                 CLAMP_PROBABILITY(otherfreq2);
1888
1889                 /*
1890                  * We can estimate the total selectivity from the point of view of
1891                  * relation 1 as: the known selectivity for matched MCVs, plus
1892                  * unmatched MCVs that are assumed to match against random members of
1893                  * relation 2's non-MCV population, plus non-MCV values that are
1894                  * assumed to match against random members of relation 2's unmatched
1895                  * MCVs plus non-MCV values.
1896                  */
1897                 totalsel1 = matchprodfreq;
1898                 if (nd2 > nvalues2)
1899                         totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - nvalues2);
1900                 if (nd2 > nmatches)
1901                         totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) /
1902                                 (nd2 - nmatches);
1903                 /* Same estimate from the point of view of relation 2. */
1904                 totalsel2 = matchprodfreq;
1905                 if (nd1 > nvalues1)
1906                         totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - nvalues1);
1907                 if (nd1 > nmatches)
1908                         totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) /
1909                                 (nd1 - nmatches);
1910
1911                 /*
1912                  * Use the smaller of the two estimates.  This can be justified in
1913                  * essentially the same terms as given below for the no-stats case: to
1914                  * a first approximation, we are estimating from the point of view of
1915                  * the relation with smaller nd.
1916                  */
1917                 selec = (totalsel1 < totalsel2) ? totalsel1 : totalsel2;
1918         }
1919         else
1920         {
1921                 /*
1922                  * We do not have MCV lists for both sides.  Estimate the join
1923                  * selectivity as MIN(1/nd1,1/nd2)*(1-nullfrac1)*(1-nullfrac2). This
1924                  * is plausible if we assume that the join operator is strict and the
1925                  * non-null values are about equally distributed: a given non-null
1926                  * tuple of rel1 will join to either zero or N2*(1-nullfrac2)/nd2 rows
1927                  * of rel2, so total join rows are at most
1928                  * N1*(1-nullfrac1)*N2*(1-nullfrac2)/nd2 giving a join selectivity of
1929                  * not more than (1-nullfrac1)*(1-nullfrac2)/nd2. By the same logic it
1930                  * is not more than (1-nullfrac1)*(1-nullfrac2)/nd1, so the expression
1931                  * with MIN() is an upper bound.  Using the MIN() means we estimate
1932                  * from the point of view of the relation with smaller nd (since the
1933                  * larger nd is determining the MIN).  It is reasonable to assume that
1934                  * most tuples in this rel will have join partners, so the bound is
1935                  * probably reasonably tight and should be taken as-is.
1936                  *
1937                  * XXX Can we be smarter if we have an MCV list for just one side? It
1938                  * seems that if we assume equal distribution for the other side, we
1939                  * end up with the same answer anyway.
1940                  */
1941                 double          nullfrac1 = stats1 ? stats1->stanullfrac : 0.0;
1942                 double          nullfrac2 = stats2 ? stats2->stanullfrac : 0.0;
1943
1944                 selec = (1.0 - nullfrac1) * (1.0 - nullfrac2);
1945                 if (nd1 > nd2)
1946                         selec /= nd1;
1947                 else
1948                         selec /= nd2;
1949         }
1950
1951         if (have_mcvs1)
1952                 free_attstatsslot(vardata1.atttype, values1, nvalues1,
1953                                                   numbers1, nnumbers1);
1954         if (have_mcvs2)
1955                 free_attstatsslot(vardata2.atttype, values2, nvalues2,
1956                                                   numbers2, nnumbers2);
1957
1958         ReleaseVariableStats(vardata1);
1959         ReleaseVariableStats(vardata2);
1960
1961         CLAMP_PROBABILITY(selec);
1962
1963         PG_RETURN_FLOAT8((float8) selec);
1964 }
1965
1966 /*
1967  *              neqjoinsel              - Join selectivity of "!="
1968  */
1969 Datum
1970 neqjoinsel(PG_FUNCTION_ARGS)
1971 {
1972         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
1973         Oid                     operator = PG_GETARG_OID(1);
1974         List       *args = (List *) PG_GETARG_POINTER(2);
1975         JoinType        jointype = (JoinType) PG_GETARG_INT16(3);
1976         Oid                     eqop;
1977         float8          result;
1978
1979         /*
1980          * We want 1 - eqjoinsel() where the equality operator is the one
1981          * associated with this != operator, that is, its negator.
1982          */
1983         eqop = get_negator(operator);
1984         if (eqop)
1985         {
1986                 result = DatumGetFloat8(DirectFunctionCall4(eqjoinsel,
1987                                                                                                         PointerGetDatum(root),
1988                                                                                                         ObjectIdGetDatum(eqop),
1989                                                                                                         PointerGetDatum(args),
1990                                                                                                         Int16GetDatum(jointype)));
1991         }
1992         else
1993         {
1994                 /* Use default selectivity (should we raise an error instead?) */
1995                 result = DEFAULT_EQ_SEL;
1996         }
1997         result = 1.0 - result;
1998         PG_RETURN_FLOAT8(result);
1999 }
2000
2001 /*
2002  *              scalarltjoinsel - Join selectivity of "<" and "<=" for scalars
2003  */
2004 Datum
2005 scalarltjoinsel(PG_FUNCTION_ARGS)
2006 {
2007         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
2008 }
2009
2010 /*
2011  *              scalargtjoinsel - Join selectivity of ">" and ">=" for scalars
2012  */
2013 Datum
2014 scalargtjoinsel(PG_FUNCTION_ARGS)
2015 {
2016         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
2017 }
2018
2019 /*
2020  *              regexeqjoinsel  - Join selectivity of regular-expression pattern match.
2021  */
2022 Datum
2023 regexeqjoinsel(PG_FUNCTION_ARGS)
2024 {
2025         PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
2026 }
2027
2028 /*
2029  *              icregexeqjoinsel        - Join selectivity of case-insensitive regex match.
2030  */
2031 Datum
2032 icregexeqjoinsel(PG_FUNCTION_ARGS)
2033 {
2034         PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
2035 }
2036
2037 /*
2038  *              likejoinsel                     - Join selectivity of LIKE pattern match.
2039  */
2040 Datum
2041 likejoinsel(PG_FUNCTION_ARGS)
2042 {
2043         PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
2044 }
2045
2046 /*
2047  *              iclikejoinsel                   - Join selectivity of ILIKE pattern match.
2048  */
2049 Datum
2050 iclikejoinsel(PG_FUNCTION_ARGS)
2051 {
2052         PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
2053 }
2054
2055 /*
2056  *              regexnejoinsel  - Join selectivity of regex non-match.
2057  */
2058 Datum
2059 regexnejoinsel(PG_FUNCTION_ARGS)
2060 {
2061         float8          result;
2062
2063         result = DatumGetFloat8(regexeqjoinsel(fcinfo));
2064         result = 1.0 - result;
2065         PG_RETURN_FLOAT8(result);
2066 }
2067
2068 /*
2069  *              icregexnejoinsel        - Join selectivity of case-insensitive regex non-match.
2070  */
2071 Datum
2072 icregexnejoinsel(PG_FUNCTION_ARGS)
2073 {
2074         float8          result;
2075
2076         result = DatumGetFloat8(icregexeqjoinsel(fcinfo));
2077         result = 1.0 - result;
2078         PG_RETURN_FLOAT8(result);
2079 }
2080
2081 /*
2082  *              nlikejoinsel            - Join selectivity of LIKE pattern non-match.
2083  */
2084 Datum
2085 nlikejoinsel(PG_FUNCTION_ARGS)
2086 {
2087         float8          result;
2088
2089         result = DatumGetFloat8(likejoinsel(fcinfo));
2090         result = 1.0 - result;
2091         PG_RETURN_FLOAT8(result);
2092 }
2093
2094 /*
2095  *              icnlikejoinsel          - Join selectivity of ILIKE pattern non-match.
2096  */
2097 Datum
2098 icnlikejoinsel(PG_FUNCTION_ARGS)
2099 {
2100         float8          result;
2101
2102         result = DatumGetFloat8(iclikejoinsel(fcinfo));
2103         result = 1.0 - result;
2104         PG_RETURN_FLOAT8(result);
2105 }
2106
2107 /*
2108  * mergejoinscansel                     - Scan selectivity of merge join.
2109  *
2110  * A merge join will stop as soon as it exhausts either input stream.
2111  * Therefore, if we can estimate the ranges of both input variables,
2112  * we can estimate how much of the input will actually be read.  This
2113  * can have a considerable impact on the cost when using indexscans.
2114  *
2115  * clause should be a clause already known to be mergejoinable.  opfamily and
2116  * strategy specify the sort ordering being used.
2117  *
2118  * *leftscan is set to the fraction of the left-hand variable expected
2119  * to be scanned (0 to 1), and similarly *rightscan for the right-hand
2120  * variable.
2121  */
2122 void
2123 mergejoinscansel(PlannerInfo *root, Node *clause,
2124                                  Oid opfamily, int strategy,
2125                                  Selectivity *leftscan,
2126                                  Selectivity *rightscan)
2127 {
2128         Node       *left,
2129                            *right;
2130         VariableStatData leftvar,
2131                                 rightvar;
2132         int                     op_strategy;
2133         Oid                     op_lefttype;
2134         Oid                     op_righttype;
2135         bool            op_recheck;
2136         Oid                     opno,
2137                                 lsortop,
2138                                 rsortop,
2139                                 leop,
2140                                 revleop;
2141         Datum           leftmax,
2142                                 rightmax;
2143         double          selec;
2144
2145         /* Set default results if we can't figure anything out. */
2146         *leftscan = *rightscan = 1.0;
2147
2148         /* Deconstruct the merge clause */
2149         if (!is_opclause(clause))
2150                 return;                                 /* shouldn't happen */
2151         opno = ((OpExpr *) clause)->opno;
2152         left = get_leftop((Expr *) clause);
2153         right = get_rightop((Expr *) clause);
2154         if (!right)
2155                 return;                                 /* shouldn't happen */
2156
2157         /* Look for stats for the inputs */
2158         examine_variable(root, left, 0, &leftvar);
2159         examine_variable(root, right, 0, &rightvar);
2160
2161         /* Extract the operator's declared left/right datatypes */
2162         get_op_opfamily_properties(opno, opfamily,
2163                                                            &op_strategy,
2164                                                            &op_lefttype,
2165                                                            &op_righttype,
2166                                                            &op_recheck);
2167         Assert(op_strategy == BTEqualStrategyNumber);
2168         Assert(!op_recheck);
2169
2170         /*
2171          * Look up the various operators we need.  If we don't find them all,
2172          * it probably means the opfamily is broken, but we cope anyway.
2173          */
2174         switch (strategy)
2175         {
2176                 case BTLessStrategyNumber:
2177                         lsortop = get_opfamily_member(opfamily, op_lefttype, op_lefttype,
2178                                                                                   BTLessStrategyNumber);
2179                         rsortop = get_opfamily_member(opfamily, op_righttype, op_righttype,
2180                                                                                   BTLessStrategyNumber);
2181                         leop = get_opfamily_member(opfamily, op_lefttype, op_righttype,
2182                                                                            BTLessEqualStrategyNumber);
2183                         revleop = get_opfamily_member(opfamily, op_righttype, op_lefttype,
2184                                                                                   BTLessEqualStrategyNumber);
2185                         break;
2186                 case BTGreaterStrategyNumber:
2187                         /* descending-order case */
2188                         lsortop = get_opfamily_member(opfamily, op_lefttype, op_lefttype,
2189                                                                                   BTGreaterStrategyNumber);
2190                         rsortop = get_opfamily_member(opfamily, op_righttype, op_righttype,
2191                                                                                   BTGreaterStrategyNumber);
2192                         leop = get_opfamily_member(opfamily, op_lefttype, op_righttype,
2193                                                                            BTGreaterEqualStrategyNumber);
2194                         revleop = get_opfamily_member(opfamily, op_righttype, op_lefttype,
2195                                                                                   BTGreaterEqualStrategyNumber);
2196                         break;
2197                 default:
2198                         goto fail;                      /* shouldn't get here */
2199         }
2200
2201         if (!OidIsValid(lsortop) ||
2202                 !OidIsValid(rsortop) ||
2203                 !OidIsValid(leop) ||
2204                 !OidIsValid(revleop))
2205                 goto fail;                              /* insufficient info in catalogs */
2206
2207         /* Try to get maximum values of both inputs */
2208         if (!get_variable_maximum(root, &leftvar, lsortop, &leftmax))
2209                 goto fail;                              /* no max available from stats */
2210
2211         if (!get_variable_maximum(root, &rightvar, rsortop, &rightmax))
2212                 goto fail;                              /* no max available from stats */
2213
2214         /*
2215          * Now, the fraction of the left variable that will be scanned is the
2216          * fraction that's <= the right-side maximum value.  But only believe
2217          * non-default estimates, else stick with our 1.0.
2218          */
2219         selec = scalarineqsel(root, leop, false, &leftvar,
2220                                                   rightmax, op_righttype);
2221         if (selec != DEFAULT_INEQ_SEL)
2222                 *leftscan = selec;
2223
2224         /* And similarly for the right variable. */
2225         selec = scalarineqsel(root, revleop, false, &rightvar,
2226                                                   leftmax, op_lefttype);
2227         if (selec != DEFAULT_INEQ_SEL)
2228                 *rightscan = selec;
2229
2230         /*
2231          * Only one of the two fractions can really be less than 1.0; believe the
2232          * smaller estimate and reset the other one to exactly 1.0.  If we get
2233          * exactly equal estimates (as can easily happen with self-joins), believe
2234          * neither.
2235          */
2236         if (*leftscan > *rightscan)
2237                 *leftscan = 1.0;
2238         else if (*leftscan < *rightscan)
2239                 *rightscan = 1.0;
2240         else
2241                 *leftscan = *rightscan = 1.0;
2242
2243 fail:
2244         ReleaseVariableStats(leftvar);
2245         ReleaseVariableStats(rightvar);
2246 }
2247
2248
2249 /*
2250  * Helper routine for estimate_num_groups: add an item to a list of
2251  * GroupVarInfos, but only if it's not known equal to any of the existing
2252  * entries.
2253  */
2254 typedef struct
2255 {
2256         Node       *var;                        /* might be an expression, not just a Var */
2257         RelOptInfo *rel;                        /* relation it belongs to */
2258         double          ndistinct;              /* # distinct values */
2259 } GroupVarInfo;
2260
2261 static List *
2262 add_unique_group_var(PlannerInfo *root, List *varinfos,
2263                                          Node *var, VariableStatData *vardata)
2264 {
2265         GroupVarInfo *varinfo;
2266         double          ndistinct;
2267         ListCell   *lc;
2268
2269         ndistinct = get_variable_numdistinct(vardata);
2270
2271         /* cannot use foreach here because of possible list_delete */
2272         lc = list_head(varinfos);
2273         while (lc)
2274         {
2275                 varinfo = (GroupVarInfo *) lfirst(lc);
2276
2277                 /* must advance lc before list_delete possibly pfree's it */
2278                 lc = lnext(lc);
2279
2280                 /* Drop exact duplicates */
2281                 if (equal(var, varinfo->var))
2282                         return varinfos;
2283
2284                 /*
2285                  * Drop known-equal vars, but only if they belong to different
2286                  * relations (see comments for estimate_num_groups)
2287                  */
2288                 if (vardata->rel != varinfo->rel &&
2289                         exprs_known_equal(root, var, varinfo->var))
2290                 {
2291                         if (varinfo->ndistinct <= ndistinct)
2292                         {
2293                                 /* Keep older item, forget new one */
2294                                 return varinfos;
2295                         }
2296                         else
2297                         {
2298                                 /* Delete the older item */
2299                                 varinfos = list_delete_ptr(varinfos, varinfo);
2300                         }
2301                 }
2302         }
2303
2304         varinfo = (GroupVarInfo *) palloc(sizeof(GroupVarInfo));
2305
2306         varinfo->var = var;
2307         varinfo->rel = vardata->rel;
2308         varinfo->ndistinct = ndistinct;
2309         varinfos = lappend(varinfos, varinfo);
2310         return varinfos;
2311 }
2312
2313 /*
2314  * estimate_num_groups          - Estimate number of groups in a grouped query
2315  *
2316  * Given a query having a GROUP BY clause, estimate how many groups there
2317  * will be --- ie, the number of distinct combinations of the GROUP BY
2318  * expressions.
2319  *
2320  * This routine is also used to estimate the number of rows emitted by
2321  * a DISTINCT filtering step; that is an isomorphic problem.  (Note:
2322  * actually, we only use it for DISTINCT when there's no grouping or
2323  * aggregation ahead of the DISTINCT.)
2324  *
2325  * Inputs:
2326  *      root - the query
2327  *      groupExprs - list of expressions being grouped by
2328  *      input_rows - number of rows estimated to arrive at the group/unique
2329  *              filter step
2330  *
2331  * Given the lack of any cross-correlation statistics in the system, it's
2332  * impossible to do anything really trustworthy with GROUP BY conditions
2333  * involving multiple Vars.  We should however avoid assuming the worst
2334  * case (all possible cross-product terms actually appear as groups) since
2335  * very often the grouped-by Vars are highly correlated.  Our current approach
2336  * is as follows:
2337  *      1.      Reduce the given expressions to a list of unique Vars used.  For
2338  *              example, GROUP BY a, a + b is treated the same as GROUP BY a, b.
2339  *              It is clearly correct not to count the same Var more than once.
2340  *              It is also reasonable to treat f(x) the same as x: f() cannot
2341  *              increase the number of distinct values (unless it is volatile,
2342  *              which we consider unlikely for grouping), but it probably won't
2343  *              reduce the number of distinct values much either.
2344  *              As a special case, if a GROUP BY expression can be matched to an
2345  *              expressional index for which we have statistics, then we treat the
2346  *              whole expression as though it were just a Var.
2347  *      2.      If the list contains Vars of different relations that are known equal
2348  *              due to equijoin clauses, then drop all but one of the Vars from each
2349  *              known-equal set, keeping the one with smallest estimated # of values
2350  *              (since the extra values of the others can't appear in joined rows).
2351  *              Note the reason we only consider Vars of different relations is that
2352  *              if we considered ones of the same rel, we'd be double-counting the
2353  *              restriction selectivity of the equality in the next step.
2354  *      3.      For Vars within a single source rel, we multiply together the numbers
2355  *              of values, clamp to the number of rows in the rel (divided by 10 if
2356  *              more than one Var), and then multiply by the selectivity of the
2357  *              restriction clauses for that rel.  When there's more than one Var,
2358  *              the initial product is probably too high (it's the worst case) but
2359  *              clamping to a fraction of the rel's rows seems to be a helpful
2360  *              heuristic for not letting the estimate get out of hand.  (The factor
2361  *              of 10 is derived from pre-Postgres-7.4 practice.)  Multiplying
2362  *              by the restriction selectivity is effectively assuming that the
2363  *              restriction clauses are independent of the grouping, which is a crummy
2364  *              assumption, but it's hard to do better.
2365  *      4.      If there are Vars from multiple rels, we repeat step 3 for each such
2366  *              rel, and multiply the results together.
2367  * Note that rels not containing grouped Vars are ignored completely, as are
2368  * join clauses other than the equijoin clauses used in step 2.  Such rels
2369  * cannot increase the number of groups, and we assume such clauses do not
2370  * reduce the number either (somewhat bogus, but we don't have the info to
2371  * do better).
2372  */
2373 double
2374 estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows)
2375 {
2376         List       *varinfos = NIL;
2377         double          numdistinct;
2378         ListCell   *l;
2379
2380         /* We should not be called unless query has GROUP BY (or DISTINCT) */
2381         Assert(groupExprs != NIL);
2382
2383         /*
2384          * Steps 1/2: find the unique Vars used, treating an expression as a Var
2385          * if we can find stats for it.  For each one, record the statistical
2386          * estimate of number of distinct values (total in its table, without
2387          * regard for filtering).
2388          */
2389         foreach(l, groupExprs)
2390         {
2391                 Node       *groupexpr = (Node *) lfirst(l);
2392                 VariableStatData vardata;
2393                 List       *varshere;
2394                 ListCell   *l2;
2395
2396                 /*
2397                  * If examine_variable is able to deduce anything about the GROUP BY
2398                  * expression, treat it as a single variable even if it's really more
2399                  * complicated.
2400                  */
2401                 examine_variable(root, groupexpr, 0, &vardata);
2402                 if (vardata.statsTuple != NULL || vardata.isunique)
2403                 {
2404                         varinfos = add_unique_group_var(root, varinfos,
2405                                                                                         groupexpr, &vardata);
2406                         ReleaseVariableStats(vardata);
2407                         continue;
2408                 }
2409                 ReleaseVariableStats(vardata);
2410
2411                 /*
2412                  * Else pull out the component Vars
2413                  */
2414                 varshere = pull_var_clause(groupexpr, false);
2415
2416                 /*
2417                  * If we find any variable-free GROUP BY item, then either it is a
2418                  * constant (and we can ignore it) or it contains a volatile function;
2419                  * in the latter case we punt and assume that each input row will
2420                  * yield a distinct group.
2421                  */
2422                 if (varshere == NIL)
2423                 {
2424                         if (contain_volatile_functions(groupexpr))
2425                                 return input_rows;
2426                         continue;
2427                 }
2428
2429                 /*
2430                  * Else add variables to varinfos list
2431                  */
2432                 foreach(l2, varshere)
2433                 {
2434                         Node       *var = (Node *) lfirst(l2);
2435
2436                         examine_variable(root, var, 0, &vardata);
2437                         varinfos = add_unique_group_var(root, varinfos, var, &vardata);
2438                         ReleaseVariableStats(vardata);
2439                 }
2440         }
2441
2442         /* If now no Vars, we must have an all-constant GROUP BY list. */
2443         if (varinfos == NIL)
2444                 return 1.0;
2445
2446         /*
2447          * Steps 3/4: group Vars by relation and estimate total numdistinct.
2448          *
2449          * For each iteration of the outer loop, we process the frontmost Var in
2450          * varinfos, plus all other Vars in the same relation.  We remove these
2451          * Vars from the newvarinfos list for the next iteration. This is the
2452          * easiest way to group Vars of same rel together.
2453          */
2454         numdistinct = 1.0;
2455
2456         do
2457         {
2458                 GroupVarInfo *varinfo1 = (GroupVarInfo *) linitial(varinfos);
2459                 RelOptInfo *rel = varinfo1->rel;
2460                 double          reldistinct = varinfo1->ndistinct;
2461                 double          relmaxndistinct = reldistinct;
2462                 int                     relvarcount = 1;
2463                 List       *newvarinfos = NIL;
2464
2465                 /*
2466                  * Get the product of numdistinct estimates of the Vars for this rel.
2467                  * Also, construct new varinfos list of remaining Vars.
2468                  */
2469                 for_each_cell(l, lnext(list_head(varinfos)))
2470                 {
2471                         GroupVarInfo *varinfo2 = (GroupVarInfo *) lfirst(l);
2472
2473                         if (varinfo2->rel == varinfo1->rel)
2474                         {
2475                                 reldistinct *= varinfo2->ndistinct;
2476                                 if (relmaxndistinct < varinfo2->ndistinct)
2477                                         relmaxndistinct = varinfo2->ndistinct;
2478                                 relvarcount++;
2479                         }
2480                         else
2481                         {
2482                                 /* not time to process varinfo2 yet */
2483                                 newvarinfos = lcons(varinfo2, newvarinfos);
2484                         }
2485                 }
2486
2487                 /*
2488                  * Sanity check --- don't divide by zero if empty relation.
2489                  */
2490                 Assert(rel->reloptkind == RELOPT_BASEREL);
2491                 if (rel->tuples > 0)
2492                 {
2493                         /*
2494                          * Clamp to size of rel, or size of rel / 10 if multiple Vars. The
2495                          * fudge factor is because the Vars are probably correlated but we
2496                          * don't know by how much.  We should never clamp to less than the
2497                          * largest ndistinct value for any of the Vars, though, since
2498                          * there will surely be at least that many groups.
2499                          */
2500                         double          clamp = rel->tuples;
2501
2502                         if (relvarcount > 1)
2503                         {
2504                                 clamp *= 0.1;
2505                                 if (clamp < relmaxndistinct)
2506                                 {
2507                                         clamp = relmaxndistinct;
2508                                         /* for sanity in case some ndistinct is too large: */
2509                                         if (clamp > rel->tuples)
2510                                                 clamp = rel->tuples;
2511                                 }
2512                         }
2513                         if (reldistinct > clamp)
2514                                 reldistinct = clamp;
2515
2516                         /*
2517                          * Multiply by restriction selectivity.
2518                          */
2519                         reldistinct *= rel->rows / rel->tuples;
2520
2521                         /*
2522                          * Update estimate of total distinct groups.
2523                          */
2524                         numdistinct *= reldistinct;
2525                 }
2526
2527                 varinfos = newvarinfos;
2528         } while (varinfos != NIL);
2529
2530         numdistinct = ceil(numdistinct);
2531
2532         /* Guard against out-of-range answers */
2533         if (numdistinct > input_rows)
2534                 numdistinct = input_rows;
2535         if (numdistinct < 1.0)
2536                 numdistinct = 1.0;
2537
2538         return numdistinct;
2539 }
2540
2541 /*
2542  * Estimate hash bucketsize fraction (ie, number of entries in a bucket
2543  * divided by total tuples in relation) if the specified expression is used
2544  * as a hash key.
2545  *
2546  * XXX This is really pretty bogus since we're effectively assuming that the
2547  * distribution of hash keys will be the same after applying restriction
2548  * clauses as it was in the underlying relation.  However, we are not nearly
2549  * smart enough to figure out how the restrict clauses might change the
2550  * distribution, so this will have to do for now.
2551  *
2552  * We are passed the number of buckets the executor will use for the given
2553  * input relation.      If the data were perfectly distributed, with the same
2554  * number of tuples going into each available bucket, then the bucketsize
2555  * fraction would be 1/nbuckets.  But this happy state of affairs will occur
2556  * only if (a) there are at least nbuckets distinct data values, and (b)
2557  * we have a not-too-skewed data distribution.  Otherwise the buckets will
2558  * be nonuniformly occupied.  If the other relation in the join has a key
2559  * distribution similar to this one's, then the most-loaded buckets are
2560  * exactly those that will be probed most often.  Therefore, the "average"
2561  * bucket size for costing purposes should really be taken as something close
2562  * to the "worst case" bucket size.  We try to estimate this by adjusting the
2563  * fraction if there are too few distinct data values, and then scaling up
2564  * by the ratio of the most common value's frequency to the average frequency.
2565  *
2566  * If no statistics are available, use a default estimate of 0.1.  This will
2567  * discourage use of a hash rather strongly if the inner relation is large,
2568  * which is what we want.  We do not want to hash unless we know that the
2569  * inner rel is well-dispersed (or the alternatives seem much worse).
2570  */
2571 Selectivity
2572 estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey, double nbuckets)
2573 {
2574         VariableStatData vardata;
2575         double          estfract,
2576                                 ndistinct,
2577                                 stanullfrac,
2578                                 mcvfreq,
2579                                 avgfreq;
2580         float4     *numbers;
2581         int                     nnumbers;
2582
2583         examine_variable(root, hashkey, 0, &vardata);
2584
2585         /* Get number of distinct values and fraction that are null */
2586         ndistinct = get_variable_numdistinct(&vardata);
2587
2588         if (HeapTupleIsValid(vardata.statsTuple))
2589         {
2590                 Form_pg_statistic stats;
2591
2592                 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
2593                 stanullfrac = stats->stanullfrac;
2594         }
2595         else
2596         {
2597                 /*
2598                  * Believe a default ndistinct only if it came from stats. Otherwise
2599                  * punt and return 0.1, per comments above.
2600                  */
2601                 if (ndistinct == DEFAULT_NUM_DISTINCT)
2602                 {
2603                         ReleaseVariableStats(vardata);
2604                         return (Selectivity) 0.1;
2605                 }
2606
2607                 stanullfrac = 0.0;
2608         }
2609
2610         /* Compute avg freq of all distinct data values in raw relation */
2611         avgfreq = (1.0 - stanullfrac) / ndistinct;
2612
2613         /*
2614          * Adjust ndistinct to account for restriction clauses.  Observe we are
2615          * assuming that the data distribution is affected uniformly by the
2616          * restriction clauses!
2617          *
2618          * XXX Possibly better way, but much more expensive: multiply by
2619          * selectivity of rel's restriction clauses that mention the target Var.
2620          */
2621         if (vardata.rel)
2622                 ndistinct *= vardata.rel->rows / vardata.rel->tuples;
2623
2624         /*
2625          * Initial estimate of bucketsize fraction is 1/nbuckets as long as the
2626          * number of buckets is less than the expected number of distinct values;
2627          * otherwise it is 1/ndistinct.
2628          */
2629         if (ndistinct > nbuckets)
2630                 estfract = 1.0 / nbuckets;
2631         else
2632                 estfract = 1.0 / ndistinct;
2633
2634         /*
2635          * Look up the frequency of the most common value, if available.
2636          */
2637         mcvfreq = 0.0;
2638
2639         if (HeapTupleIsValid(vardata.statsTuple))
2640         {
2641                 if (get_attstatsslot(vardata.statsTuple,
2642                                                          vardata.atttype, vardata.atttypmod,
2643                                                          STATISTIC_KIND_MCV, InvalidOid,
2644                                                          NULL, NULL, &numbers, &nnumbers))
2645                 {
2646                         /*
2647                          * The first MCV stat is for the most common value.
2648                          */
2649                         if (nnumbers > 0)
2650                                 mcvfreq = numbers[0];
2651                         free_attstatsslot(vardata.atttype, NULL, 0,
2652                                                           numbers, nnumbers);
2653                 }
2654         }
2655
2656         /*
2657          * Adjust estimated bucketsize upward to account for skewed distribution.
2658          */
2659         if (avgfreq > 0.0 && mcvfreq > avgfreq)
2660                 estfract *= mcvfreq / avgfreq;
2661
2662         /*
2663          * Clamp bucketsize to sane range (the above adjustment could easily
2664          * produce an out-of-range result).  We set the lower bound a little above
2665          * zero, since zero isn't a very sane result.
2666          */
2667         if (estfract < 1.0e-6)
2668                 estfract = 1.0e-6;
2669         else if (estfract > 1.0)
2670                 estfract = 1.0;
2671
2672         ReleaseVariableStats(vardata);
2673
2674         return (Selectivity) estfract;
2675 }
2676
2677
2678 /*-------------------------------------------------------------------------
2679  *
2680  * Support routines
2681  *
2682  *-------------------------------------------------------------------------
2683  */
2684
2685 /*
2686  * convert_to_scalar
2687  *        Convert non-NULL values of the indicated types to the comparison
2688  *        scale needed by scalarineqsel().
2689  *        Returns "true" if successful.
2690  *
2691  * XXX this routine is a hack: ideally we should look up the conversion
2692  * subroutines in pg_type.
2693  *
2694  * All numeric datatypes are simply converted to their equivalent
2695  * "double" values.  (NUMERIC values that are outside the range of "double"
2696  * are clamped to +/- HUGE_VAL.)
2697  *
2698  * String datatypes are converted by convert_string_to_scalar(),
2699  * which is explained below.  The reason why this routine deals with
2700  * three values at a time, not just one, is that we need it for strings.
2701  *
2702  * The bytea datatype is just enough different from strings that it has
2703  * to be treated separately.
2704  *
2705  * The several datatypes representing absolute times are all converted
2706  * to Timestamp, which is actually a double, and then we just use that
2707  * double value.  Note this will give correct results even for the "special"
2708  * values of Timestamp, since those are chosen to compare correctly;
2709  * see timestamp_cmp.
2710  *
2711  * The several datatypes representing relative times (intervals) are all
2712  * converted to measurements expressed in seconds.
2713  */
2714 static bool
2715 convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
2716                                   Datum lobound, Datum hibound, Oid boundstypid,
2717                                   double *scaledlobound, double *scaledhibound)
2718 {
2719         /*
2720          * Both the valuetypid and the boundstypid should exactly match the
2721          * declared input type(s) of the operator we are invoked for, so we just
2722          * error out if either is not recognized.
2723          *
2724          * XXX The histogram we are interpolating between points of could belong
2725          * to a column that's only binary-compatible with the declared type. In
2726          * essence we are assuming that the semantics of binary-compatible types
2727          * are enough alike that we can use a histogram generated with one type's
2728          * operators to estimate selectivity for the other's.  This is outright
2729          * wrong in some cases --- in particular signed versus unsigned
2730          * interpretation could trip us up.  But it's useful enough in the
2731          * majority of cases that we do it anyway.      Should think about more
2732          * rigorous ways to do it.
2733          */
2734         switch (valuetypid)
2735         {
2736                         /*
2737                          * Built-in numeric types
2738                          */
2739                 case BOOLOID:
2740                 case INT2OID:
2741                 case INT4OID:
2742                 case INT8OID:
2743                 case FLOAT4OID:
2744                 case FLOAT8OID:
2745                 case NUMERICOID:
2746                 case OIDOID:
2747                 case REGPROCOID:
2748                 case REGPROCEDUREOID:
2749                 case REGOPEROID:
2750                 case REGOPERATOROID:
2751                 case REGCLASSOID:
2752                 case REGTYPEOID:
2753                         *scaledvalue = convert_numeric_to_scalar(value, valuetypid);
2754                         *scaledlobound = convert_numeric_to_scalar(lobound, boundstypid);
2755                         *scaledhibound = convert_numeric_to_scalar(hibound, boundstypid);
2756                         return true;
2757
2758                         /*
2759                          * Built-in string types
2760                          */
2761                 case CHAROID:
2762                 case BPCHAROID:
2763                 case VARCHAROID:
2764                 case TEXTOID:
2765                 case NAMEOID:
2766                         {
2767                                 char       *valstr = convert_string_datum(value, valuetypid);
2768                                 char       *lostr = convert_string_datum(lobound, boundstypid);
2769                                 char       *histr = convert_string_datum(hibound, boundstypid);
2770
2771                                 convert_string_to_scalar(valstr, scaledvalue,
2772                                                                                  lostr, scaledlobound,
2773                                                                                  histr, scaledhibound);
2774                                 pfree(valstr);
2775                                 pfree(lostr);
2776                                 pfree(histr);
2777                                 return true;
2778                         }
2779
2780                         /*
2781                          * Built-in bytea type
2782                          */
2783                 case BYTEAOID:
2784                         {
2785                                 convert_bytea_to_scalar(value, scaledvalue,
2786                                                                                 lobound, scaledlobound,
2787                                                                                 hibound, scaledhibound);
2788                                 return true;
2789                         }
2790
2791                         /*
2792                          * Built-in time types
2793                          */
2794                 case TIMESTAMPOID:
2795                 case TIMESTAMPTZOID:
2796                 case ABSTIMEOID:
2797                 case DATEOID:
2798                 case INTERVALOID:
2799                 case RELTIMEOID:
2800                 case TINTERVALOID:
2801                 case TIMEOID:
2802                 case TIMETZOID:
2803                         *scaledvalue = convert_timevalue_to_scalar(value, valuetypid);
2804                         *scaledlobound = convert_timevalue_to_scalar(lobound, boundstypid);
2805                         *scaledhibound = convert_timevalue_to_scalar(hibound, boundstypid);
2806                         return true;
2807
2808                         /*
2809                          * Built-in network types
2810                          */
2811                 case INETOID:
2812                 case CIDROID:
2813                 case MACADDROID:
2814                         *scaledvalue = convert_network_to_scalar(value, valuetypid);
2815                         *scaledlobound = convert_network_to_scalar(lobound, boundstypid);
2816                         *scaledhibound = convert_network_to_scalar(hibound, boundstypid);
2817                         return true;
2818         }
2819         /* Don't know how to convert */
2820         *scaledvalue = *scaledlobound = *scaledhibound = 0;
2821         return false;
2822 }
2823
2824 /*
2825  * Do convert_to_scalar()'s work for any numeric data type.
2826  */
2827 static double
2828 convert_numeric_to_scalar(Datum value, Oid typid)
2829 {
2830         switch (typid)
2831         {
2832                 case BOOLOID:
2833                         return (double) DatumGetBool(value);
2834                 case INT2OID:
2835                         return (double) DatumGetInt16(value);
2836                 case INT4OID:
2837                         return (double) DatumGetInt32(value);
2838                 case INT8OID:
2839                         return (double) DatumGetInt64(value);
2840                 case FLOAT4OID:
2841                         return (double) DatumGetFloat4(value);
2842                 case FLOAT8OID:
2843                         return (double) DatumGetFloat8(value);
2844                 case NUMERICOID:
2845                         /* Note: out-of-range values will be clamped to +-HUGE_VAL */
2846                         return (double)
2847                                 DatumGetFloat8(DirectFunctionCall1(numeric_float8_no_overflow,
2848                                                                                                    value));
2849                 case OIDOID:
2850                 case REGPROCOID:
2851                 case REGPROCEDUREOID:
2852                 case REGOPEROID:
2853                 case REGOPERATOROID:
2854                 case REGCLASSOID:
2855                 case REGTYPEOID:
2856                         /* we can treat OIDs as integers... */
2857                         return (double) DatumGetObjectId(value);
2858         }
2859
2860         /*
2861          * Can't get here unless someone tries to use scalarltsel/scalargtsel on
2862          * an operator with one numeric and one non-numeric operand.
2863          */
2864         elog(ERROR, "unsupported type: %u", typid);
2865         return 0;
2866 }
2867
2868 /*
2869  * Do convert_to_scalar()'s work for any character-string data type.
2870  *
2871  * String datatypes are converted to a scale that ranges from 0 to 1,
2872  * where we visualize the bytes of the string as fractional digits.
2873  *
2874  * We do not want the base to be 256, however, since that tends to
2875  * generate inflated selectivity estimates; few databases will have
2876  * occurrences of all 256 possible byte values at each position.
2877  * Instead, use the smallest and largest byte values seen in the bounds
2878  * as the estimated range for each byte, after some fudging to deal with
2879  * the fact that we probably aren't going to see the full range that way.
2880  *
2881  * An additional refinement is that we discard any common prefix of the
2882  * three strings before computing the scaled values.  This allows us to
2883  * "zoom in" when we encounter a narrow data range.  An example is a phone
2884  * number database where all the values begin with the same area code.
2885  * (Actually, the bounds will be adjacent histogram-bin-boundary values,
2886  * so this is more likely to happen than you might think.)
2887  */
2888 static void
2889 convert_string_to_scalar(char *value,
2890                                                  double *scaledvalue,
2891                                                  char *lobound,
2892                                                  double *scaledlobound,
2893                                                  char *hibound,
2894                                                  double *scaledhibound)
2895 {
2896         int                     rangelo,
2897                                 rangehi;
2898         char       *sptr;
2899
2900         rangelo = rangehi = (unsigned char) hibound[0];
2901         for (sptr = lobound; *sptr; sptr++)
2902         {
2903                 if (rangelo > (unsigned char) *sptr)
2904                         rangelo = (unsigned char) *sptr;
2905                 if (rangehi < (unsigned char) *sptr)
2906                         rangehi = (unsigned char) *sptr;
2907         }
2908         for (sptr = hibound; *sptr; sptr++)
2909         {
2910                 if (rangelo > (unsigned char) *sptr)
2911                         rangelo = (unsigned char) *sptr;
2912                 if (rangehi < (unsigned char) *sptr)
2913                         rangehi = (unsigned char) *sptr;
2914         }
2915         /* If range includes any upper-case ASCII chars, make it include all */
2916         if (rangelo <= 'Z' && rangehi >= 'A')
2917         {
2918                 if (rangelo > 'A')
2919                         rangelo = 'A';
2920                 if (rangehi < 'Z')
2921                         rangehi = 'Z';
2922         }
2923         /* Ditto lower-case */
2924         if (rangelo <= 'z' && rangehi >= 'a')
2925         {
2926                 if (rangelo > 'a')
2927                         rangelo = 'a';
2928                 if (rangehi < 'z')
2929                         rangehi = 'z';
2930         }
2931         /* Ditto digits */
2932         if (rangelo <= '9' && rangehi >= '0')
2933         {
2934                 if (rangelo > '0')
2935                         rangelo = '0';
2936                 if (rangehi < '9')
2937                         rangehi = '9';
2938         }
2939
2940         /*
2941          * If range includes less than 10 chars, assume we have not got enough
2942          * data, and make it include regular ASCII set.
2943          */
2944         if (rangehi - rangelo < 9)
2945         {
2946                 rangelo = ' ';
2947                 rangehi = 127;
2948         }
2949
2950         /*
2951          * Now strip any common prefix of the three strings.
2952          */
2953         while (*lobound)
2954         {
2955                 if (*lobound != *hibound || *lobound != *value)
2956                         break;
2957                 lobound++, hibound++, value++;
2958         }
2959
2960         /*
2961          * Now we can do the conversions.
2962          */
2963         *scaledvalue = convert_one_string_to_scalar(value, rangelo, rangehi);
2964         *scaledlobound = convert_one_string_to_scalar(lobound, rangelo, rangehi);
2965         *scaledhibound = convert_one_string_to_scalar(hibound, rangelo, rangehi);
2966 }
2967
2968 static double
2969 convert_one_string_to_scalar(char *value, int rangelo, int rangehi)
2970 {
2971         int                     slen = strlen(value);
2972         double          num,
2973                                 denom,
2974                                 base;
2975
2976         if (slen <= 0)
2977                 return 0.0;                             /* empty string has scalar value 0 */
2978
2979         /*
2980          * Since base is at least 10, need not consider more than about 20 chars
2981          */
2982         if (slen > 20)
2983                 slen = 20;
2984
2985         /* Convert initial characters to fraction */
2986         base = rangehi - rangelo + 1;
2987         num = 0.0;
2988         denom = base;
2989         while (slen-- > 0)
2990         {
2991                 int                     ch = (unsigned char) *value++;
2992
2993                 if (ch < rangelo)
2994                         ch = rangelo - 1;
2995                 else if (ch > rangehi)
2996                         ch = rangehi + 1;
2997                 num += ((double) (ch - rangelo)) / denom;
2998                 denom *= base;
2999         }
3000
3001         return num;
3002 }
3003
3004 /*
3005  * Convert a string-type Datum into a palloc'd, null-terminated string.
3006  *
3007  * When using a non-C locale, we must pass the string through strxfrm()
3008  * before continuing, so as to generate correct locale-specific results.
3009  */
3010 static char *
3011 convert_string_datum(Datum value, Oid typid)
3012 {
3013         char       *val;
3014
3015         switch (typid)
3016         {
3017                 case CHAROID:
3018                         val = (char *) palloc(2);
3019                         val[0] = DatumGetChar(value);
3020                         val[1] = '\0';
3021                         break;
3022                 case BPCHAROID:
3023                 case VARCHAROID:
3024                 case TEXTOID:
3025                         {
3026                                 char       *str = (char *) VARDATA(DatumGetPointer(value));
3027                                 int                     strlength = VARSIZE(DatumGetPointer(value)) - VARHDRSZ;
3028
3029                                 val = (char *) palloc(strlength + 1);
3030                                 memcpy(val, str, strlength);
3031                                 val[strlength] = '\0';
3032                                 break;
3033                         }
3034                 case NAMEOID:
3035                         {
3036                                 NameData   *nm = (NameData *) DatumGetPointer(value);
3037
3038                                 val = pstrdup(NameStr(*nm));
3039                                 break;
3040                         }
3041                 default:
3042
3043                         /*
3044                          * Can't get here unless someone tries to use scalarltsel on an
3045                          * operator with one string and one non-string operand.
3046                          */
3047                         elog(ERROR, "unsupported type: %u", typid);
3048                         return NULL;
3049         }
3050
3051         if (!lc_collate_is_c())
3052         {
3053                 char       *xfrmstr;
3054                 size_t          xfrmlen;
3055                 size_t          xfrmlen2;
3056
3057                 /*
3058                  * Note: originally we guessed at a suitable output buffer size, and
3059                  * only needed to call strxfrm twice if our guess was too small.
3060                  * However, it seems that some versions of Solaris have buggy strxfrm
3061                  * that can write past the specified buffer length in that scenario.
3062                  * So, do it the dumb way for portability.
3063                  *
3064                  * Yet other systems (e.g., glibc) sometimes return a smaller value
3065                  * from the second call than the first; thus the Assert must be <= not
3066                  * == as you'd expect.  Can't any of these people program their way
3067                  * out of a paper bag?
3068                  */
3069 #if _MSC_VER == 1400                    /* VS.Net 2005 */
3070
3071                 /*
3072                  * http://connect.microsoft.com/VisualStudio/feedback/ViewFeedback.aspx
3073                  * ?FeedbackID=99694
3074                  */
3075                 {
3076                         char            x[1];
3077
3078                         xfrmlen = strxfrm(x, val, 0);
3079                 }
3080 #else
3081                 xfrmlen = strxfrm(NULL, val, 0);
3082 #endif
3083                 xfrmstr = (char *) palloc(xfrmlen + 1);
3084                 xfrmlen2 = strxfrm(xfrmstr, val, xfrmlen + 1);
3085                 Assert(xfrmlen2 <= xfrmlen);
3086                 pfree(val);
3087                 val = xfrmstr;
3088         }
3089
3090         return val;
3091 }
3092
3093 /*
3094  * Do convert_to_scalar()'s work for any bytea data type.
3095  *
3096  * Very similar to convert_string_to_scalar except we can't assume
3097  * null-termination and therefore pass explicit lengths around.
3098  *
3099  * Also, assumptions about likely "normal" ranges of characters have been
3100  * removed - a data range of 0..255 is always used, for now.  (Perhaps
3101  * someday we will add information about actual byte data range to
3102  * pg_statistic.)
3103  */
3104 static void
3105 convert_bytea_to_scalar(Datum value,
3106                                                 double *scaledvalue,
3107                                                 Datum lobound,
3108                                                 double *scaledlobound,
3109                                                 Datum hibound,
3110                                                 double *scaledhibound)
3111 {
3112         int                     rangelo,
3113                                 rangehi,
3114                                 valuelen = VARSIZE(DatumGetPointer(value)) - VARHDRSZ,
3115                                 loboundlen = VARSIZE(DatumGetPointer(lobound)) - VARHDRSZ,
3116                                 hiboundlen = VARSIZE(DatumGetPointer(hibound)) - VARHDRSZ,
3117                                 i,
3118                                 minlen;
3119         unsigned char *valstr = (unsigned char *) VARDATA(DatumGetPointer(value)),
3120                            *lostr = (unsigned char *) VARDATA(DatumGetPointer(lobound)),
3121                            *histr = (unsigned char *) VARDATA(DatumGetPointer(hibound));
3122
3123         /*
3124          * Assume bytea data is uniformly distributed across all byte values.
3125          */
3126         rangelo = 0;
3127         rangehi = 255;
3128
3129         /*
3130          * Now strip any common prefix of the three strings.
3131          */
3132         minlen = Min(Min(valuelen, loboundlen), hiboundlen);
3133         for (i = 0; i < minlen; i++)
3134         {
3135                 if (*lostr != *histr || *lostr != *valstr)
3136                         break;
3137                 lostr++, histr++, valstr++;
3138                 loboundlen--, hiboundlen--, valuelen--;
3139         }
3140
3141         /*
3142          * Now we can do the conversions.
3143          */
3144         *scaledvalue = convert_one_bytea_to_scalar(valstr, valuelen, rangelo, rangehi);
3145         *scaledlobound = convert_one_bytea_to_scalar(lostr, loboundlen, rangelo, rangehi);
3146         *scaledhibound = convert_one_bytea_to_scalar(histr, hiboundlen, rangelo, rangehi);
3147 }
3148
3149 static double
3150 convert_one_bytea_to_scalar(unsigned char *value, int valuelen,
3151                                                         int rangelo, int rangehi)
3152 {
3153         double          num,
3154                                 denom,
3155                                 base;
3156
3157         if (valuelen <= 0)
3158                 return 0.0;                             /* empty string has scalar value 0 */
3159
3160         /*
3161          * Since base is 256, need not consider more than about 10 chars (even
3162          * this many seems like overkill)
3163          */
3164         if (valuelen > 10)
3165                 valuelen = 10;
3166
3167         /* Convert initial characters to fraction */
3168         base = rangehi - rangelo + 1;
3169         num = 0.0;
3170         denom = base;
3171         while (valuelen-- > 0)
3172         {
3173                 int                     ch = *value++;
3174
3175                 if (ch < rangelo)
3176                         ch = rangelo - 1;
3177                 else if (ch > rangehi)
3178                         ch = rangehi + 1;
3179                 num += ((double) (ch - rangelo)) / denom;
3180                 denom *= base;
3181         }
3182
3183         return num;
3184 }
3185
3186 /*
3187  * Do convert_to_scalar()'s work for any timevalue data type.
3188  */
3189 static double
3190 convert_timevalue_to_scalar(Datum value, Oid typid)
3191 {
3192         switch (typid)
3193         {
3194                 case TIMESTAMPOID:
3195                         return DatumGetTimestamp(value);
3196                 case TIMESTAMPTZOID:
3197                         return DatumGetTimestampTz(value);
3198                 case ABSTIMEOID:
3199                         return DatumGetTimestamp(DirectFunctionCall1(abstime_timestamp,
3200                                                                                                                  value));
3201                 case DATEOID:
3202                         return DatumGetTimestamp(DirectFunctionCall1(date_timestamp,
3203                                                                                                                  value));
3204                 case INTERVALOID:
3205                         {
3206                                 Interval   *interval = DatumGetIntervalP(value);
3207
3208                                 /*
3209                                  * Convert the month part of Interval to days using assumed
3210                                  * average month length of 365.25/12.0 days.  Not too
3211                                  * accurate, but plenty good enough for our purposes.
3212                                  */
3213 #ifdef HAVE_INT64_TIMESTAMP
3214                                 return interval->time + interval->day * (double) USECS_PER_DAY +
3215                                         interval->month * ((DAYS_PER_YEAR / (double) MONTHS_PER_YEAR) * USECS_PER_DAY);
3216 #else
3217                                 return interval->time + interval->day * SECS_PER_DAY +
3218                                         interval->month * ((DAYS_PER_YEAR / (double) MONTHS_PER_YEAR) * (double) SECS_PER_DAY);
3219 #endif
3220                         }
3221                 case RELTIMEOID:
3222 #ifdef HAVE_INT64_TIMESTAMP
3223                         return (DatumGetRelativeTime(value) * 1000000.0);
3224 #else
3225                         return DatumGetRelativeTime(value);
3226 #endif
3227                 case TINTERVALOID:
3228                         {
3229                                 TimeInterval tinterval = DatumGetTimeInterval(value);
3230
3231 #ifdef HAVE_INT64_TIMESTAMP
3232                                 if (tinterval->status != 0)
3233                                         return ((tinterval->data[1] - tinterval->data[0]) * 1000000.0);
3234 #else
3235                                 if (tinterval->status != 0)
3236                                         return tinterval->data[1] - tinterval->data[0];
3237 #endif
3238                                 return 0;               /* for lack of a better idea */
3239                         }
3240                 case TIMEOID:
3241                         return DatumGetTimeADT(value);
3242                 case TIMETZOID:
3243                         {
3244                                 TimeTzADT  *timetz = DatumGetTimeTzADTP(value);
3245
3246                                 /* use GMT-equivalent time */
3247 #ifdef HAVE_INT64_TIMESTAMP
3248                                 return (double) (timetz->time + (timetz->zone * 1000000.0));
3249 #else
3250                                 return (double) (timetz->time + timetz->zone);
3251 #endif
3252                         }
3253         }
3254
3255         /*
3256          * Can't get here unless someone tries to use scalarltsel/scalargtsel on
3257          * an operator with one timevalue and one non-timevalue operand.
3258          */
3259         elog(ERROR, "unsupported type: %u", typid);
3260         return 0;
3261 }
3262
3263
3264 /*
3265  * get_restriction_variable
3266  *              Examine the args of a restriction clause to see if it's of the
3267  *              form (variable op pseudoconstant) or (pseudoconstant op variable),
3268  *              where "variable" could be either a Var or an expression in vars of a
3269  *              single relation.  If so, extract information about the variable,
3270  *              and also indicate which side it was on and the other argument.
3271  *
3272  * Inputs:
3273  *      root: the planner info
3274  *      args: clause argument list
3275  *      varRelid: see specs for restriction selectivity functions
3276  *
3277  * Outputs: (these are valid only if TRUE is returned)
3278  *      *vardata: gets information about variable (see examine_variable)
3279  *      *other: gets other clause argument, aggressively reduced to a constant
3280  *      *varonleft: set TRUE if variable is on the left, FALSE if on the right
3281  *
3282  * Returns TRUE if a variable is identified, otherwise FALSE.
3283  *
3284  * Note: if there are Vars on both sides of the clause, we must fail, because
3285  * callers are expecting that the other side will act like a pseudoconstant.
3286  */
3287 bool
3288 get_restriction_variable(PlannerInfo *root, List *args, int varRelid,
3289                                                  VariableStatData *vardata, Node **other,
3290                                                  bool *varonleft)
3291 {
3292         Node       *left,
3293                            *right;
3294         VariableStatData rdata;
3295
3296         /* Fail if not a binary opclause (probably shouldn't happen) */
3297         if (list_length(args) != 2)
3298                 return false;
3299
3300         left = (Node *) linitial(args);
3301         right = (Node *) lsecond(args);
3302
3303         /*
3304          * Examine both sides.  Note that when varRelid is nonzero, Vars of other
3305          * relations will be treated as pseudoconstants.
3306          */
3307         examine_variable(root, left, varRelid, vardata);
3308         examine_variable(root, right, varRelid, &rdata);
3309
3310         /*
3311          * If one side is a variable and the other not, we win.
3312          */
3313         if (vardata->rel && rdata.rel == NULL)
3314         {
3315                 *varonleft = true;
3316                 *other = estimate_expression_value(rdata.var);
3317                 /* Assume we need no ReleaseVariableStats(rdata) here */
3318                 return true;
3319         }
3320
3321         if (vardata->rel == NULL && rdata.rel)
3322         {
3323                 *varonleft = false;
3324                 *other = estimate_expression_value(vardata->var);
3325                 /* Assume we need no ReleaseVariableStats(*vardata) here */
3326                 *vardata = rdata;
3327                 return true;
3328         }
3329
3330         /* Ooops, clause has wrong structure (probably var op var) */
3331         ReleaseVariableStats(*vardata);
3332         ReleaseVariableStats(rdata);
3333
3334         return false;
3335 }
3336
3337 /*
3338  * get_join_variables
3339  *              Apply examine_variable() to each side of a join clause.
3340  */
3341 void
3342 get_join_variables(PlannerInfo *root, List *args,
3343                                    VariableStatData *vardata1, VariableStatData *vardata2)
3344 {
3345         Node       *left,
3346                            *right;
3347
3348         if (list_length(args) != 2)
3349                 elog(ERROR, "join operator should take two arguments");
3350
3351         left = (Node *) linitial(args);
3352         right = (Node *) lsecond(args);
3353
3354         examine_variable(root, left, 0, vardata1);
3355         examine_variable(root, right, 0, vardata2);
3356 }
3357
3358 /*
3359  * examine_variable
3360  *              Try to look up statistical data about an expression.
3361  *              Fill in a VariableStatData struct to describe the expression.
3362  *
3363  * Inputs:
3364  *      root: the planner info
3365  *      node: the expression tree to examine
3366  *      varRelid: see specs for restriction selectivity functions
3367  *
3368  * Outputs: *vardata is filled as follows:
3369  *      var: the input expression (with any binary relabeling stripped, if
3370  *              it is or contains a variable; but otherwise the type is preserved)
3371  *      rel: RelOptInfo for relation containing variable; NULL if expression
3372  *              contains no Vars (NOTE this could point to a RelOptInfo of a
3373  *              subquery, not one in the current query).
3374  *      statsTuple: the pg_statistic entry for the variable, if one exists;
3375  *              otherwise NULL.
3376  *      vartype: exposed type of the expression; this should always match
3377  *              the declared input type of the operator we are estimating for.
3378  *      atttype, atttypmod: type data to pass to get_attstatsslot().  This is
3379  *              commonly the same as the exposed type of the variable argument,
3380  *              but can be different in binary-compatible-type cases.
3381  *
3382  * Caller is responsible for doing ReleaseVariableStats() before exiting.
3383  */
3384 void
3385 examine_variable(PlannerInfo *root, Node *node, int varRelid,
3386                                  VariableStatData *vardata)
3387 {
3388         Node       *basenode;
3389         Relids          varnos;
3390         RelOptInfo *onerel;
3391
3392         /* Make sure we don't return dangling pointers in vardata */
3393         MemSet(vardata, 0, sizeof(VariableStatData));
3394
3395         /* Save the exposed type of the expression */
3396         vardata->vartype = exprType(node);
3397
3398         /* Look inside any binary-compatible relabeling */
3399
3400         if (IsA(node, RelabelType))
3401                 basenode = (Node *) ((RelabelType *) node)->arg;
3402         else
3403                 basenode = node;
3404
3405         /* Fast path for a simple Var */
3406
3407         if (IsA(basenode, Var) &&
3408                 (varRelid == 0 || varRelid == ((Var *) basenode)->varno))
3409         {
3410                 Var                *var = (Var *) basenode;
3411                 RangeTblEntry *rte;
3412
3413                 vardata->var = basenode;        /* return Var without relabeling */
3414                 vardata->rel = find_base_rel(root, var->varno);
3415                 vardata->atttype = var->vartype;
3416                 vardata->atttypmod = var->vartypmod;
3417
3418                 rte = rt_fetch(var->varno, root->parse->rtable);
3419
3420                 if (rte->inh)
3421                 {
3422                         /*
3423                          * XXX This means the Var represents a column of an append
3424                          * relation. Later add code to look at the member relations and
3425                          * try to derive some kind of combined statistics?
3426                          */
3427                 }
3428                 else if (rte->rtekind == RTE_RELATION)
3429                 {
3430                         vardata->statsTuple = SearchSysCache(STATRELATT,
3431                                                                                                  ObjectIdGetDatum(rte->relid),
3432                                                                                                  Int16GetDatum(var->varattno),
3433                                                                                                  0, 0);
3434                 }
3435                 else
3436                 {
3437                         /*
3438                          * XXX This means the Var comes from a JOIN or sub-SELECT. Later
3439                          * add code to dig down into the join etc and see if we can trace
3440                          * the variable to something with stats.  (But beware of
3441                          * sub-SELECTs with DISTINCT/GROUP BY/etc.      Perhaps there are no
3442                          * cases where this would really be useful, because we'd have
3443                          * flattened the subselect if it is??)
3444                          */
3445                 }
3446
3447                 return;
3448         }
3449
3450         /*
3451          * Okay, it's a more complicated expression.  Determine variable
3452          * membership.  Note that when varRelid isn't zero, only vars of that
3453          * relation are considered "real" vars.
3454          */
3455         varnos = pull_varnos(basenode);
3456
3457         onerel = NULL;
3458
3459         switch (bms_membership(varnos))
3460         {
3461                 case BMS_EMPTY_SET:
3462                         /* No Vars at all ... must be pseudo-constant clause */
3463                         break;
3464                 case BMS_SINGLETON:
3465                         if (varRelid == 0 || bms_is_member(varRelid, varnos))
3466                         {
3467                                 onerel = find_base_rel(root,
3468                                            (varRelid ? varRelid : bms_singleton_member(varnos)));
3469                                 vardata->rel = onerel;
3470                                 node = basenode;        /* strip any relabeling */
3471                         }
3472                         /* else treat it as a constant */
3473                         break;
3474                 case BMS_MULTIPLE:
3475                         if (varRelid == 0)
3476                         {
3477                                 /* treat it as a variable of a join relation */
3478                                 vardata->rel = find_join_rel(root, varnos);
3479                                 node = basenode;        /* strip any relabeling */
3480                         }
3481                         else if (bms_is_member(varRelid, varnos))
3482                         {
3483                                 /* ignore the vars belonging to other relations */
3484                                 vardata->rel = find_base_rel(root, varRelid);
3485                                 node = basenode;        /* strip any relabeling */
3486                                 /* note: no point in expressional-index search here */
3487                         }
3488                         /* else treat it as a constant */
3489                         break;
3490         }
3491
3492         bms_free(varnos);
3493
3494         vardata->var = node;
3495         vardata->atttype = exprType(node);
3496         vardata->atttypmod = exprTypmod(node);
3497
3498         if (onerel)
3499         {
3500                 /*
3501                  * We have an expression in vars of a single relation.  Try to match
3502                  * it to expressional index columns, in hopes of finding some
3503                  * statistics.
3504                  *
3505                  * XXX it's conceivable that there are multiple matches with different
3506                  * index opfamilies; if so, we need to pick one that matches the
3507                  * operator we are estimating for.      FIXME later.
3508                  */
3509                 ListCell   *ilist;
3510
3511                 foreach(ilist, onerel->indexlist)
3512                 {
3513                         IndexOptInfo *index = (IndexOptInfo *) lfirst(ilist);
3514                         ListCell   *indexpr_item;
3515                         int                     pos;
3516
3517                         indexpr_item = list_head(index->indexprs);
3518                         if (indexpr_item == NULL)
3519                                 continue;               /* no expressions here... */
3520
3521                         /*
3522                          * Ignore partial indexes since they probably don't reflect
3523                          * whole-relation statistics.  Possibly reconsider this later.
3524                          */
3525                         if (index->indpred)
3526                                 continue;
3527
3528                         for (pos = 0; pos < index->ncolumns; pos++)
3529                         {
3530                                 if (index->indexkeys[pos] == 0)
3531                                 {
3532                                         Node       *indexkey;
3533
3534                                         if (indexpr_item == NULL)
3535                                                 elog(ERROR, "too few entries in indexprs list");
3536                                         indexkey = (Node *) lfirst(indexpr_item);
3537                                         if (indexkey && IsA(indexkey, RelabelType))
3538                                                 indexkey = (Node *) ((RelabelType *) indexkey)->arg;
3539                                         if (equal(node, indexkey))
3540                                         {
3541                                                 /*
3542                                                  * Found a match ... is it a unique index? Tests here
3543                                                  * should match has_unique_index().
3544                                                  */
3545                                                 if (index->unique &&
3546                                                         index->ncolumns == 1 &&
3547                                                         index->indpred == NIL)
3548                                                         vardata->isunique = true;
3549                                                 /* Has it got stats? */
3550                                                 vardata->statsTuple = SearchSysCache(STATRELATT,
3551                                                                                    ObjectIdGetDatum(index->indexoid),
3552                                                                                                           Int16GetDatum(pos + 1),
3553                                                                                                                          0, 0);
3554                                                 if (vardata->statsTuple)
3555                                                         break;
3556                                         }
3557                                         indexpr_item = lnext(indexpr_item);
3558                                 }
3559                         }
3560                         if (vardata->statsTuple)
3561                                 break;
3562                 }
3563         }
3564 }
3565
3566 /*
3567  * get_variable_numdistinct
3568  *        Estimate the number of distinct values of a variable.
3569  *
3570  * vardata: results of examine_variable
3571  *
3572  * NB: be careful to produce an integral result, since callers may compare
3573  * the result to exact integer counts.
3574  */
3575 double
3576 get_variable_numdistinct(VariableStatData *vardata)
3577 {
3578         double          stadistinct;
3579         double          ntuples;
3580
3581         /*
3582          * Determine the stadistinct value to use.      There are cases where we can
3583          * get an estimate even without a pg_statistic entry, or can get a better
3584          * value than is in pg_statistic.
3585          */
3586         if (HeapTupleIsValid(vardata->statsTuple))
3587         {
3588                 /* Use the pg_statistic entry */
3589                 Form_pg_statistic stats;
3590
3591                 stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
3592                 stadistinct = stats->stadistinct;
3593         }
3594         else if (vardata->vartype == BOOLOID)
3595         {
3596                 /*
3597                  * Special-case boolean columns: presumably, two distinct values.
3598                  *
3599                  * Are there any other datatypes we should wire in special estimates
3600                  * for?
3601                  */
3602                 stadistinct = 2.0;
3603         }
3604         else
3605         {
3606                 /*
3607                  * We don't keep statistics for system columns, but in some cases we
3608                  * can infer distinctness anyway.
3609                  */
3610                 if (vardata->var && IsA(vardata->var, Var))
3611                 {
3612                         switch (((Var *) vardata->var)->varattno)
3613                         {
3614                                 case ObjectIdAttributeNumber:
3615                                 case SelfItemPointerAttributeNumber:
3616                                         stadistinct = -1.0; /* unique */
3617                                         break;
3618                                 case TableOidAttributeNumber:
3619                                         stadistinct = 1.0;      /* only 1 value */
3620                                         break;
3621                                 default:
3622                                         stadistinct = 0.0;      /* means "unknown" */
3623                                         break;
3624                         }
3625                 }
3626                 else
3627                         stadistinct = 0.0;      /* means "unknown" */
3628
3629                 /*
3630                  * XXX consider using estimate_num_groups on expressions?
3631                  */
3632         }
3633
3634         /*
3635          * If there is a unique index for the variable, assume it is unique no
3636          * matter what pg_statistic says (the statistics could be out of date).
3637          * Can skip search if we already think it's unique.
3638          */
3639         if (stadistinct != -1.0)
3640         {
3641                 if (vardata->isunique)
3642                         stadistinct = -1.0;
3643                 else if (vardata->var && IsA(vardata->var, Var) &&
3644                                  vardata->rel &&
3645                                  has_unique_index(vardata->rel,
3646                                                                   ((Var *) vardata->var)->varattno))
3647                         stadistinct = -1.0;
3648         }
3649
3650         /*
3651          * If we had an absolute estimate, use that.
3652          */
3653         if (stadistinct > 0.0)
3654                 return stadistinct;
3655
3656         /*
3657          * Otherwise we need to get the relation size; punt if not available.
3658          */
3659         if (vardata->rel == NULL)
3660                 return DEFAULT_NUM_DISTINCT;
3661         ntuples = vardata->rel->tuples;
3662         if (ntuples <= 0.0)
3663                 return DEFAULT_NUM_DISTINCT;
3664
3665         /*
3666          * If we had a relative estimate, use that.
3667          */
3668         if (stadistinct < 0.0)
3669                 return floor((-stadistinct * ntuples) + 0.5);
3670
3671         /*
3672          * With no data, estimate ndistinct = ntuples if the table is small, else
3673          * use default.
3674          */
3675         if (ntuples < DEFAULT_NUM_DISTINCT)
3676                 return ntuples;
3677
3678         return DEFAULT_NUM_DISTINCT;
3679 }
3680
3681 /*
3682  * get_variable_maximum
3683  *              Estimate the maximum value of the specified variable.
3684  *              If successful, store value in *max and return TRUE.
3685  *              If no data available, return FALSE.
3686  *
3687  * sortop is the "<" comparison operator to use.  (To extract the
3688  * minimum instead of the maximum, just pass the ">" operator instead.)
3689  */
3690 static bool
3691 get_variable_maximum(PlannerInfo *root, VariableStatData *vardata,
3692                                          Oid sortop, Datum *max)
3693 {
3694         Datum           tmax = 0;
3695         bool            have_max = false;
3696         Form_pg_statistic stats;
3697         int16           typLen;
3698         bool            typByVal;
3699         Datum      *values;
3700         int                     nvalues;
3701         int                     i;
3702
3703         if (!HeapTupleIsValid(vardata->statsTuple))
3704         {
3705                 /* no stats available, so default result */
3706                 return false;
3707         }
3708         stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
3709
3710         get_typlenbyval(vardata->atttype, &typLen, &typByVal);
3711
3712         /*
3713          * If there is a histogram, grab the last or first value as appropriate.
3714          *
3715          * If there is a histogram that is sorted with some other operator than
3716          * the one we want, fail --- this suggests that there is data we can't
3717          * use.
3718          */
3719         if (get_attstatsslot(vardata->statsTuple,
3720                                                  vardata->atttype, vardata->atttypmod,
3721                                                  STATISTIC_KIND_HISTOGRAM, sortop,
3722                                                  &values, &nvalues,
3723                                                  NULL, NULL))
3724         {
3725                 if (nvalues > 0)
3726                 {
3727                         tmax = datumCopy(values[nvalues - 1], typByVal, typLen);
3728                         have_max = true;
3729                 }
3730                 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
3731         }
3732         else
3733         {
3734                 Oid                     rsortop = get_commutator(sortop);
3735
3736                 if (OidIsValid(rsortop) &&
3737                         get_attstatsslot(vardata->statsTuple,
3738                                                          vardata->atttype, vardata->atttypmod,
3739                                                          STATISTIC_KIND_HISTOGRAM, rsortop,
3740                                                          &values, &nvalues,
3741                                                          NULL, NULL))
3742                 {
3743                         if (nvalues > 0)
3744                         {
3745                                 tmax = datumCopy(values[0], typByVal, typLen);
3746                                 have_max = true;
3747                         }
3748                         free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
3749                 }
3750                 else if (get_attstatsslot(vardata->statsTuple,
3751                                                                   vardata->atttype, vardata->atttypmod,
3752                                                                   STATISTIC_KIND_HISTOGRAM, InvalidOid,
3753                                                                   &values, &nvalues,
3754                                                                   NULL, NULL))
3755                 {
3756                         free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
3757                         return false;
3758                 }
3759         }
3760
3761         /*
3762          * If we have most-common-values info, look for a large MCV.  This is
3763          * needed even if we also have a histogram, since the histogram excludes
3764          * the MCVs.  However, usually the MCVs will not be the extreme values, so
3765          * avoid unnecessary data copying.
3766          */
3767         if (get_attstatsslot(vardata->statsTuple,
3768                                                  vardata->atttype, vardata->atttypmod,
3769                                                  STATISTIC_KIND_MCV, InvalidOid,
3770                                                  &values, &nvalues,
3771                                                  NULL, NULL))
3772         {
3773                 bool            large_mcv = false;
3774                 FmgrInfo        opproc;
3775
3776                 fmgr_info(get_opcode(sortop), &opproc);
3777
3778                 for (i = 0; i < nvalues; i++)
3779                 {
3780                         if (!have_max)
3781                         {
3782                                 tmax = values[i];
3783                                 large_mcv = have_max = true;
3784                         }
3785                         else if (DatumGetBool(FunctionCall2(&opproc, tmax, values[i])))
3786                         {
3787                                 tmax = values[i];
3788                                 large_mcv = true;
3789                         }
3790                 }
3791                 if (large_mcv)
3792                         tmax = datumCopy(tmax, typByVal, typLen);
3793                 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
3794         }
3795
3796         *max = tmax;
3797         return have_max;
3798 }
3799
3800
3801 /*-------------------------------------------------------------------------
3802  *
3803  * Pattern analysis functions
3804  *
3805  * These routines support analysis of LIKE and regular-expression patterns
3806  * by the planner/optimizer.  It's important that they agree with the
3807  * regular-expression code in backend/regex/ and the LIKE code in
3808  * backend/utils/adt/like.c.  Also, the computation of the fixed prefix
3809  * must be conservative: if we report a string longer than the true fixed
3810  * prefix, the query may produce actually wrong answers, rather than just
3811  * getting a bad selectivity estimate!
3812  *
3813  * Note that the prefix-analysis functions are called from
3814  * backend/optimizer/path/indxpath.c as well as from routines in this file.
3815  *
3816  *-------------------------------------------------------------------------
3817  */
3818
3819 /*
3820  * Extract the fixed prefix, if any, for a pattern.
3821  *
3822  * *prefix is set to a palloc'd prefix string (in the form of a Const node),
3823  *      or to NULL if no fixed prefix exists for the pattern.
3824  * *rest is set to a palloc'd Const representing the remainder of the pattern
3825  *      after the portion describing the fixed prefix.
3826  * Each of these has the same type (TEXT or BYTEA) as the given pattern Const.
3827  *
3828  * The return value distinguishes no fixed prefix, a partial prefix,
3829  * or an exact-match-only pattern.
3830  */
3831
3832 static Pattern_Prefix_Status
3833 like_fixed_prefix(Const *patt_const, bool case_insensitive,
3834                                   Const **prefix_const, Const **rest_const)
3835 {
3836         char       *match;
3837         char       *patt;
3838         int                     pattlen;
3839         char       *rest;
3840         Oid                     typeid = patt_const->consttype;
3841         int                     pos,
3842                                 match_pos;
3843         bool            is_multibyte = (pg_database_encoding_max_length() > 1);
3844
3845         /* the right-hand const is type text or bytea */
3846         Assert(typeid == BYTEAOID || typeid == TEXTOID);
3847
3848         if (typeid == BYTEAOID && case_insensitive)
3849                 ereport(ERROR,
3850                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
3851                    errmsg("case insensitive matching not supported on type bytea")));
3852
3853         if (typeid != BYTEAOID)
3854         {
3855                 patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
3856                 pattlen = strlen(patt);
3857         }
3858         else
3859         {
3860                 bytea      *bstr = DatumGetByteaP(patt_const->constvalue);
3861
3862                 pattlen = VARSIZE(bstr) - VARHDRSZ;
3863                 patt = (char *) palloc(pattlen);
3864                 memcpy(patt, VARDATA(bstr), pattlen);
3865                 if ((Pointer) bstr != DatumGetPointer(patt_const->constvalue))
3866                         pfree(bstr);
3867         }
3868
3869         match = palloc(pattlen + 1);
3870         match_pos = 0;
3871         for (pos = 0; pos < pattlen; pos++)
3872         {
3873                 /* % and _ are wildcard characters in LIKE */
3874                 if (patt[pos] == '%' ||
3875                         patt[pos] == '_')
3876                         break;
3877
3878                 /* Backslash escapes the next character */
3879                 if (patt[pos] == '\\')
3880                 {
3881                         pos++;
3882                         if (pos >= pattlen)
3883                                 break;
3884                 }
3885
3886                 /*
3887                  * XXX In multibyte character sets, we can't trust isalpha, so assume
3888                  * any multibyte char is potentially case-varying.
3889                  */
3890                 if (case_insensitive)
3891                 {
3892                         if (is_multibyte && (unsigned char) patt[pos] >= 0x80)
3893                                 break;
3894                         if (isalpha((unsigned char) patt[pos]))
3895                                 break;
3896                 }
3897
3898                 /*
3899                  * NOTE: this code used to think that %% meant a literal %, but
3900                  * textlike() itself does not think that, and the SQL92 spec doesn't
3901                  * say any such thing either.
3902                  */
3903                 match[match_pos++] = patt[pos];
3904         }
3905
3906         match[match_pos] = '\0';
3907         rest = &patt[pos];
3908
3909         if (typeid != BYTEAOID)
3910         {
3911                 *prefix_const = string_to_const(match, typeid);
3912                 *rest_const = string_to_const(rest, typeid);
3913         }
3914         else
3915         {
3916                 *prefix_const = string_to_bytea_const(match, match_pos);
3917                 *rest_const = string_to_bytea_const(rest, pattlen - pos);
3918         }
3919
3920         pfree(patt);
3921         pfree(match);
3922
3923         /* in LIKE, an empty pattern is an exact match! */
3924         if (pos == pattlen)
3925                 return Pattern_Prefix_Exact;    /* reached end of pattern, so exact */
3926
3927         if (match_pos > 0)
3928                 return Pattern_Prefix_Partial;
3929
3930         return Pattern_Prefix_None;
3931 }
3932
3933 static Pattern_Prefix_Status
3934 regex_fixed_prefix(Const *patt_const, bool case_insensitive,
3935                                    Const **prefix_const, Const **rest_const)
3936 {
3937         char       *match;
3938         int                     pos,
3939                                 match_pos,
3940                                 prev_pos,
3941                                 prev_match_pos;
3942         bool            have_leading_paren;
3943         char       *patt;
3944         char       *rest;
3945         Oid                     typeid = patt_const->consttype;
3946         bool            is_basic = regex_flavor_is_basic();
3947         bool            is_multibyte = (pg_database_encoding_max_length() > 1);
3948
3949         /*
3950          * Should be unnecessary, there are no bytea regex operators defined. As
3951          * such, it should be noted that the rest of this function has *not* been
3952          * made safe for binary (possibly NULL containing) strings.
3953          */
3954         if (typeid == BYTEAOID)
3955                 ereport(ERROR,
3956                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
3957                  errmsg("regular-expression matching not supported on type bytea")));
3958
3959         /* the right-hand const is type text for all of these */
3960         patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
3961
3962         /*
3963          * Check for ARE director prefix.  It's worth our trouble to recognize
3964          * this because similar_escape() uses it.
3965          */
3966         pos = 0;
3967         if (strncmp(patt, "***:", 4) == 0)
3968         {
3969                 pos = 4;
3970                 is_basic = false;
3971         }
3972
3973         /* Pattern must be anchored left */
3974         if (patt[pos] != '^')
3975         {
3976                 rest = patt;
3977
3978                 *prefix_const = NULL;
3979                 *rest_const = string_to_const(rest, typeid);
3980
3981                 return Pattern_Prefix_None;
3982         }
3983         pos++;
3984
3985         /*
3986          * If '|' is present in pattern, then there may be multiple alternatives
3987          * for the start of the string.  (There are cases where this isn't so,
3988          * for instance if the '|' is inside parens, but detecting that reliably
3989          * is too hard.)
3990          */
3991         if (strchr(patt + pos, '|') != NULL)
3992         {
3993                 rest = patt;
3994
3995                 *prefix_const = NULL;
3996                 *rest_const = string_to_const(rest, typeid);
3997
3998                 return Pattern_Prefix_None;
3999         }
4000
4001         /* OK, allocate space for pattern */
4002         match = palloc(strlen(patt) + 1);
4003         prev_match_pos = match_pos = 0;
4004
4005         /*
4006          * We special-case the syntax '^(...)$' because psql uses it.  But beware:
4007          * in BRE mode these parentheses are just ordinary characters.  Also,
4008          * sequences beginning "(?" are not what they seem, unless they're "(?:".
4009          * (We should recognize that, too, because of similar_escape().)
4010          *
4011          * Note: it's a bit bogus to be depending on the current regex_flavor
4012          * setting here, because the setting could change before the pattern is
4013          * used.  We minimize the risk by trusting the flavor as little as we can,
4014          * but perhaps it would be a good idea to get rid of the "basic" setting.
4015          */
4016         have_leading_paren = false;
4017         if (patt[pos] == '(' && !is_basic &&
4018                 (patt[pos + 1] != '?' || patt[pos + 2] == ':'))
4019         {
4020                 have_leading_paren = true;
4021                 pos += (patt[pos + 1] != '?' ? 1 : 3);
4022         }
4023
4024         /* Scan remainder of pattern */
4025         prev_pos = pos;
4026         while (patt[pos])
4027         {
4028                 int                     len;
4029
4030                 /*
4031                  * Check for characters that indicate multiple possible matches here.
4032                  * Also, drop out at ')' or '$' so the termination test works right.
4033                  */
4034                 if (patt[pos] == '.' ||
4035                         patt[pos] == '(' ||
4036                         patt[pos] == ')' ||
4037                         patt[pos] == '[' ||
4038                         patt[pos] == '^' ||
4039                         patt[pos] == '$')
4040                         break;
4041
4042                 /*
4043                  * XXX In multibyte character sets, we can't trust isalpha, so assume
4044                  * any multibyte char is potentially case-varying.
4045                  */
4046                 if (case_insensitive)
4047                 {
4048                         if (is_multibyte && (unsigned char) patt[pos] >= 0x80)
4049                                 break;
4050                         if (isalpha((unsigned char) patt[pos]))
4051                                 break;
4052                 }
4053
4054                 /*
4055                  * Check for quantifiers.  Except for +, this means the preceding
4056                  * character is optional, so we must remove it from the prefix too!
4057                  * Note: in BREs, \{ is a quantifier.
4058                  */
4059                 if (patt[pos] == '*' ||
4060                         patt[pos] == '?' ||
4061                         patt[pos] == '{' ||
4062                         (patt[pos] == '\\' && patt[pos + 1] == '{'))
4063                 {
4064                         match_pos = prev_match_pos;
4065                         pos = prev_pos;
4066                         break;
4067                 }
4068                 if (patt[pos] == '+')
4069                 {
4070                         pos = prev_pos;
4071                         break;
4072                 }
4073
4074                 /*
4075                  * Normally, backslash quotes the next character.  But in AREs,
4076                  * backslash followed by alphanumeric is an escape, not a quoted
4077                  * character.  Must treat it as having multiple possible matches.
4078                  * In BREs, \( is a parenthesis, so don't trust that either.
4079                  * Note: since only ASCII alphanumerics are escapes, we don't have
4080                  * to be paranoid about multibyte here.
4081                  */
4082                 if (patt[pos] == '\\')
4083                 {
4084                         if (isalnum((unsigned char) patt[pos + 1]) || patt[pos + 1] == '(')
4085                                 break;
4086                         pos++;
4087                         if (patt[pos] == '\0')
4088                                 break;
4089                 }
4090                 /* save position in case we need to back up on next loop cycle */
4091                 prev_match_pos = match_pos;
4092                 prev_pos = pos;
4093                 /* must use encoding-aware processing here */
4094                 len = pg_mblen(&patt[pos]);
4095                 memcpy(&match[match_pos], &patt[pos], len);
4096                 match_pos += len;
4097                 pos += len;
4098         }
4099
4100         match[match_pos] = '\0';
4101         rest = &patt[pos];
4102
4103         if (have_leading_paren && patt[pos] == ')')
4104                 pos++;
4105
4106         if (patt[pos] == '$' && patt[pos + 1] == '\0')
4107         {
4108                 rest = &patt[pos + 1];
4109
4110                 *prefix_const = string_to_const(match, typeid);
4111                 *rest_const = string_to_const(rest, typeid);
4112
4113                 pfree(patt);
4114                 pfree(match);
4115
4116                 return Pattern_Prefix_Exact;    /* pattern specifies exact match */
4117         }
4118
4119         *prefix_const = string_to_const(match, typeid);
4120         *rest_const = string_to_const(rest, typeid);
4121
4122         pfree(patt);
4123         pfree(match);
4124
4125         if (match_pos > 0)
4126                 return Pattern_Prefix_Partial;
4127
4128         return Pattern_Prefix_None;
4129 }
4130
4131 Pattern_Prefix_Status
4132 pattern_fixed_prefix(Const *patt, Pattern_Type ptype,
4133                                          Const **prefix, Const **rest)
4134 {
4135         Pattern_Prefix_Status result;
4136
4137         switch (ptype)
4138         {
4139                 case Pattern_Type_Like:
4140                         result = like_fixed_prefix(patt, false, prefix, rest);
4141                         break;
4142                 case Pattern_Type_Like_IC:
4143                         result = like_fixed_prefix(patt, true, prefix, rest);
4144                         break;
4145                 case Pattern_Type_Regex:
4146                         result = regex_fixed_prefix(patt, false, prefix, rest);
4147                         break;
4148                 case Pattern_Type_Regex_IC:
4149                         result = regex_fixed_prefix(patt, true, prefix, rest);
4150                         break;
4151                 default:
4152                         elog(ERROR, "unrecognized ptype: %d", (int) ptype);
4153                         result = Pattern_Prefix_None;           /* keep compiler quiet */
4154                         break;
4155         }
4156         return result;
4157 }
4158
4159 /*
4160  * Estimate the selectivity of a fixed prefix for a pattern match.
4161  *
4162  * A fixed prefix "foo" is estimated as the selectivity of the expression
4163  * "variable >= 'foo' AND variable < 'fop'" (see also indxpath.c).
4164  *
4165  * The selectivity estimate is with respect to the portion of the column
4166  * population represented by the histogram --- the caller must fold this
4167  * together with info about MCVs and NULLs.
4168  *
4169  * We use the >= and < operators from the specified btree opfamily to do the
4170  * estimation.  The given variable and Const must be of the associated
4171  * datatype.
4172  *
4173  * XXX Note: we make use of the upper bound to estimate operator selectivity
4174  * even if the locale is such that we cannot rely on the upper-bound string.
4175  * The selectivity only needs to be approximately right anyway, so it seems
4176  * more useful to use the upper-bound code than not.
4177  */
4178 static Selectivity
4179 prefix_selectivity(VariableStatData *vardata,
4180                                    Oid vartype, Oid opfamily, Const *prefixcon)
4181 {
4182         Selectivity prefixsel;
4183         Oid                     cmpopr;
4184         FmgrInfo        opproc;
4185         Const      *greaterstrcon;
4186
4187         cmpopr = get_opfamily_member(opfamily, vartype, vartype,
4188                                                                  BTGreaterEqualStrategyNumber);
4189         if (cmpopr == InvalidOid)
4190                 elog(ERROR, "no >= operator for opfamily %u", opfamily);
4191         fmgr_info(get_opcode(cmpopr), &opproc);
4192
4193         prefixsel = ineq_histogram_selectivity(vardata, &opproc, true,
4194                                                                                    prefixcon->constvalue,
4195                                                                                    prefixcon->consttype);
4196
4197         if (prefixsel <= 0.0)
4198         {
4199                 /* No histogram is present ... return a suitable default estimate */
4200                 return 0.005;
4201         }
4202
4203         /*-------
4204          * If we can create a string larger than the prefix, say
4205          *      "x < greaterstr".
4206          *-------
4207          */
4208         greaterstrcon = make_greater_string(prefixcon);
4209         if (greaterstrcon)
4210         {
4211                 Selectivity topsel;
4212
4213                 cmpopr = get_opfamily_member(opfamily, vartype, vartype,
4214                                                                          BTLessStrategyNumber);
4215                 if (cmpopr == InvalidOid)
4216                         elog(ERROR, "no < operator for opfamily %u", opfamily);
4217                 fmgr_info(get_opcode(cmpopr), &opproc);
4218
4219                 topsel = ineq_histogram_selectivity(vardata, &opproc, false,
4220                                                                                         greaterstrcon->constvalue,
4221                                                                                         greaterstrcon->consttype);
4222
4223                 /* ineq_histogram_selectivity worked before, it shouldn't fail now */
4224                 Assert(topsel > 0.0);
4225
4226                 /*
4227                  * Merge the two selectivities in the same way as for a range query
4228                  * (see clauselist_selectivity()).      Note that we don't need to worry
4229                  * about double-exclusion of nulls, since ineq_histogram_selectivity
4230                  * doesn't count those anyway.
4231                  */
4232                 prefixsel = topsel + prefixsel - 1.0;
4233
4234                 /*
4235                  * A zero or negative prefixsel should be converted into a small
4236                  * positive value; we probably are dealing with a very tight range and
4237                  * got a bogus result due to roundoff errors.
4238                  */
4239                 if (prefixsel <= 0.0)
4240                         prefixsel = 1.0e-10;
4241         }
4242
4243         return prefixsel;
4244 }
4245
4246
4247 /*
4248  * Estimate the selectivity of a pattern of the specified type.
4249  * Note that any fixed prefix of the pattern will have been removed already.
4250  *
4251  * For now, we use a very simplistic approach: fixed characters reduce the
4252  * selectivity a good deal, character ranges reduce it a little,
4253  * wildcards (such as % for LIKE or .* for regex) increase it.
4254  */
4255
4256 #define FIXED_CHAR_SEL  0.20    /* about 1/5 */
4257 #define CHAR_RANGE_SEL  0.25
4258 #define ANY_CHAR_SEL    0.9             /* not 1, since it won't match end-of-string */
4259 #define FULL_WILDCARD_SEL 5.0
4260 #define PARTIAL_WILDCARD_SEL 2.0
4261
4262 static Selectivity
4263 like_selectivity(Const *patt_const, bool case_insensitive)
4264 {
4265         Selectivity sel = 1.0;
4266         int                     pos;
4267         Oid                     typeid = patt_const->consttype;
4268         char       *patt;
4269         int                     pattlen;
4270
4271         /* the right-hand const is type text or bytea */
4272         Assert(typeid == BYTEAOID || typeid == TEXTOID);
4273
4274         if (typeid == BYTEAOID && case_insensitive)
4275                 ereport(ERROR,
4276                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
4277                    errmsg("case insensitive matching not supported on type bytea")));
4278
4279         if (typeid != BYTEAOID)
4280         {
4281                 patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
4282                 pattlen = strlen(patt);
4283         }
4284         else
4285         {
4286                 bytea      *bstr = DatumGetByteaP(patt_const->constvalue);
4287
4288                 pattlen = VARSIZE(bstr) - VARHDRSZ;
4289                 patt = (char *) palloc(pattlen);
4290                 memcpy(patt, VARDATA(bstr), pattlen);
4291                 if ((Pointer) bstr != DatumGetPointer(patt_const->constvalue))
4292                         pfree(bstr);
4293         }
4294
4295         /* Skip any leading wildcard; it's already factored into initial sel */
4296         for (pos = 0; pos < pattlen; pos++)
4297         {
4298                 if (patt[pos] != '%' && patt[pos] != '_')
4299                         break;
4300         }
4301
4302         for (; pos < pattlen; pos++)
4303         {
4304                 /* % and _ are wildcard characters in LIKE */
4305                 if (patt[pos] == '%')
4306                         sel *= FULL_WILDCARD_SEL;
4307                 else if (patt[pos] == '_')
4308                         sel *= ANY_CHAR_SEL;
4309                 else if (patt[pos] == '\\')
4310                 {
4311                         /* Backslash quotes the next character */
4312                         pos++;
4313                         if (pos >= pattlen)
4314                                 break;
4315                         sel *= FIXED_CHAR_SEL;
4316                 }
4317                 else
4318                         sel *= FIXED_CHAR_SEL;
4319         }
4320         /* Could get sel > 1 if multiple wildcards */
4321         if (sel > 1.0)
4322                 sel = 1.0;
4323
4324         pfree(patt);
4325         return sel;
4326 }
4327
4328 static Selectivity
4329 regex_selectivity_sub(char *patt, int pattlen, bool case_insensitive)
4330 {
4331         Selectivity sel = 1.0;
4332         int                     paren_depth = 0;
4333         int                     paren_pos = 0;  /* dummy init to keep compiler quiet */
4334         int                     pos;
4335
4336         for (pos = 0; pos < pattlen; pos++)
4337         {
4338                 if (patt[pos] == '(')
4339                 {
4340                         if (paren_depth == 0)
4341                                 paren_pos = pos;        /* remember start of parenthesized item */
4342                         paren_depth++;
4343                 }
4344                 else if (patt[pos] == ')' && paren_depth > 0)
4345                 {
4346                         paren_depth--;
4347                         if (paren_depth == 0)
4348                                 sel *= regex_selectivity_sub(patt + (paren_pos + 1),
4349                                                                                          pos - (paren_pos + 1),
4350                                                                                          case_insensitive);
4351                 }
4352                 else if (patt[pos] == '|' && paren_depth == 0)
4353                 {
4354                         /*
4355                          * If unquoted | is present at paren level 0 in pattern, we have
4356                          * multiple alternatives; sum their probabilities.
4357                          */
4358                         sel += regex_selectivity_sub(patt + (pos + 1),
4359                                                                                  pattlen - (pos + 1),
4360                                                                                  case_insensitive);
4361                         break;                          /* rest of pattern is now processed */
4362                 }
4363                 else if (patt[pos] == '[')
4364                 {
4365                         bool            negclass = false;
4366
4367                         if (patt[++pos] == '^')
4368                         {
4369                                 negclass = true;
4370                                 pos++;
4371                         }
4372                         if (patt[pos] == ']')           /* ']' at start of class is not
4373                                                                                  * special */
4374                                 pos++;
4375                         while (pos < pattlen && patt[pos] != ']')
4376                                 pos++;
4377                         if (paren_depth == 0)
4378                                 sel *= (negclass ? (1.0 - CHAR_RANGE_SEL) : CHAR_RANGE_SEL);
4379                 }
4380                 else if (patt[pos] == '.')
4381                 {
4382                         if (paren_depth == 0)
4383                                 sel *= ANY_CHAR_SEL;
4384                 }
4385                 else if (patt[pos] == '*' ||
4386                                  patt[pos] == '?' ||
4387                                  patt[pos] == '+')
4388                 {
4389                         /* Ought to be smarter about quantifiers... */
4390                         if (paren_depth == 0)
4391                                 sel *= PARTIAL_WILDCARD_SEL;
4392                 }
4393                 else if (patt[pos] == '{')
4394                 {
4395                         while (pos < pattlen && patt[pos] != '}')
4396                                 pos++;
4397                         if (paren_depth == 0)
4398                                 sel *= PARTIAL_WILDCARD_SEL;
4399                 }
4400                 else if (patt[pos] == '\\')
4401                 {
4402                         /* backslash quotes the next character */
4403                         pos++;
4404                         if (pos >= pattlen)
4405                                 break;
4406                         if (paren_depth == 0)
4407                                 sel *= FIXED_CHAR_SEL;
4408                 }
4409                 else
4410                 {
4411                         if (paren_depth == 0)
4412                                 sel *= FIXED_CHAR_SEL;
4413                 }
4414         }
4415         /* Could get sel > 1 if multiple wildcards */
4416         if (sel > 1.0)
4417                 sel = 1.0;
4418         return sel;
4419 }
4420
4421 static Selectivity
4422 regex_selectivity(Const *patt_const, bool case_insensitive)
4423 {
4424         Selectivity sel;
4425         char       *patt;
4426         int                     pattlen;
4427         Oid                     typeid = patt_const->consttype;
4428
4429         /*
4430          * Should be unnecessary, there are no bytea regex operators defined. As
4431          * such, it should be noted that the rest of this function has *not* been
4432          * made safe for binary (possibly NULL containing) strings.
4433          */
4434         if (typeid == BYTEAOID)
4435                 ereport(ERROR,
4436                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
4437                  errmsg("regular-expression matching not supported on type bytea")));
4438
4439         /* the right-hand const is type text for all of these */
4440         patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
4441         pattlen = strlen(patt);
4442
4443         /* If patt doesn't end with $, consider it to have a trailing wildcard */
4444         if (pattlen > 0 && patt[pattlen - 1] == '$' &&
4445                 (pattlen == 1 || patt[pattlen - 2] != '\\'))
4446         {
4447                 /* has trailing $ */
4448                 sel = regex_selectivity_sub(patt, pattlen - 1, case_insensitive);
4449         }
4450         else
4451         {
4452                 /* no trailing $ */
4453                 sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
4454                 sel *= FULL_WILDCARD_SEL;
4455                 if (sel > 1.0)
4456                         sel = 1.0;
4457         }
4458         return sel;
4459 }
4460
4461 static Selectivity
4462 pattern_selectivity(Const *patt, Pattern_Type ptype)
4463 {
4464         Selectivity result;
4465
4466         switch (ptype)
4467         {
4468                 case Pattern_Type_Like:
4469                         result = like_selectivity(patt, false);
4470                         break;
4471                 case Pattern_Type_Like_IC:
4472                         result = like_selectivity(patt, true);
4473                         break;
4474                 case Pattern_Type_Regex:
4475                         result = regex_selectivity(patt, false);
4476                         break;
4477                 case Pattern_Type_Regex_IC:
4478                         result = regex_selectivity(patt, true);
4479                         break;
4480                 default:
4481                         elog(ERROR, "unrecognized ptype: %d", (int) ptype);
4482                         result = 1.0;           /* keep compiler quiet */
4483                         break;
4484         }
4485         return result;
4486 }
4487
4488
4489 /*
4490  * Try to generate a string greater than the given string or any
4491  * string it is a prefix of.  If successful, return a palloc'd string
4492  * in the form of a Const pointer; else return NULL.
4493  *
4494  * The key requirement here is that given a prefix string, say "foo",
4495  * we must be able to generate another string "fop" that is greater
4496  * than all strings "foobar" starting with "foo".
4497  *
4498  * If we max out the righthand byte, truncate off the last character
4499  * and start incrementing the next.  For example, if "z" were the last
4500  * character in the sort order, then we could produce "foo" as a
4501  * string greater than "fonz".
4502  *
4503  * This could be rather slow in the worst case, but in most cases we
4504  * won't have to try more than one or two strings before succeeding.
4505  *
4506  * NOTE: at present this assumes we are in the C locale, so that simple
4507  * bytewise comparison applies.  However, we might be in a multibyte
4508  * encoding such as UTF8, so we do have to watch out for generating
4509  * invalid encoding sequences.
4510  */
4511 Const *
4512 make_greater_string(const Const *str_const)
4513 {
4514         Oid                     datatype = str_const->consttype;
4515         char       *workstr;
4516         int                     len;
4517
4518         /* Get the string and a modifiable copy */
4519         if (datatype == NAMEOID)
4520         {
4521                 workstr = DatumGetCString(DirectFunctionCall1(nameout,
4522                                                                                                           str_const->constvalue));
4523                 len = strlen(workstr);
4524         }
4525         else if (datatype == BYTEAOID)
4526         {
4527                 bytea      *bstr = DatumGetByteaP(str_const->constvalue);
4528
4529                 len = VARSIZE(bstr) - VARHDRSZ;
4530                 workstr = (char *) palloc(len);
4531                 memcpy(workstr, VARDATA(bstr), len);
4532                 if ((Pointer) bstr != DatumGetPointer(str_const->constvalue))
4533                         pfree(bstr);
4534         }
4535         else
4536         {
4537                 workstr = DatumGetCString(DirectFunctionCall1(textout,
4538                                                                                                           str_const->constvalue));
4539                 len = strlen(workstr);
4540         }
4541
4542         while (len > 0)
4543         {
4544                 unsigned char *lastchar = (unsigned char *) (workstr + len - 1);
4545                 unsigned char savelastchar = *lastchar;
4546
4547                 /*
4548                  * Try to generate a larger string by incrementing the last byte.
4549                  */
4550                 while (*lastchar < (unsigned char) 255)
4551                 {
4552                         Const      *workstr_const;
4553
4554                         (*lastchar)++;
4555
4556                         if (datatype != BYTEAOID)
4557                         {
4558                                 /* do not generate invalid encoding sequences */
4559                                 if (!pg_verifymbstr(workstr, len, true))
4560                                         continue;
4561                                 workstr_const = string_to_const(workstr, datatype);
4562                         }
4563                         else
4564                                 workstr_const = string_to_bytea_const(workstr, len);
4565
4566                         pfree(workstr);
4567                         return workstr_const;
4568                 }
4569
4570                 /* restore last byte so we don't confuse pg_mbcliplen */
4571                 *lastchar = savelastchar;
4572
4573                 /*
4574                  * Truncate off the last character, which might be more than 1 byte,
4575                  * depending on the character encoding.
4576                  */
4577                 if (datatype != BYTEAOID && pg_database_encoding_max_length() > 1)
4578                         len = pg_mbcliplen(workstr, len, len - 1);
4579                 else
4580                         len -= 1;
4581
4582                 if (datatype != BYTEAOID)
4583                         workstr[len] = '\0';
4584         }
4585
4586         /* Failed... */
4587         pfree(workstr);
4588
4589         return NULL;
4590 }
4591
4592 /*
4593  * Generate a Datum of the appropriate type from a C string.
4594  * Note that all of the supported types are pass-by-ref, so the
4595  * returned value should be pfree'd if no longer needed.
4596  */
4597 static Datum
4598 string_to_datum(const char *str, Oid datatype)
4599 {
4600         Assert(str != NULL);
4601
4602         /*
4603          * We cheat a little by assuming that textin() will do for bpchar and
4604          * varchar constants too...
4605          */
4606         if (datatype == NAMEOID)
4607                 return DirectFunctionCall1(namein, CStringGetDatum(str));
4608         else if (datatype == BYTEAOID)
4609                 return DirectFunctionCall1(byteain, CStringGetDatum(str));
4610         else
4611                 return DirectFunctionCall1(textin, CStringGetDatum(str));
4612 }
4613
4614 /*
4615  * Generate a Const node of the appropriate type from a C string.
4616  */
4617 static Const *
4618 string_to_const(const char *str, Oid datatype)
4619 {
4620         Datum           conval = string_to_datum(str, datatype);
4621
4622         return makeConst(datatype, ((datatype == NAMEOID) ? NAMEDATALEN : -1),
4623                                          conval, false, false);
4624 }
4625
4626 /*
4627  * Generate a Const node of bytea type from a binary C string and a length.
4628  */
4629 static Const *
4630 string_to_bytea_const(const char *str, size_t str_len)
4631 {
4632         bytea      *bstr = palloc(VARHDRSZ + str_len);
4633         Datum           conval;
4634
4635         memcpy(VARDATA(bstr), str, str_len);
4636         VARATT_SIZEP(bstr) = VARHDRSZ + str_len;
4637         conval = PointerGetDatum(bstr);
4638
4639         return makeConst(BYTEAOID, -1, conval, false, false);
4640 }
4641
4642 /*-------------------------------------------------------------------------
4643  *
4644  * Index cost estimation functions
4645  *
4646  * genericcostestimate is a general-purpose estimator for use when we
4647  * don't have any better idea about how to estimate.  Index-type-specific
4648  * knowledge can be incorporated in the type-specific routines.
4649  *
4650  * One bit of index-type-specific knowledge we can relatively easily use
4651  * in genericcostestimate is the estimate of the number of index tuples
4652  * visited.  If numIndexTuples is not 0 then it is used as the estimate,
4653  * otherwise we compute a generic estimate.
4654  *
4655  *-------------------------------------------------------------------------
4656  */
4657
4658 static void
4659 genericcostestimate(PlannerInfo *root,
4660                                         IndexOptInfo *index, List *indexQuals,
4661                                         RelOptInfo *outer_rel,
4662                                         double numIndexTuples,
4663                                         Cost *indexStartupCost,
4664                                         Cost *indexTotalCost,
4665                                         Selectivity *indexSelectivity,
4666                                         double *indexCorrelation)
4667 {
4668         double          numIndexPages;
4669         double          num_sa_scans;
4670         double          num_outer_scans;
4671         double          num_scans;
4672         QualCost        index_qual_cost;
4673         double          qual_op_cost;
4674         double          qual_arg_cost;
4675         List       *selectivityQuals;
4676         ListCell   *l;
4677
4678         /*
4679          * If the index is partial, AND the index predicate with the explicitly
4680          * given indexquals to produce a more accurate idea of the index
4681          * selectivity.  This may produce redundant clauses.  We get rid of exact
4682          * duplicates in the code below.  We expect that most cases of partial
4683          * redundancy (such as "x < 4" from the qual and "x < 5" from the
4684          * predicate) will be recognized and handled correctly by
4685          * clauselist_selectivity().  This assumption is somewhat fragile, since
4686          * it depends on predicate_implied_by() and clauselist_selectivity()
4687          * having similar capabilities, and there are certainly many cases where
4688          * we will end up with a too-low selectivity estimate.  This will bias the
4689          * system in favor of using partial indexes where possible, which is not
4690          * necessarily a bad thing. But it'd be nice to do better someday.
4691          *
4692          * Note that index->indpred and indexQuals are both in implicit-AND form,
4693          * so ANDing them together just takes merging the lists.  However,
4694          * eliminating duplicates is a bit trickier because indexQuals contains
4695          * RestrictInfo nodes and the indpred does not.  It is okay to pass a
4696          * mixed list to clauselist_selectivity, but we have to work a bit to
4697          * generate a list without logical duplicates.  (We could just list_union
4698          * indpred and strippedQuals, but then we'd not get caching of per-qual
4699          * selectivity estimates.)
4700          */
4701         if (index->indpred != NIL)
4702         {
4703                 List       *strippedQuals;
4704                 List       *predExtraQuals;
4705
4706                 strippedQuals = get_actual_clauses(indexQuals);
4707                 predExtraQuals = list_difference(index->indpred, strippedQuals);
4708                 selectivityQuals = list_concat(predExtraQuals, indexQuals);
4709         }
4710         else
4711                 selectivityQuals = indexQuals;
4712
4713         /*
4714          * Check for ScalarArrayOpExpr index quals, and estimate the number of
4715          * index scans that will be performed.
4716          */
4717         num_sa_scans = 1;
4718         foreach(l, indexQuals)
4719         {
4720                 RestrictInfo *rinfo = (RestrictInfo *) lfirst(l);
4721
4722                 if (IsA(rinfo->clause, ScalarArrayOpExpr))
4723                 {
4724                         ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) rinfo->clause;
4725                         int                     alength = estimate_array_length(lsecond(saop->args));
4726
4727                         if (alength > 1)
4728                                 num_sa_scans *= alength;
4729                 }
4730         }
4731
4732         /* Estimate the fraction of main-table tuples that will be visited */
4733         *indexSelectivity = clauselist_selectivity(root, selectivityQuals,
4734                                                                                            index->rel->relid,
4735                                                                                            JOIN_INNER);
4736
4737         /*
4738          * If caller didn't give us an estimate, estimate the number of index
4739          * tuples that will be visited.  We do it in this rather peculiar-looking
4740          * way in order to get the right answer for partial indexes.
4741          */
4742         if (numIndexTuples <= 0.0)
4743         {
4744                 numIndexTuples = *indexSelectivity * index->rel->tuples;
4745
4746                 /*
4747                  * The above calculation counts all the tuples visited across all
4748                  * scans induced by ScalarArrayOpExpr nodes.  We want to consider the
4749                  * average per-indexscan number, so adjust.  This is a handy place to
4750                  * round to integer, too.  (If caller supplied tuple estimate, it's
4751                  * responsible for handling these considerations.)
4752                  */
4753                 numIndexTuples = rint(numIndexTuples / num_sa_scans);
4754         }
4755
4756         /*
4757          * We can bound the number of tuples by the index size in any case. Also,
4758          * always estimate at least one tuple is touched, even when
4759          * indexSelectivity estimate is tiny.
4760          */
4761         if (numIndexTuples > index->tuples)
4762                 numIndexTuples = index->tuples;
4763         if (numIndexTuples < 1.0)
4764                 numIndexTuples = 1.0;
4765
4766         /*
4767          * Estimate the number of index pages that will be retrieved.
4768          *
4769          * We use the simplistic method of taking a pro-rata fraction of the total
4770          * number of index pages.  In effect, this counts only leaf pages and not
4771          * any overhead such as index metapage or upper tree levels. In practice
4772          * this seems a better approximation than charging for access to the upper
4773          * levels, perhaps because those tend to stay in cache under load.
4774          */
4775         if (index->pages > 1 && index->tuples > 1)
4776                 numIndexPages = ceil(numIndexTuples * index->pages / index->tuples);
4777         else
4778                 numIndexPages = 1.0;
4779
4780         /*
4781          * Now compute the disk access costs.
4782          *
4783          * The above calculations are all per-index-scan.  However, if we are in a
4784          * nestloop inner scan, we can expect the scan to be repeated (with
4785          * different search keys) for each row of the outer relation.  Likewise,
4786          * ScalarArrayOpExpr quals result in multiple index scans.      This creates
4787          * the potential for cache effects to reduce the number of disk page
4788          * fetches needed.      We want to estimate the average per-scan I/O cost in
4789          * the presence of caching.
4790          *
4791          * We use the Mackert-Lohman formula (see costsize.c for details) to
4792          * estimate the total number of page fetches that occur.  While this
4793          * wasn't what it was designed for, it seems a reasonable model anyway.
4794          * Note that we are counting pages not tuples anymore, so we take N = T =
4795          * index size, as if there were one "tuple" per page.
4796          */
4797         if (outer_rel != NULL && outer_rel->rows > 1)
4798         {
4799                 num_outer_scans = outer_rel->rows;
4800                 num_scans = num_sa_scans * num_outer_scans;
4801         }
4802         else
4803         {
4804                 num_outer_scans = 1;
4805                 num_scans = num_sa_scans;
4806         }
4807
4808         if (num_scans > 1)
4809         {
4810                 double          pages_fetched;
4811
4812                 /* total page fetches ignoring cache effects */
4813                 pages_fetched = numIndexPages * num_scans;
4814
4815                 /* use Mackert and Lohman formula to adjust for cache effects */
4816                 pages_fetched = index_pages_fetched(pages_fetched,
4817                                                                                         index->pages,
4818                                                                                         (double) index->pages,
4819                                                                                         root);
4820
4821                 /*
4822                  * Now compute the total disk access cost, and then report a pro-rated
4823                  * share for each outer scan.  (Don't pro-rate for ScalarArrayOpExpr,
4824                  * since that's internal to the indexscan.)
4825                  */
4826                 *indexTotalCost = (pages_fetched * random_page_cost) / num_outer_scans;
4827         }
4828         else
4829         {
4830                 /*
4831                  * For a single index scan, we just charge random_page_cost per page
4832                  * touched.
4833                  */
4834                 *indexTotalCost = numIndexPages * random_page_cost;
4835         }
4836
4837         /*
4838          * A difficulty with the leaf-pages-only cost approach is that for small
4839          * selectivities (eg, single index tuple fetched) all indexes will look
4840          * equally attractive because we will estimate exactly 1 leaf page to be
4841          * fetched.  All else being equal, we should prefer physically smaller
4842          * indexes over larger ones.  (An index might be smaller because it is
4843          * partial or because it contains fewer columns; presumably the other
4844          * columns in the larger index aren't useful to the query, or the larger
4845          * index would have better selectivity.)
4846          *
4847          * We can deal with this by adding a very small "fudge factor" that
4848          * depends on the index size.  The fudge factor used here is one
4849          * random_page_cost per 100000 index pages, which should be small enough
4850          * to not alter index-vs-seqscan decisions, but will prevent indexes of
4851          * different sizes from looking exactly equally attractive.
4852          */
4853         *indexTotalCost += index->pages * random_page_cost / 100000.0;
4854
4855         /*
4856          * CPU cost: any complex expressions in the indexquals will need to be
4857          * evaluated once at the start of the scan to reduce them to runtime keys
4858          * to pass to the index AM (see nodeIndexscan.c).  We model the per-tuple
4859          * CPU costs as cpu_index_tuple_cost plus one cpu_operator_cost per
4860          * indexqual operator.  Because we have numIndexTuples as a per-scan
4861          * number, we have to multiply by num_sa_scans to get the correct result
4862          * for ScalarArrayOpExpr cases.
4863          *
4864          * Note: this neglects the possible costs of rechecking lossy operators
4865          * and OR-clause expressions.  Detecting that that might be needed seems
4866          * more expensive than it's worth, though, considering all the other
4867          * inaccuracies here ...
4868          */
4869         cost_qual_eval(&index_qual_cost, indexQuals);
4870         qual_op_cost = cpu_operator_cost * list_length(indexQuals);
4871         qual_arg_cost = index_qual_cost.startup +
4872                 index_qual_cost.per_tuple - qual_op_cost;
4873         if (qual_arg_cost < 0)          /* just in case... */
4874                 qual_arg_cost = 0;
4875         *indexStartupCost = qual_arg_cost;
4876         *indexTotalCost += qual_arg_cost;
4877         *indexTotalCost += numIndexTuples * num_sa_scans * (cpu_index_tuple_cost + qual_op_cost);
4878
4879         /*
4880          * We also add a CPU-cost component to represent the general costs of
4881          * starting an indexscan, such as analysis of btree index keys and
4882          * initial tree descent.  This is estimated at 100x cpu_operator_cost,
4883          * which is a bit arbitrary but seems the right order of magnitude.
4884          * (As noted above, we don't charge any I/O for touching upper tree
4885          * levels, but charging nothing at all has been found too optimistic.)
4886          *
4887          * Although this is startup cost with respect to any one scan, we add
4888          * it to the "total" cost component because it's only very interesting
4889          * in the many-ScalarArrayOpExpr-scan case, and there it will be paid
4890          * over the life of the scan node.
4891          */
4892         *indexTotalCost += num_sa_scans * 100.0 * cpu_operator_cost;
4893
4894         /*
4895          * Generic assumption about index correlation: there isn't any.
4896          */
4897         *indexCorrelation = 0.0;
4898 }
4899
4900
4901 Datum
4902 btcostestimate(PG_FUNCTION_ARGS)
4903 {
4904         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
4905         IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
4906         List       *indexQuals = (List *) PG_GETARG_POINTER(2);
4907         RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(3);
4908         Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(4);
4909         Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(5);
4910         Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6);
4911         double     *indexCorrelation = (double *) PG_GETARG_POINTER(7);
4912         Oid                     relid;
4913         AttrNumber      colnum;
4914         HeapTuple       tuple;
4915         double          numIndexTuples;
4916         List       *indexBoundQuals;
4917         int                     indexcol;
4918         bool            eqQualHere;
4919         bool            found_saop;
4920         double          num_sa_scans;
4921         ListCell   *l;
4922
4923         /*
4924          * For a btree scan, only leading '=' quals plus inequality quals for the
4925          * immediately next attribute contribute to index selectivity (these are
4926          * the "boundary quals" that determine the starting and stopping points of
4927          * the index scan).  Additional quals can suppress visits to the heap, so
4928          * it's OK to count them in indexSelectivity, but they should not count
4929          * for estimating numIndexTuples.  So we must examine the given indexQuals
4930          * to find out which ones count as boundary quals.      We rely on the
4931          * knowledge that they are given in index column order.
4932          *
4933          * For a RowCompareExpr, we consider only the first column, just as
4934          * rowcomparesel() does.
4935          *
4936          * If there's a ScalarArrayOpExpr in the quals, we'll actually perform N
4937          * index scans not one, but the ScalarArrayOpExpr's operator can be
4938          * considered to act the same as it normally does.
4939          */
4940         indexBoundQuals = NIL;
4941         indexcol = 0;
4942         eqQualHere = false;
4943         found_saop = false;
4944         num_sa_scans = 1;
4945         foreach(l, indexQuals)
4946         {
4947                 RestrictInfo *rinfo = (RestrictInfo *) lfirst(l);
4948                 Expr       *clause;
4949                 Node       *leftop,
4950                                    *rightop;
4951                 Oid                     clause_op;
4952                 int                     op_strategy;
4953
4954                 Assert(IsA(rinfo, RestrictInfo));
4955                 clause = rinfo->clause;
4956                 if (IsA(clause, OpExpr))
4957                 {
4958                         leftop = get_leftop(clause);
4959                         rightop = get_rightop(clause);
4960                         clause_op = ((OpExpr *) clause)->opno;
4961                 }
4962                 else if (IsA(clause, RowCompareExpr))
4963                 {
4964                         RowCompareExpr *rc = (RowCompareExpr *) clause;
4965
4966                         leftop = (Node *) linitial(rc->largs);
4967                         rightop = (Node *) linitial(rc->rargs);
4968                         clause_op = linitial_oid(rc->opnos);
4969                 }
4970                 else if (IsA(clause, ScalarArrayOpExpr))
4971                 {
4972                         ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause;
4973
4974                         leftop = (Node *) linitial(saop->args);
4975                         rightop = (Node *) lsecond(saop->args);
4976                         clause_op = saop->opno;
4977                         found_saop = true;
4978                 }
4979                 else
4980                 {
4981                         elog(ERROR, "unsupported indexqual type: %d",
4982                                  (int) nodeTag(clause));
4983                         continue;                       /* keep compiler quiet */
4984                 }
4985                 if (match_index_to_operand(leftop, indexcol, index))
4986                 {
4987                         /* clause_op is correct */
4988                 }
4989                 else if (match_index_to_operand(rightop, indexcol, index))
4990                 {
4991                         /* Must flip operator to get the opfamily member */
4992                         clause_op = get_commutator(clause_op);
4993                 }
4994                 else
4995                 {
4996                         /* Must be past the end of quals for indexcol, try next */
4997                         if (!eqQualHere)
4998                                 break;                  /* done if no '=' qual for indexcol */
4999                         indexcol++;
5000                         eqQualHere = false;
5001                         if (match_index_to_operand(leftop, indexcol, index))
5002                         {
5003                                 /* clause_op is correct */
5004                         }
5005                         else if (match_index_to_operand(rightop, indexcol, index))
5006                         {
5007                                 /* Must flip operator to get the opfamily member */
5008                                 clause_op = get_commutator(clause_op);
5009                         }
5010                         else
5011                         {
5012                                 /* No quals for new indexcol, so we are done */
5013                                 break;
5014                         }
5015                 }
5016                 op_strategy = get_op_opfamily_strategy(clause_op,
5017                                                                                            index->opfamily[indexcol]);
5018                 Assert(op_strategy != 0);               /* not a member of opfamily?? */
5019                 if (op_strategy == BTEqualStrategyNumber)
5020                         eqQualHere = true;
5021                 /* count up number of SA scans induced by indexBoundQuals only */
5022                 if (IsA(clause, ScalarArrayOpExpr))
5023                 {
5024                         ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause;
5025                         int                     alength = estimate_array_length(lsecond(saop->args));
5026
5027                         if (alength > 1)
5028                                 num_sa_scans *= alength;
5029                 }
5030                 indexBoundQuals = lappend(indexBoundQuals, rinfo);
5031         }
5032
5033         /*
5034          * If index is unique and we found an '=' clause for each column, we can
5035          * just assume numIndexTuples = 1 and skip the expensive
5036          * clauselist_selectivity calculations.
5037          */
5038         if (index->unique &&
5039                 indexcol == index->ncolumns - 1 &&
5040                 eqQualHere &&
5041                 !found_saop)
5042                 numIndexTuples = 1.0;
5043         else
5044         {
5045                 Selectivity btreeSelectivity;
5046
5047                 btreeSelectivity = clauselist_selectivity(root, indexBoundQuals,
5048                                                                                                   index->rel->relid,
5049                                                                                                   JOIN_INNER);
5050                 numIndexTuples = btreeSelectivity * index->rel->tuples;
5051                 /*
5052                  * As in genericcostestimate(), we have to adjust for any
5053                  * ScalarArrayOpExpr quals included in indexBoundQuals, and then
5054                  * round to integer.
5055                  */
5056                 numIndexTuples = rint(numIndexTuples / num_sa_scans);
5057         }
5058
5059         genericcostestimate(root, index, indexQuals, outer_rel, numIndexTuples,
5060                                                 indexStartupCost, indexTotalCost,
5061                                                 indexSelectivity, indexCorrelation);
5062
5063         /*
5064          * If we can get an estimate of the first column's ordering correlation C
5065          * from pg_statistic, estimate the index correlation as C for a
5066          * single-column index, or C * 0.75 for multiple columns. (The idea here
5067          * is that multiple columns dilute the importance of the first column's
5068          * ordering, but don't negate it entirely.  Before 8.0 we divided the
5069          * correlation by the number of columns, but that seems too strong.)
5070          *
5071          * We can skip all this if we found a ScalarArrayOpExpr, because then the
5072          * call must be for a bitmap index scan, and the caller isn't going to
5073          * care what the index correlation is.
5074          */
5075         if (found_saop)
5076                 PG_RETURN_VOID();
5077
5078         if (index->indexkeys[0] != 0)
5079         {
5080                 /* Simple variable --- look to stats for the underlying table */
5081                 relid = getrelid(index->rel->relid, root->parse->rtable);
5082                 Assert(relid != InvalidOid);
5083                 colnum = index->indexkeys[0];
5084         }
5085         else
5086         {
5087                 /* Expression --- maybe there are stats for the index itself */
5088                 relid = index->indexoid;
5089                 colnum = 1;
5090         }
5091
5092         tuple = SearchSysCache(STATRELATT,
5093                                                    ObjectIdGetDatum(relid),
5094                                                    Int16GetDatum(colnum),
5095                                                    0, 0);
5096
5097         if (HeapTupleIsValid(tuple))
5098         {
5099                 float4     *numbers;
5100                 int                     nnumbers;
5101
5102                 if (get_attstatsslot(tuple, InvalidOid, 0,
5103                                                          STATISTIC_KIND_CORRELATION,
5104                                                          index->fwdsortop[0],
5105                                                          NULL, NULL, &numbers, &nnumbers))
5106                 {
5107                         double          varCorrelation;
5108
5109                         Assert(nnumbers == 1);
5110                         varCorrelation = numbers[0];
5111
5112                         if (index->ncolumns > 1)
5113                                 *indexCorrelation = varCorrelation * 0.75;
5114                         else
5115                                 *indexCorrelation = varCorrelation;
5116
5117                         free_attstatsslot(InvalidOid, NULL, 0, numbers, nnumbers);
5118                 }
5119                 else if (get_attstatsslot(tuple, InvalidOid, 0,
5120                                                                   STATISTIC_KIND_CORRELATION,
5121                                                                   index->revsortop[0],
5122                                                                   NULL, NULL, &numbers, &nnumbers))
5123                 {
5124                         double          varCorrelation;
5125
5126                         Assert(nnumbers == 1);
5127                         varCorrelation = numbers[0];
5128
5129                         if (index->ncolumns > 1)
5130                                 *indexCorrelation = - varCorrelation * 0.75;
5131                         else
5132                                 *indexCorrelation = - varCorrelation;
5133
5134                         free_attstatsslot(InvalidOid, NULL, 0, numbers, nnumbers);
5135                 }
5136                 ReleaseSysCache(tuple);
5137         }
5138
5139         PG_RETURN_VOID();
5140 }
5141
5142 Datum
5143 hashcostestimate(PG_FUNCTION_ARGS)
5144 {
5145         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
5146         IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
5147         List       *indexQuals = (List *) PG_GETARG_POINTER(2);
5148         RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(3);
5149         Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(4);
5150         Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(5);
5151         Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6);
5152         double     *indexCorrelation = (double *) PG_GETARG_POINTER(7);
5153
5154         genericcostestimate(root, index, indexQuals, outer_rel, 0.0,
5155                                                 indexStartupCost, indexTotalCost,
5156                                                 indexSelectivity, indexCorrelation);
5157
5158         PG_RETURN_VOID();
5159 }
5160
5161 Datum
5162 gistcostestimate(PG_FUNCTION_ARGS)
5163 {
5164         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
5165         IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
5166         List       *indexQuals = (List *) PG_GETARG_POINTER(2);
5167         RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(3);
5168         Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(4);
5169         Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(5);
5170         Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6);
5171         double     *indexCorrelation = (double *) PG_GETARG_POINTER(7);
5172
5173         genericcostestimate(root, index, indexQuals, outer_rel, 0.0,
5174                                                 indexStartupCost, indexTotalCost,
5175                                                 indexSelectivity, indexCorrelation);
5176
5177         PG_RETURN_VOID();
5178 }
5179
5180 Datum
5181 gincostestimate(PG_FUNCTION_ARGS)
5182 {
5183         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
5184         IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
5185         List       *indexQuals = (List *) PG_GETARG_POINTER(2);
5186         RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(3);
5187         Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(4);
5188         Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(5);
5189         Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6);
5190         double     *indexCorrelation = (double *) PG_GETARG_POINTER(7);
5191
5192         genericcostestimate(root, index, indexQuals, outer_rel, 0.0,
5193                                                 indexStartupCost, indexTotalCost,
5194                                                 indexSelectivity, indexCorrelation);
5195
5196         PG_RETURN_VOID();
5197 }