granicus.if.org Git - postgresql/blob - src/backend/utils/adt/selfuncs.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * selfuncs.c
   4  *        Selectivity functions and index cost estimation functions for
   5  *        standard operators and index access methods.
   6  *
   7  *        Selectivity routines are registered in the pg_operator catalog
   8  *        in the "oprrest" and "oprjoin" attributes.
   9  *
  10  *        Index cost functions are registered in the pg_am catalog
  11  *        in the "amcostestimate" attribute.
  12  *
  13  * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
  14  * Portions Copyright (c) 1994, Regents of the University of California
  15  *
  16  *
  17  * IDENTIFICATION
  18  *        $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.104 2002/03/01 04:09:25 tgl Exp $
  19  *
  20  *-------------------------------------------------------------------------
  21  */
  22
  23 /*----------
  24  * Operator selectivity estimation functions are called to estimate the
  25  * selectivity of WHERE clauses whose top-level operator is their operator.
  26  * We divide the problem into two cases:
  27  *              Restriction clause estimation: the clause involves vars of just
  28  *                      one relation.
  29  *              Join clause estimation: the clause involves vars of multiple rels.
  30  * Join selectivity estimation is far more difficult and usually less accurate
  31  * than restriction estimation.
  32  *
  33  * When dealing with the inner scan of a nestloop join, we consider the
  34  * join's joinclauses as restriction clauses for the inner relation, and
  35  * treat vars of the outer relation as parameters (a/k/a constants of unknown
  36  * values).  So, restriction estimators need to be able to accept an argument
  37  * telling which relation is to be treated as the variable.
  38  *
  39  * The call convention for a restriction estimator (oprrest function) is
  40  *
  41  *              Selectivity oprrest (Query *root,
  42  *                                                       Oid operator,
  43  *                                                       List *args,
  44  *                                                       int varRelid);
  45  *
  46  * root: general information about the query (rtable and RelOptInfo lists
  47  * are particularly important for the estimator).
  48  * operator: OID of the specific operator in question.
  49  * args: argument list from the operator clause.
  50  * varRelid: if not zero, the relid (rtable index) of the relation to
  51  * be treated as the variable relation.  May be zero if the args list
  52  * is known to contain vars of only one relation.
  53  *
  54  * This is represented at the SQL level (in pg_proc) as
  55  *
  56  *              float8 oprrest (opaque, oid, opaque, int4);
  57  *
  58  * The call convention for a join estimator (oprjoin function) is similar
  59  * except that varRelid is not needed:
  60  *
  61  *              Selectivity oprjoin (Query *root,
  62  *                                                       Oid operator,
  63  *                                                       List *args);
  64  *
  65  *              float8 oprjoin (opaque, oid, opaque);
  66  *----------
  67  */
  68
  69 #include "postgres.h"
  70
  71 #include <ctype.h>
  72 #include <math.h>
  73 #ifdef USE_LOCALE
  74 #include <locale.h>
  75 #endif
  76
  77 #include "access/heapam.h"
  78 #include "catalog/catname.h"
  79 #include "catalog/pg_operator.h"
  80 #include "catalog/pg_proc.h"
  81 #include "catalog/pg_statistic.h"
  82 #include "catalog/pg_type.h"
  83 #include "mb/pg_wchar.h"
  84 #include "nodes/makefuncs.h"
  85 #include "optimizer/clauses.h"
  86 #include "optimizer/cost.h"
  87 #include "optimizer/pathnode.h"
  88 #include "optimizer/plancat.h"
  89 #include "optimizer/prep.h"
  90 #include "parser/parse_func.h"
  91 #include "parser/parse_oper.h"
  92 #include "parser/parsetree.h"
  93 #include "utils/builtins.h"
  94 #include "utils/date.h"
  95 #include "utils/datum.h"
  96 #include "utils/int8.h"
  97 #include "utils/lsyscache.h"
  98 #include "utils/selfuncs.h"
  99 #include "utils/syscache.h"
 100
 101
 102 /*
 103  * Note: the default selectivity estimates are not chosen entirely at random.
 104  * We want them to be small enough to ensure that indexscans will be used if
 105  * available, for typical table densities of ~100 tuples/page.  Thus, for
 106  * example, 0.01 is not quite small enough, since that makes it appear that
 107  * nearly all pages will be hit anyway.  Also, since we sometimes estimate
 108  * eqsel as 1/num_distinct, we probably want DEFAULT_NUM_DISTINCT to equal
 109  * 1/DEFAULT_EQ_SEL.
 110  */
 111
 112 /* default selectivity estimate for equalities such as "A = b" */
 113 #define DEFAULT_EQ_SEL  0.005
 114
 115 /* default selectivity estimate for inequalities such as "A < b" */
 116 #define DEFAULT_INEQ_SEL  (1.0 / 3.0)
 117
 118 /* default selectivity estimate for pattern-match operators such as LIKE */
 119 #define DEFAULT_MATCH_SEL       0.005
 120
 121 /* default number of distinct values in a table */
 122 #define DEFAULT_NUM_DISTINCT  200
 123
 124 /* default selectivity estimate for boolean and null test nodes */
 125 #define DEFAULT_UNK_SEL                 0.005
 126 #define DEFAULT_NOT_UNK_SEL             (1.0 - DEFAULT_UNK_SEL)
 127 #define DEFAULT_BOOL_SEL                0.5
 128
 129 /*
 130  * Clamp a computed probability estimate (which may suffer from roundoff or
 131  * estimation errors) to valid range.  Argument must be a float variable.
 132  */
 133 #define CLAMP_PROBABILITY(p) \
 134         do { \
 135                 if (p < 0.0) \
 136                         p = 0.0; \
 137                 else if (p > 1.0) \
 138                         p = 1.0; \
 139         } while (0)
 140
 141
 142 static bool get_var_maximum(Query *root, Var *var, Oid sortop, Datum *max);
 143 static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
 144                                   Datum lobound, Datum hibound, Oid boundstypid,
 145                                   double *scaledlobound, double *scaledhibound);
 146 static double convert_numeric_to_scalar(Datum value, Oid typid);
 147 static void convert_string_to_scalar(unsigned char *value,
 148                                                  double *scaledvalue,
 149                                                  unsigned char *lobound,
 150                                                  double *scaledlobound,
 151                                                  unsigned char *hibound,
 152                                                  double *scaledhibound);
 153 static void convert_bytea_to_scalar(Datum value,
 154                                                 double *scaledvalue,
 155                                                 Datum lobound,
 156                                                 double *scaledlobound,
 157                                                 Datum hibound,
 158                                                 double *scaledhibound);
 159 static double convert_one_string_to_scalar(unsigned char *value,
 160                                                          int rangelo, int rangehi);
 161 static double convert_one_bytea_to_scalar(unsigned char *value, int valuelen,
 162                                                         int rangelo, int rangehi);
 163 static unsigned char *convert_string_datum(Datum value, Oid typid);
 164 static double convert_timevalue_to_scalar(Datum value, Oid typid);
 165 static double get_att_numdistinct(Query *root, Var *var,
 166                                         Form_pg_statistic stats);
 167 static bool get_restriction_var(List *args, int varRelid,
 168                                         Var **var, Node **other,
 169                                         bool *varonleft);
 170 static void get_join_vars(List *args, Var **var1, Var **var2);
 171 static Selectivity prefix_selectivity(Query *root, Var *var, char *prefix);
 172 static Selectivity pattern_selectivity(char *patt, Pattern_Type ptype);
 173 static bool string_lessthan(const char *str1, const char *str2,
 174                                 Oid datatype);
 175 static Oid      find_operator(const char *opname, Oid datatype);
 176 static Datum string_to_datum(const char *str, Oid datatype);
 177 static Const *string_to_const(const char *str, Oid datatype);
 178
 179
 180 /*
 181  *              eqsel                   - Selectivity of "=" for any data types.
 182  *
 183  * Note: this routine is also used to estimate selectivity for some
 184  * operators that are not "=" but have comparable selectivity behavior,
 185  * such as "~=" (geometric approximate-match).  Even for "=", we must
 186  * keep in mind that the left and right datatypes may differ.
 187  */
 188 Datum
 189 eqsel(PG_FUNCTION_ARGS)
 190 {
 191         Query      *root = (Query *) PG_GETARG_POINTER(0);
 192         Oid                     operator = PG_GETARG_OID(1);
 193         List       *args = (List *) PG_GETARG_POINTER(2);
 194         int                     varRelid = PG_GETARG_INT32(3);
 195         Var                *var;
 196         Node       *other;
 197         bool            varonleft;
 198         Oid                     relid;
 199         HeapTuple       statsTuple;
 200         Datum      *values;
 201         int                     nvalues;
 202         float4     *numbers;
 203         int                     nnumbers;
 204         double          selec;
 205
 206         /*
 207          * If expression is not var = something or something = var for a
 208          * simple var of a real relation (no subqueries, for now), then punt
 209          * and return a default estimate.
 210          */
 211         if (!get_restriction_var(args, varRelid,
 212                                                          &var, &other, &varonleft))
 213                 PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
 214         relid = getrelid(var->varno, root->rtable);
 215         if (relid == InvalidOid)
 216                 PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
 217
 218         /*
 219          * If the something is a NULL constant, assume operator is strict and
 220          * return zero, ie, operator will never return TRUE.
 221          */
 222         if (IsA(other, Const) &&((Const *) other)->constisnull)
 223                 PG_RETURN_FLOAT8(0.0);
 224
 225         /* get stats for the attribute, if available */
 226         statsTuple = SearchSysCache(STATRELATT,
 227                                                                 ObjectIdGetDatum(relid),
 228                                                                 Int16GetDatum(var->varattno),
 229                                                                 0, 0);
 230         if (HeapTupleIsValid(statsTuple))
 231         {
 232                 Form_pg_statistic stats;
 233
 234                 stats = (Form_pg_statistic) GETSTRUCT(statsTuple);
 235
 236                 if (IsA(other, Const))
 237                 {
 238                         /* Var is being compared to a known non-null constant */
 239                         Datum           constval = ((Const *) other)->constvalue;
 240                         bool            match = false;
 241                         int                     i;
 242
 243                         /*
 244                          * Is the constant "=" to any of the column's most common
 245                          * values?      (Although the given operator may not really be
 246                          * "=", we will assume that seeing whether it returns TRUE is
 247                          * an appropriate test.  If you don't like this, maybe you
 248                          * shouldn't be using eqsel for your operator...)
 249                          */
 250                         if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
 251                                                                  STATISTIC_KIND_MCV, InvalidOid,
 252                                                                  &values, &nvalues,
 253                                                                  &numbers, &nnumbers))
 254                         {
 255                                 FmgrInfo        eqproc;
 256
 257                                 fmgr_info(get_opcode(operator), &eqproc);
 258
 259                                 for (i = 0; i < nvalues; i++)
 260                                 {
 261                                         /* be careful to apply operator right way 'round */
 262                                         if (varonleft)
 263                                                 match = DatumGetBool(FunctionCall2(&eqproc,
 264                                                                                                                    values[i],
 265                                                                                                                    constval));
 266                                         else
 267                                                 match = DatumGetBool(FunctionCall2(&eqproc,
 268                                                                                                                    constval,
 269                                                                                                                    values[i]));
 270                                         if (match)
 271                                                 break;
 272                                 }
 273                         }
 274                         else
 275                         {
 276                                 /* no most-common-value info available */
 277                                 values = NULL;
 278                                 numbers = NULL;
 279                                 i = nvalues = nnumbers = 0;
 280                         }
 281
 282                         if (match)
 283                         {
 284                                 /*
 285                                  * Constant is "=" to this common value.  We know
 286                                  * selectivity exactly (or as exactly as VACUUM could
 287                                  * calculate it, anyway).
 288                                  */
 289                                 selec = numbers[i];
 290                         }
 291                         else
 292                         {
 293                                 /*
 294                                  * Comparison is against a constant that is neither NULL
 295                                  * nor any of the common values.  Its selectivity cannot
 296                                  * be more than this:
 297                                  */
 298                                 double          sumcommon = 0.0;
 299                                 double          otherdistinct;
 300
 301                                 for (i = 0; i < nnumbers; i++)
 302                                         sumcommon += numbers[i];
 303                                 selec = 1.0 - sumcommon - stats->stanullfrac;
 304                                 CLAMP_PROBABILITY(selec);
 305
 306                                 /*
 307                                  * and in fact it's probably a good deal less. We
 308                                  * approximate that all the not-common values share this
 309                                  * remaining fraction equally, so we divide by the number
 310                                  * of other distinct values.
 311                                  */
 312                                 otherdistinct = get_att_numdistinct(root, var, stats)
 313                                         - nnumbers;
 314                                 if (otherdistinct > 1)
 315                                         selec /= otherdistinct;
 316
 317                                 /*
 318                                  * Another cross-check: selectivity shouldn't be estimated
 319                                  * as more than the least common "most common value".
 320                                  */
 321                                 if (nnumbers > 0 && selec > numbers[nnumbers - 1])
 322                                         selec = numbers[nnumbers - 1];
 323                         }
 324
 325                         free_attstatsslot(var->vartype, values, nvalues,
 326                                                           numbers, nnumbers);
 327                 }
 328                 else
 329                 {
 330                         double          ndistinct;
 331
 332                         /*
 333                          * Search is for a value that we do not know a priori, but we
 334                          * will assume it is not NULL.  Estimate the selectivity as
 335                          * non-null fraction divided by number of distinct values, so
 336                          * that we get a result averaged over all possible values
 337                          * whether common or uncommon.  (Essentially, we are assuming
 338                          * that the not-yet-known comparison value is equally likely
 339                          * to be any of the possible values, regardless of their
 340                          * frequency in the table.      Is that a good idea?)
 341                          */
 342                         selec = 1.0 - stats->stanullfrac;
 343                         ndistinct = get_att_numdistinct(root, var, stats);
 344                         if (ndistinct > 1)
 345                                 selec /= ndistinct;
 346
 347                         /*
 348                          * Cross-check: selectivity should never be estimated as more
 349                          * than the most common value's.
 350                          */
 351                         if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
 352                                                                  STATISTIC_KIND_MCV, InvalidOid,
 353                                                                  NULL, NULL,
 354                                                                  &numbers, &nnumbers))
 355                         {
 356                                 if (nnumbers > 0 && selec > numbers[0])
 357                                         selec = numbers[0];
 358                                 free_attstatsslot(var->vartype, NULL, 0, numbers, nnumbers);
 359                         }
 360                 }
 361
 362                 ReleaseSysCache(statsTuple);
 363         }
 364         else
 365         {
 366                 /*
 367                  * No VACUUM ANALYZE stats available, so make a guess using
 368                  * estimated number of distinct values and assuming they are
 369                  * equally common.      (The guess is unlikely to be very good, but we
 370                  * do know a few special cases.)
 371                  */
 372                 selec = 1.0 / get_att_numdistinct(root, var, NULL);
 373         }
 374
 375         /* result should be in range, but make sure... */
 376         CLAMP_PROBABILITY(selec);
 377
 378         PG_RETURN_FLOAT8((float8) selec);
 379 }
 380
 381 /*
 382  *              neqsel                  - Selectivity of "!=" for any data types.
 383  *
 384  * This routine is also used for some operators that are not "!="
 385  * but have comparable selectivity behavior.  See above comments
 386  * for eqsel().
 387  */
 388 Datum
 389 neqsel(PG_FUNCTION_ARGS)
 390 {
 391         Query      *root = (Query *) PG_GETARG_POINTER(0);
 392         Oid                     operator = PG_GETARG_OID(1);
 393         List       *args = (List *) PG_GETARG_POINTER(2);
 394         int                     varRelid = PG_GETARG_INT32(3);
 395         Oid                     eqop;
 396         float8          result;
 397
 398         /*
 399          * We want 1 - eqsel() where the equality operator is the one
 400          * associated with this != operator, that is, its negator.
 401          */
 402         eqop = get_negator(operator);
 403         if (eqop)
 404         {
 405                 result = DatumGetFloat8(DirectFunctionCall4(eqsel,
 406                                                                                                         PointerGetDatum(root),
 407                                                                                                   ObjectIdGetDatum(eqop),
 408                                                                                                         PointerGetDatum(args),
 409                                                                                            Int32GetDatum(varRelid)));
 410         }
 411         else
 412         {
 413                 /* Use default selectivity (should we raise an error instead?) */
 414                 result = DEFAULT_EQ_SEL;
 415         }
 416         result = 1.0 - result;
 417         PG_RETURN_FLOAT8(result);
 418 }
 419
 420 /*
 421  *      scalarineqsel           - Selectivity of "<", "<=", ">", ">=" for scalars.
 422  *
 423  * This is the guts of both scalarltsel and scalargtsel.  The caller has
 424  * commuted the clause, if necessary, so that we can treat the Var as
 425  * being on the left.  The caller must also make sure that the other side
 426  * of the clause is a non-null Const, and dissect same into a value and
 427  * datatype.
 428  *
 429  * This routine works for any datatype (or pair of datatypes) known to
 430  * convert_to_scalar().  If it is applied to some other datatype,
 431  * it will return a default estimate.
 432  */
 433 static double
 434 scalarineqsel(Query *root, Oid operator, bool isgt,
 435                           Var *var, Datum constval, Oid consttype)
 436 {
 437         Oid                     relid;
 438         HeapTuple       statsTuple;
 439         Form_pg_statistic stats;
 440         FmgrInfo        opproc;
 441         Datum      *values;
 442         int                     nvalues;
 443         float4     *numbers;
 444         int                     nnumbers;
 445         double          mcv_selec,
 446                                 hist_selec,
 447                                 sumcommon;
 448         double          selec;
 449         int                     i;
 450
 451         /*
 452          * If expression is not var op something or something op var for a
 453          * simple var of a real relation (no subqueries, for now), then punt
 454          * and return a default estimate.
 455          */
 456         relid = getrelid(var->varno, root->rtable);
 457         if (relid == InvalidOid)
 458                 return DEFAULT_INEQ_SEL;
 459
 460         /* get stats for the attribute */
 461         statsTuple = SearchSysCache(STATRELATT,
 462                                                                 ObjectIdGetDatum(relid),
 463                                                                 Int16GetDatum(var->varattno),
 464                                                                 0, 0);
 465         if (!HeapTupleIsValid(statsTuple))
 466         {
 467                 /* no stats available, so default result */
 468                 return DEFAULT_INEQ_SEL;
 469         }
 470         stats = (Form_pg_statistic) GETSTRUCT(statsTuple);
 471
 472         fmgr_info(get_opcode(operator), &opproc);
 473
 474         /*
 475          * If we have most-common-values info, add up the fractions of the MCV
 476          * entries that satisfy MCV OP CONST.  These fractions contribute
 477          * directly to the result selectivity.  Also add up the total fraction
 478          * represented by MCV entries.
 479          */
 480         mcv_selec = 0.0;
 481         sumcommon = 0.0;
 482
 483         if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
 484                                                  STATISTIC_KIND_MCV, InvalidOid,
 485                                                  &values, &nvalues,
 486                                                  &numbers, &nnumbers))
 487         {
 488                 for (i = 0; i < nvalues; i++)
 489                 {
 490                         if (DatumGetBool(FunctionCall2(&opproc,
 491                                                                                    values[i],
 492                                                                                    constval)))
 493                                 mcv_selec += numbers[i];
 494                         sumcommon += numbers[i];
 495                 }
 496                 free_attstatsslot(var->vartype, values, nvalues, numbers, nnumbers);
 497         }
 498
 499         /*
 500          * If there is a histogram, determine which bin the constant falls in,
 501          * and compute the resulting contribution to selectivity.
 502          *
 503          * Someday, VACUUM might store more than one histogram per rel/att,
 504          * corresponding to more than one possible sort ordering defined for
 505          * the column type.  However, to make that work we will need to figure
 506          * out which staop to search for --- it's not necessarily the one we
 507          * have at hand!  (For example, we might have a '<=' operator rather
 508          * than the '<' operator that will appear in staop.)  For now, assume
 509          * that whatever appears in pg_statistic is sorted the same way our
 510          * operator sorts, or the reverse way if isgt is TRUE.
 511          */
 512         hist_selec = 0.0;
 513
 514         if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
 515                                                  STATISTIC_KIND_HISTOGRAM, InvalidOid,
 516                                                  &values, &nvalues,
 517                                                  NULL, NULL))
 518         {
 519                 if (nvalues > 1)
 520                 {
 521                         double          histfrac;
 522                         bool            ltcmp;
 523
 524                         ltcmp = DatumGetBool(FunctionCall2(&opproc,
 525                                                                                            values[0],
 526                                                                                            constval));
 527                         if (isgt)
 528                                 ltcmp = !ltcmp;
 529                         if (!ltcmp)
 530                         {
 531                                 /* Constant is below lower histogram boundary. */
 532                                 histfrac = 0.0;
 533                         }
 534                         else
 535                         {
 536                                 /*
 537                                  * Scan to find proper location.  This could be made
 538                                  * faster by using a binary-search method, but it's
 539                                  * probably not worth the trouble for typical histogram
 540                                  * sizes.
 541                                  */
 542                                 for (i = 1; i < nvalues; i++)
 543                                 {
 544                                         ltcmp = DatumGetBool(FunctionCall2(&opproc,
 545                                                                                                            values[i],
 546                                                                                                            constval));
 547                                         if (isgt)
 548                                                 ltcmp = !ltcmp;
 549                                         if (!ltcmp)
 550                                                 break;
 551                                 }
 552                                 if (i >= nvalues)
 553                                 {
 554                                         /* Constant is above upper histogram boundary. */
 555                                         histfrac = 1.0;
 556                                 }
 557                                 else
 558                                 {
 559                                         double          val,
 560                                                                 high,
 561                                                                 low;
 562                                         double          binfrac;
 563
 564                                         /*
 565                                          * We have values[i-1] < constant < values[i].
 566                                          *
 567                                          * Convert the constant and the two nearest bin boundary
 568                                          * values to a uniform comparison scale, and do a
 569                                          * linear interpolation within this bin.
 570                                          */
 571                                         if (convert_to_scalar(constval, consttype, &val,
 572                                                                                   values[i - 1], values[i],
 573                                                                                   var->vartype,
 574                                                                                   &low, &high))
 575                                         {
 576                                                 if (high <= low)
 577                                                 {
 578                                                         /* cope if bin boundaries appear identical */
 579                                                         binfrac = 0.5;
 580                                                 }
 581                                                 else if (val <= low)
 582                                                         binfrac = 0.0;
 583                                                 else if (val >= high)
 584                                                         binfrac = 1.0;
 585                                                 else
 586                                                 {
 587                                                         binfrac = (val - low) / (high - low);
 588
 589                                                         /*
 590                                                          * Watch out for the possibility that we got a
 591                                                          * NaN or Infinity from the division.  This
 592                                                          * can happen despite the previous checks, if
 593                                                          * for example "low" is -Infinity.
 594                                                          */
 595                                                         if (isnan(binfrac) ||
 596                                                                 binfrac < 0.0 || binfrac > 1.0)
 597                                                                 binfrac = 0.5;
 598                                                 }
 599                                         }
 600                                         else
 601                                         {
 602                                                 /*
 603                                                  * Ideally we'd produce an error here, on the
 604                                                  * grounds that the given operator shouldn't have
 605                                                  * scalarXXsel registered as its selectivity func
 606                                                  * unless we can deal with its operand types.  But
 607                                                  * currently, all manner of stuff is invoking
 608                                                  * scalarXXsel, so give a default estimate until
 609                                                  * that can be fixed.
 610                                                  */
 611                                                 binfrac = 0.5;
 612                                         }
 613
 614                                         /*
 615                                          * Now, compute the overall selectivity across the
 616                                          * values represented by the histogram.  We have i-1
 617                                          * full bins and binfrac partial bin below the
 618                                          * constant.
 619                                          */
 620                                         histfrac = (double) (i - 1) + binfrac;
 621                                         histfrac /= (double) (nvalues - 1);
 622                                 }
 623                         }
 624
 625                         /*
 626                          * Now histfrac = fraction of histogram entries below the
 627                          * constant.
 628                          *
 629                          * Account for "<" vs ">"
 630                          */
 631                         hist_selec = isgt ? (1.0 - histfrac) : histfrac;
 632
 633                         /*
 634                          * The histogram boundaries are only approximate to begin
 635                          * with, and may well be out of date anyway.  Therefore, don't
 636                          * believe extremely small or large selectivity estimates.
 637                          */
 638                         if (hist_selec < 0.0001)
 639                                 hist_selec = 0.0001;
 640                         else if (hist_selec > 0.9999)
 641                                 hist_selec = 0.9999;
 642                 }
 643
 644                 free_attstatsslot(var->vartype, values, nvalues, NULL, 0);
 645         }
 646
 647         /*
 648          * Now merge the results from the MCV and histogram calculations,
 649          * realizing that the histogram covers only the non-null values that
 650          * are not listed in MCV.
 651          */
 652         selec = 1.0 - stats->stanullfrac - sumcommon;
 653
 654         if (hist_selec > 0.0)
 655                 selec *= hist_selec;
 656         else
 657         {
 658                 /*
 659                  * If no histogram but there are values not accounted for by MCV,
 660                  * arbitrarily assume half of them will match.
 661                  */
 662                 selec *= 0.5;
 663         }
 664
 665         selec += mcv_selec;
 666
 667         ReleaseSysCache(statsTuple);
 668
 669         /* result should be in range, but make sure... */
 670         CLAMP_PROBABILITY(selec);
 671
 672         return selec;
 673 }
 674
 675 /*
 676  *              scalarltsel             - Selectivity of "<" (also "<=") for scalars.
 677  */
 678 Datum
 679 scalarltsel(PG_FUNCTION_ARGS)
 680 {
 681         Query      *root = (Query *) PG_GETARG_POINTER(0);
 682         Oid                     operator = PG_GETARG_OID(1);
 683         List       *args = (List *) PG_GETARG_POINTER(2);
 684         int                     varRelid = PG_GETARG_INT32(3);
 685         Var                *var;
 686         Node       *other;
 687         Datum           constval;
 688         Oid                     consttype;
 689         bool            varonleft;
 690         bool            isgt;
 691         double          selec;
 692
 693         /*
 694          * If expression is not var op something or something op var for a
 695          * simple var of a real relation (no subqueries, for now), then punt
 696          * and return a default estimate.
 697          */
 698         if (!get_restriction_var(args, varRelid,
 699                                                          &var, &other, &varonleft))
 700                 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 701
 702         /*
 703          * Can't do anything useful if the something is not a constant,
 704          * either.
 705          */
 706         if (!IsA(other, Const))
 707                 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 708
 709         /*
 710          * If the constant is NULL, assume operator is strict and return zero,
 711          * ie, operator will never return TRUE.
 712          */
 713         if (((Const *) other)->constisnull)
 714                 PG_RETURN_FLOAT8(0.0);
 715         constval = ((Const *) other)->constvalue;
 716         consttype = ((Const *) other)->consttype;
 717
 718         /*
 719          * Force the var to be on the left to simplify logic in scalarineqsel.
 720          */
 721         if (varonleft)
 722         {
 723                 /* we have var < other */
 724                 isgt = false;
 725         }
 726         else
 727         {
 728                 /* we have other < var, commute to make var > other */
 729                 operator = get_commutator(operator);
 730                 if (!operator)
 731                 {
 732                         /* Use default selectivity (should we raise an error instead?) */
 733                         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 734                 }
 735                 isgt = true;
 736         }
 737
 738         selec = scalarineqsel(root, operator, isgt, var, constval, consttype);
 739
 740         PG_RETURN_FLOAT8((float8) selec);
 741 }
 742
 743 /*
 744  *              scalargtsel             - Selectivity of ">" (also ">=") for integers.
 745  */
 746 Datum
 747 scalargtsel(PG_FUNCTION_ARGS)
 748 {
 749         Query      *root = (Query *) PG_GETARG_POINTER(0);
 750         Oid                     operator = PG_GETARG_OID(1);
 751         List       *args = (List *) PG_GETARG_POINTER(2);
 752         int                     varRelid = PG_GETARG_INT32(3);
 753         Var                *var;
 754         Node       *other;
 755         Datum           constval;
 756         Oid                     consttype;
 757         bool            varonleft;
 758         bool            isgt;
 759         double          selec;
 760
 761         /*
 762          * If expression is not var op something or something op var for a
 763          * simple var of a real relation (no subqueries, for now), then punt
 764          * and return a default estimate.
 765          */
 766         if (!get_restriction_var(args, varRelid,
 767                                                          &var, &other, &varonleft))
 768                 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 769
 770         /*
 771          * Can't do anything useful if the something is not a constant,
 772          * either.
 773          */
 774         if (!IsA(other, Const))
 775                 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 776
 777         /*
 778          * If the constant is NULL, assume operator is strict and return zero,
 779          * ie, operator will never return TRUE.
 780          */
 781         if (((Const *) other)->constisnull)
 782                 PG_RETURN_FLOAT8(0.0);
 783         constval = ((Const *) other)->constvalue;
 784         consttype = ((Const *) other)->consttype;
 785
 786         /*
 787          * Force the var to be on the left to simplify logic in scalarineqsel.
 788          */
 789         if (varonleft)
 790         {
 791                 /* we have var > other */
 792                 isgt = true;
 793         }
 794         else
 795         {
 796                 /* we have other > var, commute to make var < other */
 797                 operator = get_commutator(operator);
 798                 if (!operator)
 799                 {
 800                         /* Use default selectivity (should we raise an error instead?) */
 801                         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 802                 }
 803                 isgt = false;
 804         }
 805
 806         selec = scalarineqsel(root, operator, isgt, var, constval, consttype);
 807
 808         PG_RETURN_FLOAT8((float8) selec);
 809 }
 810
 811 /*
 812  * patternsel                   - Generic code for pattern-match selectivity.
 813  */
 814 static double
 815 patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
 816 {
 817         Query      *root = (Query *) PG_GETARG_POINTER(0);
 818
 819 #ifdef NOT_USED
 820         Oid                     operator = PG_GETARG_OID(1);
 821 #endif
 822         List       *args = (List *) PG_GETARG_POINTER(2);
 823         int                     varRelid = PG_GETARG_INT32(3);
 824         Var                *var;
 825         Node       *other;
 826         bool            varonleft;
 827         Oid                     relid;
 828         Datum           constval;
 829         char       *patt;
 830         Pattern_Prefix_Status pstatus;
 831         char       *prefix;
 832         char       *rest;
 833         double          result;
 834
 835         /*
 836          * If expression is not var op constant for a simple var of a real
 837          * relation (no subqueries, for now), then punt and return a default
 838          * estimate.
 839          */
 840         if (!get_restriction_var(args, varRelid,
 841                                                          &var, &other, &varonleft))
 842                 return DEFAULT_MATCH_SEL;
 843         if (!varonleft || !IsA(other, Const))
 844                 return DEFAULT_MATCH_SEL;
 845         relid = getrelid(var->varno, root->rtable);
 846         if (relid == InvalidOid)
 847                 return DEFAULT_MATCH_SEL;
 848
 849         /*
 850          * If the constant is NULL, assume operator is strict and return zero,
 851          * ie, operator will never return TRUE.
 852          */
 853         if (((Const *) other)->constisnull)
 854                 return 0.0;
 855         constval = ((Const *) other)->constvalue;
 856         /* the right-hand const is type text for all supported operators */
 857         Assert(((Const *) other)->consttype == TEXTOID);
 858         patt = DatumGetCString(DirectFunctionCall1(textout, constval));
 859
 860         /* divide pattern into fixed prefix and remainder */
 861         pstatus = pattern_fixed_prefix(patt, ptype, &prefix, &rest);
 862
 863         if (pstatus == Pattern_Prefix_Exact)
 864         {
 865                 /*
 866                  * Pattern specifies an exact match, so pretend operator is '='
 867                  */
 868                 Oid                     eqopr = find_operator("=", var->vartype);
 869                 Const      *eqcon;
 870                 List       *eqargs;
 871
 872                 if (eqopr == InvalidOid)
 873                         elog(ERROR, "patternsel: no = operator for type %u",
 874                                  var->vartype);
 875                 eqcon = string_to_const(prefix, var->vartype);
 876                 eqargs = makeList2(var, eqcon);
 877                 result = DatumGetFloat8(DirectFunctionCall4(eqsel,
 878                                                                                                         PointerGetDatum(root),
 879                                                                                                  ObjectIdGetDatum(eqopr),
 880                                                                                                  PointerGetDatum(eqargs),
 881                                                                                            Int32GetDatum(varRelid)));
 882         }
 883         else
 884         {
 885                 /*
 886                  * Not exact-match pattern.  We estimate selectivity of the fixed
 887                  * prefix and remainder of pattern separately, then combine the
 888                  * two.
 889                  */
 890                 Selectivity prefixsel;
 891                 Selectivity restsel;
 892                 Selectivity selec;
 893
 894                 if (pstatus == Pattern_Prefix_Partial)
 895                         prefixsel = prefix_selectivity(root, var, prefix);
 896                 else
 897                         prefixsel = 1.0;
 898                 restsel = pattern_selectivity(rest, ptype);
 899                 selec = prefixsel * restsel;
 900                 /* result should be in range, but make sure... */
 901                 CLAMP_PROBABILITY(selec);
 902                 result = selec;
 903         }
 904
 905         if (prefix)
 906                 pfree(prefix);
 907         pfree(patt);
 908
 909         return result;
 910 }
 911
 912 /*
 913  *              regexeqsel              - Selectivity of regular-expression pattern match.
 914  */
 915 Datum
 916 regexeqsel(PG_FUNCTION_ARGS)
 917 {
 918         PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex));
 919 }
 920
 921 /*
 922  *              icregexeqsel    - Selectivity of case-insensitive regex match.
 923  */
 924 Datum
 925 icregexeqsel(PG_FUNCTION_ARGS)
 926 {
 927         PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC));
 928 }
 929
 930 /*
 931  *              likesel                 - Selectivity of LIKE pattern match.
 932  */
 933 Datum
 934 likesel(PG_FUNCTION_ARGS)
 935 {
 936         PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like));
 937 }
 938
 939 /*
 940  *              iclikesel                       - Selectivity of ILIKE pattern match.
 941  */
 942 Datum
 943 iclikesel(PG_FUNCTION_ARGS)
 944 {
 945         PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC));
 946 }
 947
 948 /*
 949  *              regexnesel              - Selectivity of regular-expression pattern non-match.
 950  */
 951 Datum
 952 regexnesel(PG_FUNCTION_ARGS)
 953 {
 954         double          result;
 955
 956         result = patternsel(fcinfo, Pattern_Type_Regex);
 957         result = 1.0 - result;
 958         PG_RETURN_FLOAT8(result);
 959 }
 960
 961 /*
 962  *              icregexnesel    - Selectivity of case-insensitive regex non-match.
 963  */
 964 Datum
 965 icregexnesel(PG_FUNCTION_ARGS)
 966 {
 967         double          result;
 968
 969         result = patternsel(fcinfo, Pattern_Type_Regex_IC);
 970         result = 1.0 - result;
 971         PG_RETURN_FLOAT8(result);
 972 }
 973
 974 /*
 975  *              nlikesel                - Selectivity of LIKE pattern non-match.
 976  */
 977 Datum
 978 nlikesel(PG_FUNCTION_ARGS)
 979 {
 980         double          result;
 981
 982         result = patternsel(fcinfo, Pattern_Type_Like);
 983         result = 1.0 - result;
 984         PG_RETURN_FLOAT8(result);
 985 }
 986
 987 /*
 988  *              icnlikesel              - Selectivity of ILIKE pattern non-match.
 989  */
 990 Datum
 991 icnlikesel(PG_FUNCTION_ARGS)
 992 {
 993         double          result;
 994
 995         result = patternsel(fcinfo, Pattern_Type_Like_IC);
 996         result = 1.0 - result;
 997         PG_RETURN_FLOAT8(result);
 998 }
 999
1000 /*
1001  *              booltestsel             - Selectivity of BooleanTest Node.
1002  */
1003 Selectivity
1004 booltestsel(Query *root, BooleanTest *clause, int varRelid)
1005 {
1006         Var                *var;
1007         Node       *arg;
1008         Oid                     relid;
1009         HeapTuple       statsTuple;
1010         Datum      *values;
1011         int                     nvalues;
1012         float4     *numbers;
1013         int                     nnumbers;
1014         double          selec;
1015
1016         Assert(clause && IsA(clause, BooleanTest));
1017
1018         arg = (Node *) clause->arg;
1019
1020         /*
1021          * Ignore any binary-compatible relabeling (probably unnecessary, but
1022          * can't hurt)
1023          */
1024         if (IsA(arg, RelabelType))
1025                 arg = ((RelabelType *) arg)->arg;
1026
1027         if (IsA(arg, Var) &&(varRelid == 0 || varRelid == ((Var *) arg)->varno))
1028                 var = (Var *) arg;
1029         else
1030         {
1031                 /*
1032                  * If argument is not a Var, we can't get statistics for it, but
1033                  * perhaps clause_selectivity can do something with it.  We ignore
1034                  * the possibility of a NULL value when using clause_selectivity,
1035                  * and just assume the value is either TRUE or FALSE.
1036                  */
1037                 switch (clause->booltesttype)
1038                 {
1039                         case IS_UNKNOWN:
1040                                 selec = DEFAULT_UNK_SEL;
1041                                 break;
1042                         case IS_NOT_UNKNOWN:
1043                                 selec = DEFAULT_NOT_UNK_SEL;
1044                                 break;
1045                         case IS_TRUE:
1046                         case IS_NOT_FALSE:
1047                                 selec = (double) clause_selectivity(root, arg, varRelid);
1048                                 break;
1049                         case IS_FALSE:
1050                         case IS_NOT_TRUE:
1051                                 selec = 1.0 - (double) clause_selectivity(root, arg, varRelid);
1052                                 break;
1053                         default:
1054                                 elog(ERROR, "booltestsel: unexpected booltesttype %d",
1055                                          (int) clause->booltesttype);
1056                                 selec = 0.0;    /* Keep compiler quiet */
1057                                 break;
1058                 }
1059                 return (Selectivity) selec;
1060         }
1061
1062         /* get stats for the attribute, if available */
1063         relid = getrelid(var->varno, root->rtable);
1064         if (relid == InvalidOid)
1065                 statsTuple = NULL;
1066         else
1067                 statsTuple = SearchSysCache(STATRELATT,
1068                                                                         ObjectIdGetDatum(relid),
1069                                                                         Int16GetDatum(var->varattno),
1070                                                                         0, 0);
1071
1072         if (HeapTupleIsValid(statsTuple))
1073         {
1074                 Form_pg_statistic stats;
1075                 double          freq_null;
1076
1077                 stats = (Form_pg_statistic) GETSTRUCT(statsTuple);
1078
1079                 freq_null = stats->stanullfrac;
1080
1081                 if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
1082                                                          STATISTIC_KIND_MCV, InvalidOid,
1083                                                          &values, &nvalues,
1084                                                          &numbers, &nnumbers)
1085                         && nnumbers > 0)
1086                 {
1087                         double          freq_true;
1088                         double          freq_false;
1089
1090                         /*
1091                          * Get first MCV frequency and derive frequency for true.
1092                          */
1093                         if (DatumGetBool(values[0]))
1094                                 freq_true = numbers[0];
1095                         else
1096                                 freq_true = 1.0 - numbers[0] - freq_null;
1097
1098                         /*
1099                          * Next derive freqency for false. Then use these as
1100                          * appropriate to derive frequency for each case.
1101                          */
1102                         freq_false = 1.0 - freq_true - freq_null;
1103
1104                         switch (clause->booltesttype)
1105                         {
1106                                 case IS_UNKNOWN:
1107                                         /* select only NULL values */
1108                                         selec = freq_null;
1109                                         break;
1110                                 case IS_NOT_UNKNOWN:
1111                                         /* select non-NULL values */
1112                                         selec = 1.0 - freq_null;
1113                                         break;
1114                                 case IS_TRUE:
1115                                         /* select only TRUE values */
1116                                         selec = freq_true;
1117                                         break;
1118                                 case IS_NOT_TRUE:
1119                                         /* select non-TRUE values */
1120                                         selec = 1.0 - freq_true;
1121                                         break;
1122                                 case IS_FALSE:
1123                                         /* select only FALSE values */
1124                                         selec = freq_false;
1125                                         break;
1126                                 case IS_NOT_FALSE:
1127                                         /* select non-FALSE values */
1128                                         selec = 1.0 - freq_false;
1129                                         break;
1130                                 default:
1131                                         elog(ERROR, "booltestsel: unexpected booltesttype %d",
1132                                                  (int) clause->booltesttype);
1133                                         selec = 0.0;    /* Keep compiler quiet */
1134                                         break;
1135                         }
1136
1137                         free_attstatsslot(var->vartype, values, nvalues,
1138                                                           numbers, nnumbers);
1139                 }
1140                 else
1141                 {
1142                         /*
1143                          * No most-common-value info available. Still have null
1144                          * fraction information, so use it for IS [NOT] UNKNOWN.
1145                          * Otherwise adjust for null fraction and assume an even split
1146                          * for boolean tests.
1147                          */
1148                         switch (clause->booltesttype)
1149                         {
1150                                 case IS_UNKNOWN:
1151
1152                                         /*
1153                                          * Use freq_null directly.
1154                                          */
1155                                         selec = freq_null;
1156                                         break;
1157                                 case IS_NOT_UNKNOWN:
1158
1159                                         /*
1160                                          * Select not unknown (not null) values. Calculate
1161                                          * from freq_null.
1162                                          */
1163                                         selec = 1.0 - freq_null;
1164                                         break;
1165                                 case IS_TRUE:
1166                                 case IS_NOT_TRUE:
1167                                 case IS_FALSE:
1168                                 case IS_NOT_FALSE:
1169                                         selec = (1.0 - freq_null) / 2.0;
1170                                         break;
1171                                 default:
1172                                         elog(ERROR, "booltestsel: unexpected booltesttype %d",
1173                                                  (int) clause->booltesttype);
1174                                         selec = 0.0;    /* Keep compiler quiet */
1175                                         break;
1176                         }
1177                 }
1178
1179                 ReleaseSysCache(statsTuple);
1180         }
1181         else
1182         {
1183                 /*
1184                  * No VACUUM ANALYZE stats available, so use a default value.
1185                  * (Note: not much point in recursing to clause_selectivity here.)
1186                  */
1187                 switch (clause->booltesttype)
1188                 {
1189                         case IS_UNKNOWN:
1190                                 selec = DEFAULT_UNK_SEL;
1191                                 break;
1192                         case IS_NOT_UNKNOWN:
1193                                 selec = DEFAULT_NOT_UNK_SEL;
1194                                 break;
1195                         case IS_TRUE:
1196                         case IS_NOT_TRUE:
1197                         case IS_FALSE:
1198                         case IS_NOT_FALSE:
1199                                 selec = DEFAULT_BOOL_SEL;
1200                                 break;
1201                         default:
1202                                 elog(ERROR, "booltestsel: unexpected booltesttype %d",
1203                                          (int) clause->booltesttype);
1204                                 selec = 0.0;    /* Keep compiler quiet */
1205                                 break;
1206                 }
1207         }
1208
1209         /* result should be in range, but make sure... */
1210         CLAMP_PROBABILITY(selec);
1211
1212         return (Selectivity) selec;
1213 }
1214
1215 /*
1216  *              nulltestsel             - Selectivity of NullTest Node.
1217  */
1218 Selectivity
1219 nulltestsel(Query *root, NullTest *clause, int varRelid)
1220 {
1221         Var                *var;
1222         Node       *arg;
1223         Oid                     relid;
1224         HeapTuple       statsTuple;
1225         double          selec;
1226         double          defselec;
1227         double          freq_null;
1228
1229         Assert(clause && IsA(clause, NullTest));
1230
1231         switch (clause->nulltesttype)
1232         {
1233                 case IS_NULL:
1234                         defselec = DEFAULT_UNK_SEL;
1235                         break;
1236                 case IS_NOT_NULL:
1237                         defselec = DEFAULT_NOT_UNK_SEL;
1238                         break;
1239                 default:
1240                         elog(ERROR, "nulltestsel: unexpected nulltesttype %d",
1241                                  (int) clause->nulltesttype);
1242                         return (Selectivity) 0;         /* keep compiler quiet */
1243         }
1244
1245         arg = (Node *) clause->arg;
1246
1247         /*
1248          * Ignore any binary-compatible relabeling
1249          */
1250         if (IsA(arg, RelabelType))
1251                 arg = ((RelabelType *) arg)->arg;
1252
1253         if (IsA(arg, Var) &&(varRelid == 0 || varRelid == ((Var *) arg)->varno))
1254                 var = (Var *) arg;
1255         else
1256         {
1257                 /*
1258                  * punt if non-Var argument
1259                  */
1260                 return (Selectivity) defselec;
1261         }
1262
1263         relid = getrelid(var->varno, root->rtable);
1264         if (relid == InvalidOid)
1265                 return (Selectivity) defselec;
1266
1267         /* get stats for the attribute, if available */
1268         statsTuple = SearchSysCache(STATRELATT,
1269                                                                 ObjectIdGetDatum(relid),
1270                                                                 Int16GetDatum(var->varattno),
1271                                                                 0, 0);
1272         if (HeapTupleIsValid(statsTuple))
1273         {
1274                 Form_pg_statistic stats;
1275
1276                 stats = (Form_pg_statistic) GETSTRUCT(statsTuple);
1277                 freq_null = stats->stanullfrac;
1278
1279                 switch (clause->nulltesttype)
1280                 {
1281                         case IS_NULL:
1282
1283                                 /*
1284                                  * Use freq_null directly.
1285                                  */
1286                                 selec = freq_null;
1287                                 break;
1288                         case IS_NOT_NULL:
1289
1290                                 /*
1291                                  * Select not unknown (not null) values. Calculate from
1292                                  * freq_null.
1293                                  */
1294                                 selec = 1.0 - freq_null;
1295                                 break;
1296                         default:
1297                                 elog(ERROR, "nulltestsel: unexpected nulltesttype %d",
1298                                          (int) clause->nulltesttype);
1299                                 return (Selectivity) 0; /* keep compiler quiet */
1300                 }
1301
1302                 ReleaseSysCache(statsTuple);
1303         }
1304         else
1305         {
1306                 /*
1307                  * No VACUUM ANALYZE stats available, so make a guess
1308                  */
1309                 selec = defselec;
1310         }
1311
1312         /* result should be in range, but make sure... */
1313         CLAMP_PROBABILITY(selec);
1314
1315         return (Selectivity) selec;
1316 }
1317
1318 /*
1319  *              eqjoinsel               - Join selectivity of "="
1320  */
1321 Datum
1322 eqjoinsel(PG_FUNCTION_ARGS)
1323 {
1324         Query      *root = (Query *) PG_GETARG_POINTER(0);
1325         Oid                     operator = PG_GETARG_OID(1);
1326         List       *args = (List *) PG_GETARG_POINTER(2);
1327         Var                *var1;
1328         Var                *var2;
1329         double          selec;
1330
1331         get_join_vars(args, &var1, &var2);
1332
1333         if (var1 == NULL && var2 == NULL)
1334                 selec = DEFAULT_EQ_SEL;
1335         else
1336         {
1337                 HeapTuple       statsTuple1 = NULL;
1338                 HeapTuple       statsTuple2 = NULL;
1339                 Form_pg_statistic stats1 = NULL;
1340                 Form_pg_statistic stats2 = NULL;
1341                 double          nd1 = DEFAULT_NUM_DISTINCT;
1342                 double          nd2 = DEFAULT_NUM_DISTINCT;
1343                 bool            have_mcvs1 = false;
1344                 Datum      *values1 = NULL;
1345                 int                     nvalues1 = 0;
1346                 float4     *numbers1 = NULL;
1347                 int                     nnumbers1 = 0;
1348                 bool            have_mcvs2 = false;
1349                 Datum      *values2 = NULL;
1350                 int                     nvalues2 = 0;
1351                 float4     *numbers2 = NULL;
1352                 int                     nnumbers2 = 0;
1353
1354                 if (var1 != NULL)
1355                 {
1356                         /* get stats for the attribute, if available */
1357                         Oid                     relid1 = getrelid(var1->varno, root->rtable);
1358
1359                         if (relid1 != InvalidOid)
1360                         {
1361                                 statsTuple1 = SearchSysCache(STATRELATT,
1362                                                                                          ObjectIdGetDatum(relid1),
1363                                                                                    Int16GetDatum(var1->varattno),
1364                                                                                          0, 0);
1365                                 if (HeapTupleIsValid(statsTuple1))
1366                                 {
1367                                         stats1 = (Form_pg_statistic) GETSTRUCT(statsTuple1);
1368                                         have_mcvs1 = get_attstatsslot(statsTuple1,
1369                                                                                                   var1->vartype,
1370                                                                                                   var1->vartypmod,
1371                                                                                                   STATISTIC_KIND_MCV,
1372                                                                                                   InvalidOid,
1373                                                                                                   &values1, &nvalues1,
1374                                                                                                   &numbers1, &nnumbers1);
1375                                 }
1376
1377                                 nd1 = get_att_numdistinct(root, var1, stats1);
1378                         }
1379                 }
1380
1381                 if (var2 != NULL)
1382                 {
1383                         /* get stats for the attribute, if available */
1384                         Oid                     relid2 = getrelid(var2->varno, root->rtable);
1385
1386                         if (relid2 != InvalidOid)
1387                         {
1388                                 statsTuple2 = SearchSysCache(STATRELATT,
1389                                                                                          ObjectIdGetDatum(relid2),
1390                                                                                    Int16GetDatum(var2->varattno),
1391                                                                                          0, 0);
1392                                 if (HeapTupleIsValid(statsTuple2))
1393                                 {
1394                                         stats2 = (Form_pg_statistic) GETSTRUCT(statsTuple2);
1395                                         have_mcvs2 = get_attstatsslot(statsTuple2,
1396                                                                                                   var2->vartype,
1397                                                                                                   var2->vartypmod,
1398                                                                                                   STATISTIC_KIND_MCV,
1399                                                                                                   InvalidOid,
1400                                                                                                   &values2, &nvalues2,
1401                                                                                                   &numbers2, &nnumbers2);
1402                                 }
1403
1404                                 nd2 = get_att_numdistinct(root, var2, stats2);
1405                         }
1406                 }
1407
1408                 if (have_mcvs1 && have_mcvs2)
1409                 {
1410                         /*
1411                          * We have most-common-value lists for both relations.  Run
1412                          * through the lists to see which MCVs actually join to each
1413                          * other with the given operator.  This allows us to determine
1414                          * the exact join selectivity for the portion of the relations
1415                          * represented by the MCV lists.  We still have to estimate
1416                          * for the remaining population, but in a skewed distribution
1417                          * this gives us a big leg up in accuracy.      For motivation see
1418                          * the analysis in Y. Ioannidis and S. Christodoulakis, "On
1419                          * the propagation of errors in the size of join results",
1420                          * Technical Report 1018, Computer Science Dept., University
1421                          * of Wisconsin, Madison, March 1991 (available from
1422                          * ftp.cs.wisc.edu).
1423                          */
1424                         FmgrInfo        eqproc;
1425                         bool       *hasmatch1;
1426                         bool       *hasmatch2;
1427                         double          matchprodfreq,
1428                                                 matchfreq1,
1429                                                 matchfreq2,
1430                                                 unmatchfreq1,
1431                                                 unmatchfreq2,
1432                                                 otherfreq1,
1433                                                 otherfreq2,
1434                                                 totalsel1,
1435                                                 totalsel2;
1436                         int                     i,
1437                                                 nmatches;
1438
1439                         fmgr_info(get_opcode(operator), &eqproc);
1440                         hasmatch1 = (bool *) palloc(nvalues1 * sizeof(bool));
1441                         memset(hasmatch1, 0, nvalues1 * sizeof(bool));
1442                         hasmatch2 = (bool *) palloc(nvalues2 * sizeof(bool));
1443                         memset(hasmatch2, 0, nvalues2 * sizeof(bool));
1444
1445                         /*
1446                          * Note we assume that each MCV will match at most one member
1447                          * of the other MCV list.  If the operator isn't really
1448                          * equality, there could be multiple matches --- but we don't
1449                          * look for them, both for speed and because the math wouldn't
1450                          * add up...
1451                          */
1452                         matchprodfreq = 0.0;
1453                         nmatches = 0;
1454                         for (i = 0; i < nvalues1; i++)
1455                         {
1456                                 int                     j;
1457
1458                                 for (j = 0; j < nvalues2; j++)
1459                                 {
1460                                         if (hasmatch2[j])
1461                                                 continue;
1462                                         if (DatumGetBool(FunctionCall2(&eqproc,
1463                                                                                                    values1[i],
1464                                                                                                    values2[j])))
1465                                         {
1466                                                 hasmatch1[i] = hasmatch2[j] = true;
1467                                                 matchprodfreq += numbers1[i] * numbers2[j];
1468                                                 nmatches++;
1469                                                 break;
1470                                         }
1471                                 }
1472                         }
1473                         CLAMP_PROBABILITY(matchprodfreq);
1474                         /* Sum up frequencies of matched and unmatched MCVs */
1475                         matchfreq1 = unmatchfreq1 = 0.0;
1476                         for (i = 0; i < nvalues1; i++)
1477                         {
1478                                 if (hasmatch1[i])
1479                                         matchfreq1 += numbers1[i];
1480                                 else
1481                                         unmatchfreq1 += numbers1[i];
1482                         }
1483                         CLAMP_PROBABILITY(matchfreq1);
1484                         CLAMP_PROBABILITY(unmatchfreq1);
1485                         matchfreq2 = unmatchfreq2 = 0.0;
1486                         for (i = 0; i < nvalues2; i++)
1487                         {
1488                                 if (hasmatch2[i])
1489                                         matchfreq2 += numbers2[i];
1490                                 else
1491                                         unmatchfreq2 += numbers2[i];
1492                         }
1493                         CLAMP_PROBABILITY(matchfreq2);
1494                         CLAMP_PROBABILITY(unmatchfreq2);
1495                         pfree(hasmatch1);
1496                         pfree(hasmatch2);
1497
1498                         /*
1499                          * Compute total frequency of non-null values that are not in
1500                          * the MCV lists.
1501                          */
1502                         otherfreq1 = 1.0 - stats1->stanullfrac - matchfreq1 - unmatchfreq1;
1503                         otherfreq2 = 1.0 - stats2->stanullfrac - matchfreq2 - unmatchfreq2;
1504                         CLAMP_PROBABILITY(otherfreq1);
1505                         CLAMP_PROBABILITY(otherfreq2);
1506
1507                         /*
1508                          * We can estimate the total selectivity from the point of
1509                          * view of relation 1 as: the known selectivity for matched
1510                          * MCVs, plus unmatched MCVs that are assumed to match against
1511                          * random members of relation 2's non-MCV population, plus
1512                          * non-MCV values that are assumed to match against random
1513                          * members of relation 2's unmatched MCVs plus non-MCV values.
1514                          */
1515                         totalsel1 = matchprodfreq;
1516                         if (nd2 > nvalues2)
1517                                 totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - nvalues2);
1518                         if (nd2 > nmatches)
1519                                 totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) /
1520                                         (nd2 - nmatches);
1521                         /* Same estimate from the point of view of relation 2. */
1522                         totalsel2 = matchprodfreq;
1523                         if (nd1 > nvalues1)
1524                                 totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - nvalues1);
1525                         if (nd1 > nmatches)
1526                                 totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) /
1527                                         (nd1 - nmatches);
1528
1529                         /*
1530                          * Use the smaller of the two estimates.  This can be
1531                          * justified in essentially the same terms as given below for
1532                          * the no-stats case: to a first approximation, we are
1533                          * estimating from the point of view of the relation with
1534                          * smaller nd.
1535                          */
1536                         selec = (totalsel1 < totalsel2) ? totalsel1 : totalsel2;
1537                 }
1538                 else
1539                 {
1540                         /*
1541                          * We do not have MCV lists for both sides.  Estimate the join
1542                          * selectivity as MIN(1/nd1, 1/nd2).  This is plausible if we
1543                          * assume that the values are about equally distributed: a
1544                          * given tuple of rel1 will join to either 0 or N2/nd2 rows of
1545                          * rel2, so total join rows are at most N1*N2/nd2 giving a
1546                          * join selectivity of not more than 1/nd2.  By the same logic
1547                          * it is not more than 1/nd1, so MIN(1/nd1, 1/nd2) is an upper
1548                          * bound.  Using the MIN() means we estimate from the point of
1549                          * view of the relation with smaller nd (since the larger nd
1550                          * is determining the MIN).  It is reasonable to assume that
1551                          * most tuples in this rel will have join partners, so the
1552                          * bound is probably reasonably tight and should be taken
1553                          * as-is.
1554                          *
1555                          * XXX Can we be smarter if we have an MCV list for just one
1556                          * side? It seems that if we assume equal distribution for the
1557                          * other side, we end up with the same answer anyway.
1558                          */
1559                         if (nd1 > nd2)
1560                                 selec = 1.0 / nd1;
1561                         else
1562                                 selec = 1.0 / nd2;
1563                 }
1564
1565                 if (have_mcvs1)
1566                         free_attstatsslot(var1->vartype, values1, nvalues1,
1567                                                           numbers1, nnumbers1);
1568                 if (have_mcvs2)
1569                         free_attstatsslot(var2->vartype, values2, nvalues2,
1570                                                           numbers2, nnumbers2);
1571                 if (HeapTupleIsValid(statsTuple1))
1572                         ReleaseSysCache(statsTuple1);
1573                 if (HeapTupleIsValid(statsTuple2))
1574                         ReleaseSysCache(statsTuple2);
1575         }
1576
1577         CLAMP_PROBABILITY(selec);
1578
1579         PG_RETURN_FLOAT8((float8) selec);
1580 }
1581
1582 /*
1583  *              neqjoinsel              - Join selectivity of "!="
1584  */
1585 Datum
1586 neqjoinsel(PG_FUNCTION_ARGS)
1587 {
1588         Query      *root = (Query *) PG_GETARG_POINTER(0);
1589         Oid                     operator = PG_GETARG_OID(1);
1590         List       *args = (List *) PG_GETARG_POINTER(2);
1591         Oid                     eqop;
1592         float8          result;
1593
1594         /*
1595          * We want 1 - eqjoinsel() where the equality operator is the one
1596          * associated with this != operator, that is, its negator.
1597          */
1598         eqop = get_negator(operator);
1599         if (eqop)
1600         {
1601                 result = DatumGetFloat8(DirectFunctionCall3(eqjoinsel,
1602                                                                                                         PointerGetDatum(root),
1603                                                                                                   ObjectIdGetDatum(eqop),
1604                                                                                                  PointerGetDatum(args)));
1605
1606         }
1607         else
1608         {
1609                 /* Use default selectivity (should we raise an error instead?) */
1610                 result = DEFAULT_EQ_SEL;
1611         }
1612         result = 1.0 - result;
1613         PG_RETURN_FLOAT8(result);
1614 }
1615
1616 /*
1617  *              scalarltjoinsel - Join selectivity of "<" and "<=" for scalars
1618  */
1619 Datum
1620 scalarltjoinsel(PG_FUNCTION_ARGS)
1621 {
1622         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
1623 }
1624
1625 /*
1626  *              scalargtjoinsel - Join selectivity of ">" and ">=" for scalars
1627  */
1628 Datum
1629 scalargtjoinsel(PG_FUNCTION_ARGS)
1630 {
1631         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
1632 }
1633
1634 /*
1635  *              regexeqjoinsel  - Join selectivity of regular-expression pattern match.
1636  */
1637 Datum
1638 regexeqjoinsel(PG_FUNCTION_ARGS)
1639 {
1640         PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
1641 }
1642
1643 /*
1644  *              icregexeqjoinsel        - Join selectivity of case-insensitive regex match.
1645  */
1646 Datum
1647 icregexeqjoinsel(PG_FUNCTION_ARGS)
1648 {
1649         PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
1650 }
1651
1652 /*
1653  *              likejoinsel                     - Join selectivity of LIKE pattern match.
1654  */
1655 Datum
1656 likejoinsel(PG_FUNCTION_ARGS)
1657 {
1658         PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
1659 }
1660
1661 /*
1662  *              iclikejoinsel                   - Join selectivity of ILIKE pattern match.
1663  */
1664 Datum
1665 iclikejoinsel(PG_FUNCTION_ARGS)
1666 {
1667         PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
1668 }
1669
1670 /*
1671  *              regexnejoinsel  - Join selectivity of regex non-match.
1672  */
1673 Datum
1674 regexnejoinsel(PG_FUNCTION_ARGS)
1675 {
1676         float8          result;
1677
1678         result = DatumGetFloat8(regexeqjoinsel(fcinfo));
1679         result = 1.0 - result;
1680         PG_RETURN_FLOAT8(result);
1681 }
1682
1683 /*
1684  *              icregexnejoinsel        - Join selectivity of case-insensitive regex non-match.
1685  */
1686 Datum
1687 icregexnejoinsel(PG_FUNCTION_ARGS)
1688 {
1689         float8          result;
1690
1691         result = DatumGetFloat8(icregexeqjoinsel(fcinfo));
1692         result = 1.0 - result;
1693         PG_RETURN_FLOAT8(result);
1694 }
1695
1696 /*
1697  *              nlikejoinsel            - Join selectivity of LIKE pattern non-match.
1698  */
1699 Datum
1700 nlikejoinsel(PG_FUNCTION_ARGS)
1701 {
1702         float8          result;
1703
1704         result = DatumGetFloat8(likejoinsel(fcinfo));
1705         result = 1.0 - result;
1706         PG_RETURN_FLOAT8(result);
1707 }
1708
1709 /*
1710  *              icnlikejoinsel          - Join selectivity of ILIKE pattern non-match.
1711  */
1712 Datum
1713 icnlikejoinsel(PG_FUNCTION_ARGS)
1714 {
1715         float8          result;
1716
1717         result = DatumGetFloat8(iclikejoinsel(fcinfo));
1718         result = 1.0 - result;
1719         PG_RETURN_FLOAT8(result);
1720 }
1721
1722 /*
1723  * mergejoinscansel                     - Scan selectivity of merge join.
1724  *
1725  * A merge join will stop as soon as it exhausts either input stream.
1726  * Therefore, if we can estimate the ranges of both input variables,
1727  * we can estimate how much of the input will actually be read.  This
1728  * can have a considerable impact on the cost when using indexscans.
1729  *
1730  * clause should be a clause already known to be mergejoinable.
1731  *
1732  * *leftscan is set to the fraction of the left-hand variable expected
1733  * to be scanned (0 to 1), and similarly *rightscan for the right-hand
1734  * variable.
1735  */
1736 void
1737 mergejoinscansel(Query *root, Node *clause,
1738                                  Selectivity *leftscan,
1739                                  Selectivity *rightscan)
1740 {
1741         Var                *left,
1742                            *right;
1743         Oid                     opno,
1744                                 lsortop,
1745                                 rsortop,
1746                                 ltop,
1747                                 gtop,
1748                                 revltop;
1749         Datum           leftmax,
1750                                 rightmax;
1751         double          selec;
1752
1753         /* Set default results if we can't figure anything out. */
1754         *leftscan = *rightscan = 1.0;
1755
1756         /* Deconstruct the merge clause */
1757         if (!is_opclause(clause))
1758                 return;                                 /* shouldn't happen */
1759         opno = ((Oper *) ((Expr *) clause)->oper)->opno;
1760         left = get_leftop((Expr *) clause);
1761         right = get_rightop((Expr *) clause);
1762         if (!right)
1763                 return;                                 /* shouldn't happen */
1764
1765         /* Can't do anything if inputs are not Vars */
1766         if (!IsA(left, Var) ||!IsA(right, Var))
1767                 return;
1768
1769         /* Verify mergejoinability and get left and right "<" operators */
1770         if (!op_mergejoinable(opno,
1771                                                   left->vartype,
1772                                                   right->vartype,
1773                                                   &lsortop,
1774                                                   &rsortop))
1775                 return;                                 /* shouldn't happen */
1776
1777         /* Try to get maximum values of both vars */
1778         if (!get_var_maximum(root, left, lsortop, &leftmax))
1779                 return;                                 /* no max available from stats */
1780
1781         if (!get_var_maximum(root, right, rsortop, &rightmax))
1782                 return;                                 /* no max available from stats */
1783
1784         /* Look up the "left < right" and "left > right" operators */
1785         op_mergejoin_crossops(opno, &ltop, &gtop, NULL, NULL);
1786
1787         /* Look up the "right < left" operator */
1788         revltop = get_commutator(gtop);
1789         if (!OidIsValid(revltop))
1790                 return;                                 /* shouldn't happen */
1791
1792         /*
1793          * Now, the fraction of the left variable that will be scanned is the
1794          * fraction that's <= the right-side maximum value.  But only believe
1795          * non-default estimates, else stick with our 1.0.
1796          */
1797         selec = scalarineqsel(root, ltop, false, left,
1798                                                   rightmax, right->vartype);
1799         if (selec != DEFAULT_INEQ_SEL)
1800                 *leftscan = selec;
1801
1802         /* And similarly for the right variable. */
1803         selec = scalarineqsel(root, revltop, false, right,
1804                                                   leftmax, left->vartype);
1805         if (selec != DEFAULT_INEQ_SEL)
1806                 *rightscan = selec;
1807
1808         /*
1809          * Only one of the two fractions can really be less than 1.0; believe
1810          * the smaller estimate and reset the other one to exactly 1.0.
1811          */
1812         if (*leftscan > *rightscan)
1813                 *leftscan = 1.0;
1814         else
1815                 *rightscan = 1.0;
1816 }
1817
1818 /*
1819  * get_var_maximum
1820  *              Estimate the maximum value of the specified variable.
1821  *              If successful, store value in *max and return TRUE.
1822  *              If no data available, return FALSE.
1823  *
1824  * sortop is the "<" comparison operator to use.  (To extract the
1825  * minimum instead of the maximum, just pass the ">" operator instead.)
1826  */
1827 static bool
1828 get_var_maximum(Query *root, Var *var, Oid sortop, Datum *max)
1829 {
1830         Datum           tmax = 0;
1831         bool            have_max = false;
1832         Oid                     relid;
1833         HeapTuple       statsTuple;
1834         Form_pg_statistic stats;
1835         int16           typLen;
1836         bool            typByVal;
1837         Datum      *values;
1838         int                     nvalues;
1839         int                     i;
1840
1841         relid = getrelid(var->varno, root->rtable);
1842         if (relid == InvalidOid)
1843                 return false;
1844
1845         /* get stats for the attribute */
1846         statsTuple = SearchSysCache(STATRELATT,
1847                                                                 ObjectIdGetDatum(relid),
1848                                                                 Int16GetDatum(var->varattno),
1849                                                                 0, 0);
1850         if (!HeapTupleIsValid(statsTuple))
1851         {
1852                 /* no stats available, so default result */
1853                 return false;
1854         }
1855         stats = (Form_pg_statistic) GETSTRUCT(statsTuple);
1856
1857         get_typlenbyval(var->vartype, &typLen, &typByVal);
1858
1859         /*
1860          * If there is a histogram, grab the last or first value as appropriate.
1861          *
1862          * If there is a histogram that is sorted with some other operator
1863          * than the one we want, fail --- this suggests that there is data
1864          * we can't use.
1865          */
1866         if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
1867                                                  STATISTIC_KIND_HISTOGRAM, sortop,
1868                                                  &values, &nvalues,
1869                                                  NULL, NULL))
1870         {
1871                 if (nvalues > 0)
1872                 {
1873                         tmax = datumCopy(values[nvalues-1], typByVal, typLen);
1874                         have_max = true;
1875                 }
1876                 free_attstatsslot(var->vartype, values, nvalues, NULL, 0);
1877         }
1878         else
1879         {
1880                 Oid             rsortop = get_commutator(sortop);
1881
1882                 if (OidIsValid(rsortop) &&
1883                         get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
1884                                                          STATISTIC_KIND_HISTOGRAM, rsortop,
1885                                                          &values, &nvalues,
1886                                                          NULL, NULL))
1887                 {
1888                         if (nvalues > 0)
1889                         {
1890                                 tmax = datumCopy(values[0], typByVal, typLen);
1891                                 have_max = true;
1892                         }
1893                         free_attstatsslot(var->vartype, values, nvalues, NULL, 0);
1894                 }
1895                 else if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
1896                                                                   STATISTIC_KIND_HISTOGRAM, InvalidOid,
1897                                                                   &values, &nvalues,
1898                                                                   NULL, NULL))
1899                 {
1900                         free_attstatsslot(var->vartype, values, nvalues, NULL, 0);
1901                         ReleaseSysCache(statsTuple);
1902                         return false;
1903                 }
1904         }
1905
1906         /*
1907          * If we have most-common-values info, look for a large MCV.  This
1908          * is needed even if we also have a histogram, since the histogram
1909          * excludes the MCVs.  However, usually the MCVs will not be the
1910          * extreme values, so avoid unnecessary data copying.
1911          */
1912         if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
1913                                                  STATISTIC_KIND_MCV, InvalidOid,
1914                                                  &values, &nvalues,
1915                                                  NULL, NULL))
1916         {
1917                 bool    large_mcv = false;
1918                 FmgrInfo        opproc;
1919
1920                 fmgr_info(get_opcode(sortop), &opproc);
1921
1922                 for (i = 0; i < nvalues; i++)
1923                 {
1924                         if (!have_max)
1925                         {
1926                                 tmax = values[i];
1927                                 large_mcv = have_max = true;
1928                         }
1929                         else if (DatumGetBool(FunctionCall2(&opproc, tmax, values[i])))
1930                         {
1931                                 tmax = values[i];
1932                                 large_mcv = true;
1933                         }
1934                 }
1935                 if (large_mcv)
1936                         tmax = datumCopy(tmax, typByVal, typLen);
1937                 free_attstatsslot(var->vartype, values, nvalues, NULL, 0);
1938         }
1939
1940         ReleaseSysCache(statsTuple);
1941
1942         *max = tmax;
1943         return have_max;
1944 }
1945
1946 /*
1947  * convert_to_scalar
1948  *        Convert non-NULL values of the indicated types to the comparison
1949  *        scale needed by scalarltsel()/scalargtsel().
1950  *        Returns "true" if successful.
1951  *
1952  * XXX this routine is a hack: ideally we should look up the conversion
1953  * subroutines in pg_type.
1954  *
1955  * All numeric datatypes are simply converted to their equivalent
1956  * "double" values.  (NUMERIC values that are outside the range of "double"
1957  * are clamped to +/- HUGE_VAL.)
1958  *
1959  * String datatypes are converted by convert_string_to_scalar(),
1960  * which is explained below.  The reason why this routine deals with
1961  * three values at a time, not just one, is that we need it for strings.
1962  *
1963  * The bytea datatype is just enough different from strings that it has
1964  * to be treated separately.
1965  *
1966  * The several datatypes representing absolute times are all converted
1967  * to Timestamp, which is actually a double, and then we just use that
1968  * double value.  Note this will give correct results even for the "special"
1969  * values of Timestamp, since those are chosen to compare correctly;
1970  * see timestamp_cmp.
1971  *
1972  * The several datatypes representing relative times (intervals) are all
1973  * converted to measurements expressed in seconds.
1974  */
1975 static bool
1976 convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
1977                                   Datum lobound, Datum hibound, Oid boundstypid,
1978                                   double *scaledlobound, double *scaledhibound)
1979 {
1980         switch (valuetypid)
1981         {
1982                         /*
1983                          * Built-in numeric types
1984                          */
1985                 case BOOLOID:
1986                 case INT2OID:
1987                 case INT4OID:
1988                 case INT8OID:
1989                 case FLOAT4OID:
1990                 case FLOAT8OID:
1991                 case NUMERICOID:
1992                 case OIDOID:
1993                 case REGPROCOID:
1994                         *scaledvalue = convert_numeric_to_scalar(value, valuetypid);
1995                         *scaledlobound = convert_numeric_to_scalar(lobound, boundstypid);
1996                         *scaledhibound = convert_numeric_to_scalar(hibound, boundstypid);
1997                         return true;
1998
1999                         /*
2000                          * Built-in string types
2001                          */
2002                 case CHAROID:
2003                 case BPCHAROID:
2004                 case VARCHAROID:
2005                 case TEXTOID:
2006                 case NAMEOID:
2007                         {
2008                                 unsigned char *valstr = convert_string_datum(value, valuetypid);
2009                                 unsigned char *lostr = convert_string_datum(lobound, boundstypid);
2010                                 unsigned char *histr = convert_string_datum(hibound, boundstypid);
2011
2012                                 convert_string_to_scalar(valstr, scaledvalue,
2013                                                                                  lostr, scaledlobound,
2014                                                                                  histr, scaledhibound);
2015                                 pfree(valstr);
2016                                 pfree(lostr);
2017                                 pfree(histr);
2018                                 return true;
2019                         }
2020
2021                         /*
2022                          * Built-in bytea type
2023                          */
2024                 case BYTEAOID:
2025                         {
2026                                 convert_bytea_to_scalar(value, scaledvalue,
2027                                                                                 lobound, scaledlobound,
2028                                                                                 hibound, scaledhibound);
2029                                 return true;
2030                         }
2031
2032                         /*
2033                          * Built-in time types
2034                          */
2035                 case TIMESTAMPOID:
2036                 case TIMESTAMPTZOID:
2037                 case ABSTIMEOID:
2038                 case DATEOID:
2039                 case INTERVALOID:
2040                 case RELTIMEOID:
2041                 case TINTERVALOID:
2042                 case TIMEOID:
2043                 case TIMETZOID:
2044                         *scaledvalue = convert_timevalue_to_scalar(value, valuetypid);
2045                         *scaledlobound = convert_timevalue_to_scalar(lobound, boundstypid);
2046                         *scaledhibound = convert_timevalue_to_scalar(hibound, boundstypid);
2047                         return true;
2048
2049                         /*
2050                          * Built-in network types
2051                          */
2052                 case INETOID:
2053                 case CIDROID:
2054                 case MACADDROID:
2055                         *scaledvalue = convert_network_to_scalar(value, valuetypid);
2056                         *scaledlobound = convert_network_to_scalar(lobound, boundstypid);
2057                         *scaledhibound = convert_network_to_scalar(hibound, boundstypid);
2058                         return true;
2059         }
2060         /* Don't know how to convert */
2061         return false;
2062 }
2063
2064 /*
2065  * Do convert_to_scalar()'s work for any numeric data type.
2066  */
2067 static double
2068 convert_numeric_to_scalar(Datum value, Oid typid)
2069 {
2070         switch (typid)
2071         {
2072                 case BOOLOID:
2073                         return (double) DatumGetBool(value);
2074                 case INT2OID:
2075                         return (double) DatumGetInt16(value);
2076                 case INT4OID:
2077                         return (double) DatumGetInt32(value);
2078                 case INT8OID:
2079                         return (double) DatumGetInt64(value);
2080                 case FLOAT4OID:
2081                         return (double) DatumGetFloat4(value);
2082                 case FLOAT8OID:
2083                         return (double) DatumGetFloat8(value);
2084                 case NUMERICOID:
2085                         /* Note: out-of-range values will be clamped to +-HUGE_VAL */
2086                         return (double)
2087                                 DatumGetFloat8(DirectFunctionCall1(numeric_float8_no_overflow,
2088                                                                                                    value));
2089                 case OIDOID:
2090                 case REGPROCOID:
2091                         /* we can treat OIDs as integers... */
2092                         return (double) DatumGetObjectId(value);
2093         }
2094
2095         /*
2096          * Can't get here unless someone tries to use scalarltsel/scalargtsel
2097          * on an operator with one numeric and one non-numeric operand.
2098          */
2099         elog(ERROR, "convert_numeric_to_scalar: unsupported type %u", typid);
2100         return 0;
2101 }
2102
2103 /*
2104  * Do convert_to_scalar()'s work for any character-string data type.
2105  *
2106  * String datatypes are converted to a scale that ranges from 0 to 1,
2107  * where we visualize the bytes of the string as fractional digits.
2108  *
2109  * We do not want the base to be 256, however, since that tends to
2110  * generate inflated selectivity estimates; few databases will have
2111  * occurrences of all 256 possible byte values at each position.
2112  * Instead, use the smallest and largest byte values seen in the bounds
2113  * as the estimated range for each byte, after some fudging to deal with
2114  * the fact that we probably aren't going to see the full range that way.
2115  *
2116  * An additional refinement is that we discard any common prefix of the
2117  * three strings before computing the scaled values.  This allows us to
2118  * "zoom in" when we encounter a narrow data range.  An example is a phone
2119  * number database where all the values begin with the same area code.
2120  * (Actually, the bounds will be adjacent histogram-bin-boundary values,
2121  * so this is more likely to happen than you might think.)
2122  */
2123 static void
2124 convert_string_to_scalar(unsigned char *value,
2125                                                  double *scaledvalue,
2126                                                  unsigned char *lobound,
2127                                                  double *scaledlobound,
2128                                                  unsigned char *hibound,
2129                                                  double *scaledhibound)
2130 {
2131         int                     rangelo,
2132                                 rangehi;
2133         unsigned char *sptr;
2134
2135         rangelo = rangehi = hibound[0];
2136         for (sptr = lobound; *sptr; sptr++)
2137         {
2138                 if (rangelo > *sptr)
2139                         rangelo = *sptr;
2140                 if (rangehi < *sptr)
2141                         rangehi = *sptr;
2142         }
2143         for (sptr = hibound; *sptr; sptr++)
2144         {
2145                 if (rangelo > *sptr)
2146                         rangelo = *sptr;
2147                 if (rangehi < *sptr)
2148                         rangehi = *sptr;
2149         }
2150         /* If range includes any upper-case ASCII chars, make it include all */
2151         if (rangelo <= 'Z' && rangehi >= 'A')
2152         {
2153                 if (rangelo > 'A')
2154                         rangelo = 'A';
2155                 if (rangehi < 'Z')
2156                         rangehi = 'Z';
2157         }
2158         /* Ditto lower-case */
2159         if (rangelo <= 'z' && rangehi >= 'a')
2160         {
2161                 if (rangelo > 'a')
2162                         rangelo = 'a';
2163                 if (rangehi < 'z')
2164                         rangehi = 'z';
2165         }
2166         /* Ditto digits */
2167         if (rangelo <= '9' && rangehi >= '0')
2168         {
2169                 if (rangelo > '0')
2170                         rangelo = '0';
2171                 if (rangehi < '9')
2172                         rangehi = '9';
2173         }
2174
2175         /*
2176          * If range includes less than 10 chars, assume we have not got enough
2177          * data, and make it include regular ASCII set.
2178          */
2179         if (rangehi - rangelo < 9)
2180         {
2181                 rangelo = ' ';
2182                 rangehi = 127;
2183         }
2184
2185         /*
2186          * Now strip any common prefix of the three strings.
2187          */
2188         while (*lobound)
2189         {
2190                 if (*lobound != *hibound || *lobound != *value)
2191                         break;
2192                 lobound++, hibound++, value++;
2193         }
2194
2195         /*
2196          * Now we can do the conversions.
2197          */
2198         *scaledvalue = convert_one_string_to_scalar(value, rangelo, rangehi);
2199         *scaledlobound = convert_one_string_to_scalar(lobound, rangelo, rangehi);
2200         *scaledhibound = convert_one_string_to_scalar(hibound, rangelo, rangehi);
2201 }
2202
2203 static double
2204 convert_one_string_to_scalar(unsigned char *value, int rangelo, int rangehi)
2205 {
2206         int                     slen = strlen((char *) value);
2207         double          num,
2208                                 denom,
2209                                 base;
2210
2211         if (slen <= 0)
2212                 return 0.0;                             /* empty string has scalar value 0 */
2213
2214         /*
2215          * Since base is at least 10, need not consider more than about 20
2216          * chars
2217          */
2218         if (slen > 20)
2219                 slen = 20;
2220
2221         /* Convert initial characters to fraction */
2222         base = rangehi - rangelo + 1;
2223         num = 0.0;
2224         denom = base;
2225         while (slen-- > 0)
2226         {
2227                 int                     ch = *value++;
2228
2229                 if (ch < rangelo)
2230                         ch = rangelo - 1;
2231                 else if (ch > rangehi)
2232                         ch = rangehi + 1;
2233                 num += ((double) (ch - rangelo)) / denom;
2234                 denom *= base;
2235         }
2236
2237         return num;
2238 }
2239
2240 /*
2241  * Convert a string-type Datum into a palloc'd, null-terminated string.
2242  *
2243  * If USE_LOCALE is defined, we must pass the string through strxfrm()
2244  * before continuing, so as to generate correct locale-specific results.
2245  */
2246 static unsigned char *
2247 convert_string_datum(Datum value, Oid typid)
2248 {
2249         char       *val;
2250
2251 #ifdef USE_LOCALE
2252         char       *xfrmstr;
2253         size_t          xfrmsize;
2254         size_t          xfrmlen;
2255 #endif
2256
2257         switch (typid)
2258         {
2259                 case CHAROID:
2260                         val = (char *) palloc(2);
2261                         val[0] = DatumGetChar(value);
2262                         val[1] = '\0';
2263                         break;
2264                 case BPCHAROID:
2265                 case VARCHAROID:
2266                 case TEXTOID:
2267                         {
2268                                 char       *str = (char *) VARDATA(DatumGetPointer(value));
2269                                 int                     strlength = VARSIZE(DatumGetPointer(value)) - VARHDRSZ;
2270
2271                                 val = (char *) palloc(strlength + 1);
2272                                 memcpy(val, str, strlength);
2273                                 val[strlength] = '\0';
2274                                 break;
2275                         }
2276                 case NAMEOID:
2277                         {
2278                                 NameData   *nm = (NameData *) DatumGetPointer(value);
2279
2280                                 val = pstrdup(NameStr(*nm));
2281                                 break;
2282                         }
2283                 default:
2284
2285                         /*
2286                          * Can't get here unless someone tries to use scalarltsel on
2287                          * an operator with one string and one non-string operand.
2288                          */
2289                         elog(ERROR, "convert_string_datum: unsupported type %u", typid);
2290                         return NULL;
2291         }
2292
2293 #ifdef USE_LOCALE
2294         /* Guess that transformed string is not much bigger than original */
2295         xfrmsize = strlen(val) + 32;    /* arbitrary pad value here... */
2296         xfrmstr = (char *) palloc(xfrmsize);
2297         xfrmlen = strxfrm(xfrmstr, val, xfrmsize);
2298         if (xfrmlen >= xfrmsize)
2299         {
2300                 /* Oops, didn't make it */
2301                 pfree(xfrmstr);
2302                 xfrmstr = (char *) palloc(xfrmlen + 1);
2303                 xfrmlen = strxfrm(xfrmstr, val, xfrmlen + 1);
2304         }
2305         pfree(val);
2306         val = xfrmstr;
2307 #endif
2308
2309         return (unsigned char *) val;
2310 }
2311
2312 /*
2313  * Do convert_to_scalar()'s work for any bytea data type.
2314  *
2315  * Very similar to convert_string_to_scalar except we can't assume
2316  * null-termination and therefore pass explicit lengths around.
2317  *
2318  * Also, assumptions about likely "normal" ranges of characters have been
2319  * removed - a data range of 0..255 is always used, for now.  (Perhaps
2320  * someday we will add information about actual byte data range to
2321  * pg_statistic.)
2322  */
2323 static void
2324 convert_bytea_to_scalar(Datum value,
2325                                                 double *scaledvalue,
2326                                                 Datum lobound,
2327                                                 double *scaledlobound,
2328                                                 Datum hibound,
2329                                                 double *scaledhibound)
2330 {
2331         int                     rangelo,
2332                                 rangehi,
2333                                 valuelen = VARSIZE(DatumGetPointer(value)) - VARHDRSZ,
2334                                 loboundlen = VARSIZE(DatumGetPointer(lobound)) - VARHDRSZ,
2335                                 hiboundlen = VARSIZE(DatumGetPointer(hibound)) - VARHDRSZ,
2336                                 i,
2337                                 minlen;
2338         unsigned char *valstr = (unsigned char *) VARDATA(DatumGetPointer(value)),
2339                            *lostr = (unsigned char *) VARDATA(DatumGetPointer(lobound)),
2340                            *histr = (unsigned char *) VARDATA(DatumGetPointer(hibound));
2341
2342         /*
2343          * Assume bytea data is uniformly distributed across all byte values.
2344          */
2345         rangelo = 0;
2346         rangehi = 255;
2347
2348         /*
2349          * Now strip any common prefix of the three strings.
2350          */
2351         minlen = Min(Min(valuelen, loboundlen), hiboundlen);
2352         for (i = 0; i < minlen; i++)
2353         {
2354                 if (*lostr != *histr || *lostr != *valstr)
2355                         break;
2356                 lostr++, histr++, valstr++;
2357                 loboundlen--, hiboundlen--, valuelen--;
2358         }
2359
2360         /*
2361          * Now we can do the conversions.
2362          */
2363         *scaledvalue = convert_one_bytea_to_scalar(valstr, valuelen, rangelo, rangehi);
2364         *scaledlobound = convert_one_bytea_to_scalar(lostr, loboundlen, rangelo, rangehi);
2365         *scaledhibound = convert_one_bytea_to_scalar(histr, hiboundlen, rangelo, rangehi);
2366 }
2367
2368 static double
2369 convert_one_bytea_to_scalar(unsigned char *value, int valuelen,
2370                                                         int rangelo, int rangehi)
2371 {
2372         double          num,
2373                                 denom,
2374                                 base;
2375
2376         if (valuelen <= 0)
2377                 return 0.0;                             /* empty string has scalar value 0 */
2378
2379         /*
2380          * Since base is 256, need not consider more than about 10 chars (even
2381          * this many seems like overkill)
2382          */
2383         if (valuelen > 10)
2384                 valuelen = 10;
2385
2386         /* Convert initial characters to fraction */
2387         base = rangehi - rangelo + 1;
2388         num = 0.0;
2389         denom = base;
2390         while (valuelen-- > 0)
2391         {
2392                 int                     ch = *value++;
2393
2394                 if (ch < rangelo)
2395                         ch = rangelo - 1;
2396                 else if (ch > rangehi)
2397                         ch = rangehi + 1;
2398                 num += ((double) (ch - rangelo)) / denom;
2399                 denom *= base;
2400         }
2401
2402         return num;
2403 }
2404
2405 /*
2406  * Do convert_to_scalar()'s work for any timevalue data type.
2407  */
2408 static double
2409 convert_timevalue_to_scalar(Datum value, Oid typid)
2410 {
2411         switch (typid)
2412         {
2413                 case TIMESTAMPOID:
2414                         return DatumGetTimestamp(value);
2415                 case TIMESTAMPTZOID:
2416                         return DatumGetTimestampTz(value);
2417                 case ABSTIMEOID:
2418                         return DatumGetTimestamp(DirectFunctionCall1(abstime_timestamp,
2419                                                                                                                  value));
2420                 case DATEOID:
2421                         return DatumGetTimestamp(DirectFunctionCall1(date_timestamp,
2422                                                                                                                  value));
2423                 case INTERVALOID:
2424                         {
2425                                 Interval   *interval = DatumGetIntervalP(value);
2426
2427                                 /*
2428                                  * Convert the month part of Interval to days using
2429                                  * assumed average month length of 365.25/12.0 days.  Not
2430                                  * too accurate, but plenty good enough for our purposes.
2431                                  */
2432                                 return interval->time +
2433                                         interval->month * (365.25 / 12.0 * 24.0 * 60.0 * 60.0);
2434                         }
2435                 case RELTIMEOID:
2436                         return DatumGetRelativeTime(value);
2437                 case TINTERVALOID:
2438                         {
2439                                 TimeInterval interval = DatumGetTimeInterval(value);
2440
2441                                 if (interval->status != 0)
2442                                         return interval->data[1] - interval->data[0];
2443                                 return 0;               /* for lack of a better idea */
2444                         }
2445                 case TIMEOID:
2446                         return DatumGetTimeADT(value);
2447                 case TIMETZOID:
2448                         {
2449                                 TimeTzADT  *timetz = DatumGetTimeTzADTP(value);
2450
2451                                 /* use GMT-equivalent time */
2452                                 return (double) (timetz->time + timetz->zone);
2453                         }
2454         }
2455
2456         /*
2457          * Can't get here unless someone tries to use scalarltsel/scalargtsel
2458          * on an operator with one timevalue and one non-timevalue operand.
2459          */
2460         elog(ERROR, "convert_timevalue_to_scalar: unsupported type %u", typid);
2461         return 0;
2462 }
2463
2464
2465 /*
2466  * get_att_numdistinct
2467  *        Estimate the number of distinct values of an attribute.
2468  *
2469  * var: identifies the attribute to examine.
2470  * stats: pg_statistic tuple for attribute, or NULL if not available.
2471  *
2472  * NB: be careful to produce an integral result, since callers may compare
2473  * the result to exact integer counts.
2474  */
2475 static double
2476 get_att_numdistinct(Query *root, Var *var, Form_pg_statistic stats)
2477 {
2478         RelOptInfo *rel;
2479         double          ntuples;
2480
2481         /*
2482          * Special-case boolean columns: presumably, two distinct values.
2483          *
2484          * Are there any other cases we should wire in special estimates for?
2485          */
2486         if (var->vartype == BOOLOID)
2487                 return 2.0;
2488
2489         /*
2490          * Otherwise we need to get the relation size.
2491          */
2492         rel = find_base_rel(root, var->varno);
2493         ntuples = rel->tuples;
2494
2495         if (ntuples <= 0.0)
2496                 return DEFAULT_NUM_DISTINCT;    /* no data available; return a
2497                                                                                  * default */
2498
2499         /*
2500          * Look to see if there is a unique index on the attribute. If so, we
2501          * assume it's distinct, ignoring pg_statistic info which could be out
2502          * of date.
2503          */
2504         if (has_unique_index(rel, var->varattno))
2505                 return ntuples;
2506
2507         /*
2508          * If ANALYZE determined a fixed or scaled estimate, use it.
2509          */
2510         if (stats)
2511         {
2512                 if (stats->stadistinct > 0.0)
2513                         return stats->stadistinct;
2514                 if (stats->stadistinct < 0.0)
2515                         return floor((-stats->stadistinct * ntuples) + 0.5);
2516         }
2517
2518         /*
2519          * ANALYZE does not compute stats for system attributes, but some of
2520          * them can reasonably be assumed unique anyway.
2521          */
2522         switch (var->varattno)
2523         {
2524                 case ObjectIdAttributeNumber:
2525                 case SelfItemPointerAttributeNumber:
2526                         return ntuples;
2527                 case TableOidAttributeNumber:
2528                         return 1.0;
2529         }
2530
2531         /*
2532          * Estimate ndistinct = ntuples if the table is small, else use
2533          * default.
2534          */
2535         if (ntuples < DEFAULT_NUM_DISTINCT)
2536                 return ntuples;
2537
2538         return DEFAULT_NUM_DISTINCT;
2539 }
2540
2541 /*
2542  * get_restriction_var
2543  *              Examine the args of a restriction clause to see if it's of the
2544  *              form (var op something) or (something op var).  If so, extract
2545  *              and return the var and the other argument.
2546  *
2547  * Inputs:
2548  *      args: clause argument list
2549  *      varRelid: see specs for restriction selectivity functions
2550  *
2551  * Outputs: (these are set only if TRUE is returned)
2552  *      *var: gets Var node
2553  *      *other: gets other clause argument
2554  *      *varonleft: set TRUE if var is on the left, FALSE if on the right
2555  *
2556  * Returns TRUE if a Var is identified, otherwise FALSE.
2557  */
2558 static bool
2559 get_restriction_var(List *args,
2560                                         int varRelid,
2561                                         Var **var,
2562                                         Node **other,
2563                                         bool *varonleft)
2564 {
2565         Node       *left,
2566                            *right;
2567
2568         if (length(args) != 2)
2569                 return false;
2570
2571         left = (Node *) lfirst(args);
2572         right = (Node *) lsecond(args);
2573
2574         /* Ignore any binary-compatible relabeling */
2575
2576         if (IsA(left, RelabelType))
2577                 left = ((RelabelType *) left)->arg;
2578         if (IsA(right, RelabelType))
2579                 right = ((RelabelType *) right)->arg;
2580
2581         /* Look for the var */
2582
2583         if (IsA(left, Var) &&
2584                 (varRelid == 0 || varRelid == ((Var *) left)->varno))
2585         {
2586                 *var = (Var *) left;
2587                 *other = right;
2588                 *varonleft = true;
2589         }
2590         else if (IsA(right, Var) &&
2591                          (varRelid == 0 || varRelid == ((Var *) right)->varno))
2592         {
2593                 *var = (Var *) right;
2594                 *other = left;
2595                 *varonleft = false;
2596         }
2597         else
2598         {
2599                 /* Duh, it's too complicated for me... */
2600                 return false;
2601         }
2602
2603         return true;
2604 }
2605
2606 /*
2607  * get_join_vars
2608  *
2609  * Extract the two Vars from a join clause's argument list.  Returns
2610  * NULL for arguments that are not simple vars.
2611  */
2612 static void
2613 get_join_vars(List *args, Var **var1, Var **var2)
2614 {
2615         Node       *left,
2616                            *right;
2617
2618         if (length(args) != 2)
2619         {
2620                 *var1 = NULL;
2621                 *var2 = NULL;
2622                 return;
2623         }
2624
2625         left = (Node *) lfirst(args);
2626         right = (Node *) lsecond(args);
2627
2628         /* Ignore any binary-compatible relabeling */
2629         if (IsA(left, RelabelType))
2630                 left = ((RelabelType *) left)->arg;
2631         if (IsA(right, RelabelType))
2632                 right = ((RelabelType *) right)->arg;
2633
2634         if (IsA(left, Var))
2635                 *var1 = (Var *) left;
2636         else
2637                 *var1 = NULL;
2638
2639         if (IsA(right, Var))
2640                 *var2 = (Var *) right;
2641         else
2642                 *var2 = NULL;
2643 }
2644
2645 /*-------------------------------------------------------------------------
2646  *
2647  * Pattern analysis functions
2648  *
2649  * These routines support analysis of LIKE and regular-expression patterns
2650  * by the planner/optimizer.  It's important that they agree with the
2651  * regular-expression code in backend/regex/ and the LIKE code in
2652  * backend/utils/adt/like.c.
2653  *
2654  * Note that the prefix-analysis functions are called from
2655  * backend/optimizer/path/indxpath.c as well as from routines in this file.
2656  *
2657  *-------------------------------------------------------------------------
2658  */
2659
2660 /*
2661  * Extract the fixed prefix, if any, for a pattern.
2662  * *prefix is set to a palloc'd prefix string,
2663  * or to NULL if no fixed prefix exists for the pattern.
2664  * *rest is set to point to the remainder of the pattern after the
2665  * portion describing the fixed prefix.
2666  * The return value distinguishes no fixed prefix, a partial prefix,
2667  * or an exact-match-only pattern.
2668  */
2669
2670 static Pattern_Prefix_Status
2671 like_fixed_prefix(char *patt, bool case_insensitive,
2672                                   char **prefix, char **rest)
2673 {
2674         char       *match;
2675         int                     pos,
2676                                 match_pos;
2677
2678         *prefix = match = palloc(strlen(patt) + 1);
2679         match_pos = 0;
2680
2681         for (pos = 0; patt[pos]; pos++)
2682         {
2683                 /* % and _ are wildcard characters in LIKE */
2684                 if (patt[pos] == '%' ||
2685                         patt[pos] == '_')
2686                         break;
2687                 /* Backslash quotes the next character */
2688                 if (patt[pos] == '\\')
2689                 {
2690                         pos++;
2691                         if (patt[pos] == '\0')
2692                                 break;
2693                 }
2694
2695                 /*
2696                  * XXX I suspect isalpha() is not an adequately locale-sensitive
2697                  * test for characters that can vary under case folding?
2698                  */
2699                 if (case_insensitive && isalpha((unsigned char) patt[pos]))
2700                         break;
2701
2702                 /*
2703                  * NOTE: this code used to think that %% meant a literal %, but
2704                  * textlike() itself does not think that, and the SQL92 spec
2705                  * doesn't say any such thing either.
2706                  */
2707                 match[match_pos++] = patt[pos];
2708         }
2709
2710         match[match_pos] = '\0';
2711         *rest = &patt[pos];
2712
2713         /* in LIKE, an empty pattern is an exact match! */
2714         if (patt[pos] == '\0')
2715                 return Pattern_Prefix_Exact;    /* reached end of pattern, so
2716                                                                                  * exact */
2717
2718         if (match_pos > 0)
2719                 return Pattern_Prefix_Partial;
2720
2721         pfree(match);
2722         *prefix = NULL;
2723         return Pattern_Prefix_None;
2724 }
2725
2726 static Pattern_Prefix_Status
2727 regex_fixed_prefix(char *patt, bool case_insensitive,
2728                                    char **prefix, char **rest)
2729 {
2730         char       *match;
2731         int                     pos,
2732                                 match_pos,
2733                                 paren_depth;
2734
2735         /* Pattern must be anchored left */
2736         if (patt[0] != '^')
2737         {
2738                 *prefix = NULL;
2739                 *rest = patt;
2740                 return Pattern_Prefix_None;
2741         }
2742
2743         /*
2744          * If unquoted | is present at paren level 0 in pattern, then there
2745          * are multiple alternatives for the start of the string.
2746          */
2747         paren_depth = 0;
2748         for (pos = 1; patt[pos]; pos++)
2749         {
2750                 if (patt[pos] == '|' && paren_depth == 0)
2751                 {
2752                         *prefix = NULL;
2753                         *rest = patt;
2754                         return Pattern_Prefix_None;
2755                 }
2756                 else if (patt[pos] == '(')
2757                         paren_depth++;
2758                 else if (patt[pos] == ')' && paren_depth > 0)
2759                         paren_depth--;
2760                 else if (patt[pos] == '\\')
2761                 {
2762                         /* backslash quotes the next character */
2763                         pos++;
2764                         if (patt[pos] == '\0')
2765                                 break;
2766                 }
2767         }
2768
2769         /* OK, allocate space for pattern */
2770         *prefix = match = palloc(strlen(patt) + 1);
2771         match_pos = 0;
2772
2773         /* note start at pos 1 to skip leading ^ */
2774         for (pos = 1; patt[pos]; pos++)
2775         {
2776                 /*
2777                  * Check for characters that indicate multiple possible matches
2778                  * here. XXX I suspect isalpha() is not an adequately
2779                  * locale-sensitive test for characters that can vary under case
2780                  * folding?
2781                  */
2782                 if (patt[pos] == '.' ||
2783                         patt[pos] == '(' ||
2784                         patt[pos] == '[' ||
2785                         patt[pos] == '$' ||
2786                         (case_insensitive && isalpha((unsigned char) patt[pos])))
2787                         break;
2788
2789                 /*
2790                  * Check for quantifiers.  Except for +, this means the preceding
2791                  * character is optional, so we must remove it from the prefix
2792                  * too!
2793                  */
2794                 if (patt[pos] == '*' ||
2795                         patt[pos] == '?' ||
2796                         patt[pos] == '{')
2797                 {
2798                         if (match_pos > 0)
2799                                 match_pos--;
2800                         pos--;
2801                         break;
2802                 }
2803                 if (patt[pos] == '+')
2804                 {
2805                         pos--;
2806                         break;
2807                 }
2808                 if (patt[pos] == '\\')
2809                 {
2810                         /* backslash quotes the next character */
2811                         pos++;
2812                         if (patt[pos] == '\0')
2813                                 break;
2814                 }
2815                 match[match_pos++] = patt[pos];
2816         }
2817
2818         match[match_pos] = '\0';
2819         *rest = &patt[pos];
2820
2821         if (patt[pos] == '$' && patt[pos + 1] == '\0')
2822         {
2823                 *rest = &patt[pos + 1];
2824                 return Pattern_Prefix_Exact;    /* pattern specifies exact match */
2825         }
2826
2827         if (match_pos > 0)
2828                 return Pattern_Prefix_Partial;
2829
2830         pfree(match);
2831         *prefix = NULL;
2832         return Pattern_Prefix_None;
2833 }
2834
2835 Pattern_Prefix_Status
2836 pattern_fixed_prefix(char *patt, Pattern_Type ptype,
2837                                          char **prefix, char **rest)
2838 {
2839         Pattern_Prefix_Status result;
2840
2841         switch (ptype)
2842         {
2843                 case Pattern_Type_Like:
2844                         result = like_fixed_prefix(patt, false, prefix, rest);
2845                         break;
2846                 case Pattern_Type_Like_IC:
2847                         result = like_fixed_prefix(patt, true, prefix, rest);
2848                         break;
2849                 case Pattern_Type_Regex:
2850                         result = regex_fixed_prefix(patt, false, prefix, rest);
2851                         break;
2852                 case Pattern_Type_Regex_IC:
2853                         result = regex_fixed_prefix(patt, true, prefix, rest);
2854                         break;
2855                 default:
2856                         elog(ERROR, "pattern_fixed_prefix: bogus ptype");
2857                         result = Pattern_Prefix_None;           /* keep compiler quiet */
2858                         break;
2859         }
2860         return result;
2861 }
2862
2863 /*
2864  * Estimate the selectivity of a fixed prefix for a pattern match.
2865  *
2866  * A fixed prefix "foo" is estimated as the selectivity of the expression
2867  * "var >= 'foo' AND var < 'fop'" (see also indxqual.c).
2868  *
2869  * XXX Note: we make use of the upper bound to estimate operator selectivity
2870  * even if the locale is such that we cannot rely on the upper-bound string.
2871  * The selectivity only needs to be approximately right anyway, so it seems
2872  * more useful to use the upper-bound code than not.
2873  */
2874 static Selectivity
2875 prefix_selectivity(Query *root, Var *var, char *prefix)
2876 {
2877         Selectivity prefixsel;
2878         Oid                     cmpopr;
2879         Const      *prefixcon;
2880         List       *cmpargs;
2881         char       *greaterstr;
2882
2883         cmpopr = find_operator(">=", var->vartype);
2884         if (cmpopr == InvalidOid)
2885                 elog(ERROR, "prefix_selectivity: no >= operator for type %u",
2886                          var->vartype);
2887         prefixcon = string_to_const(prefix, var->vartype);
2888         cmpargs = makeList2(var, prefixcon);
2889         /* Assume scalargtsel is appropriate for all supported types */
2890         prefixsel = DatumGetFloat8(DirectFunctionCall4(scalargtsel,
2891                                                                                                    PointerGetDatum(root),
2892                                                                                                 ObjectIdGetDatum(cmpopr),
2893                                                                                                 PointerGetDatum(cmpargs),
2894                                                                                                    Int32GetDatum(0)));
2895
2896         /*-------
2897          * If we can create a string larger than the prefix, say
2898          *      "x < greaterstr".
2899          *-------
2900          */
2901         greaterstr = make_greater_string(prefix, var->vartype);
2902         if (greaterstr)
2903         {
2904                 Selectivity topsel;
2905
2906                 cmpopr = find_operator("<", var->vartype);
2907                 if (cmpopr == InvalidOid)
2908                         elog(ERROR, "prefix_selectivity: no < operator for type %u",
2909                                  var->vartype);
2910                 prefixcon = string_to_const(greaterstr, var->vartype);
2911                 cmpargs = makeList2(var, prefixcon);
2912                 /* Assume scalarltsel is appropriate for all supported types */
2913                 topsel = DatumGetFloat8(DirectFunctionCall4(scalarltsel,
2914                                                                                                         PointerGetDatum(root),
2915                                                                                                 ObjectIdGetDatum(cmpopr),
2916                                                                                                 PointerGetDatum(cmpargs),
2917                                                                                                         Int32GetDatum(0)));
2918
2919                 /*
2920                  * Merge the two selectivities in the same way as for a range
2921                  * query (see clauselist_selectivity()).
2922                  */
2923                 prefixsel = topsel + prefixsel - 1.0;
2924
2925                 /*
2926                  * A zero or slightly negative prefixsel should be converted into
2927                  * a small positive value; we probably are dealing with a very
2928                  * tight range and got a bogus result due to roundoff errors.
2929                  * However, if prefixsel is very negative, then we probably have
2930                  * default selectivity estimates on one or both sides of the
2931                  * range.  In that case, insert a not-so-wildly-optimistic default
2932                  * estimate.
2933                  */
2934                 if (prefixsel <= 0.0)
2935                 {
2936                         if (prefixsel < -0.01)
2937                         {
2938                                 /*
2939                                  * No data available --- use a default estimate that is
2940                                  * small, but not real small.
2941                                  */
2942                                 prefixsel = 0.005;
2943                         }
2944                         else
2945                         {
2946                                 /*
2947                                  * It's just roundoff error; use a small positive value
2948                                  */
2949                                 prefixsel = 1.0e-10;
2950                         }
2951                 }
2952         }
2953
2954         return prefixsel;
2955 }
2956
2957
2958 /*
2959  * Estimate the selectivity of a pattern of the specified type.
2960  * Note that any fixed prefix of the pattern will have been removed already.
2961  *
2962  * For now, we use a very simplistic approach: fixed characters reduce the
2963  * selectivity a good deal, character ranges reduce it a little,
2964  * wildcards (such as % for LIKE or .* for regex) increase it.
2965  */
2966
2967 #define FIXED_CHAR_SEL  0.04    /* about 1/25 */
2968 #define CHAR_RANGE_SEL  0.25
2969 #define ANY_CHAR_SEL    0.9             /* not 1, since it won't match
2970                                                                  * end-of-string */
2971 #define FULL_WILDCARD_SEL 5.0
2972 #define PARTIAL_WILDCARD_SEL 2.0
2973
2974 static Selectivity
2975 like_selectivity(char *patt, bool case_insensitive)
2976 {
2977         Selectivity sel = 1.0;
2978         int                     pos;
2979
2980         /* Skip any leading %; it's already factored into initial sel */
2981         pos = (*patt == '%') ? 1 : 0;
2982         for (; patt[pos]; pos++)
2983         {
2984                 /* % and _ are wildcard characters in LIKE */
2985                 if (patt[pos] == '%')
2986                         sel *= FULL_WILDCARD_SEL;
2987                 else if (patt[pos] == '_')
2988                         sel *= ANY_CHAR_SEL;
2989                 else if (patt[pos] == '\\')
2990                 {
2991                         /* Backslash quotes the next character */
2992                         pos++;
2993                         if (patt[pos] == '\0')
2994                                 break;
2995                         sel *= FIXED_CHAR_SEL;
2996                 }
2997                 else
2998                         sel *= FIXED_CHAR_SEL;
2999         }
3000         /* Could get sel > 1 if multiple wildcards */
3001         if (sel > 1.0)
3002                 sel = 1.0;
3003         return sel;
3004 }
3005
3006 static Selectivity
3007 regex_selectivity_sub(char *patt, int pattlen, bool case_insensitive)
3008 {
3009         Selectivity sel = 1.0;
3010         int                     paren_depth = 0;
3011         int                     paren_pos = 0;  /* dummy init to keep compiler quiet */
3012         int                     pos;
3013
3014         for (pos = 0; pos < pattlen; pos++)
3015         {
3016                 if (patt[pos] == '(')
3017                 {
3018                         if (paren_depth == 0)
3019                                 paren_pos = pos;        /* remember start of parenthesized item */
3020                         paren_depth++;
3021                 }
3022                 else if (patt[pos] == ')' && paren_depth > 0)
3023                 {
3024                         paren_depth--;
3025                         if (paren_depth == 0)
3026                                 sel *= regex_selectivity_sub(patt + (paren_pos + 1),
3027                                                                                          pos - (paren_pos + 1),
3028                                                                                          case_insensitive);
3029                 }
3030                 else if (patt[pos] == '|' && paren_depth == 0)
3031                 {
3032                         /*
3033                          * If unquoted | is present at paren level 0 in pattern, we
3034                          * have multiple alternatives; sum their probabilities.
3035                          */
3036                         sel += regex_selectivity_sub(patt + (pos + 1),
3037                                                                                  pattlen - (pos + 1),
3038                                                                                  case_insensitive);
3039                         break;                          /* rest of pattern is now processed */
3040                 }
3041                 else if (patt[pos] == '[')
3042                 {
3043                         bool            negclass = false;
3044
3045                         if (patt[++pos] == '^')
3046                         {
3047                                 negclass = true;
3048                                 pos++;
3049                         }
3050                         if (patt[pos] == ']')           /* ']' at start of class is not
3051                                                                                  * special */
3052                                 pos++;
3053                         while (pos < pattlen && patt[pos] != ']')
3054                                 pos++;
3055                         if (paren_depth == 0)
3056                                 sel *= (negclass ? (1.0 - CHAR_RANGE_SEL) : CHAR_RANGE_SEL);
3057                 }
3058                 else if (patt[pos] == '.')
3059                 {
3060                         if (paren_depth == 0)
3061                                 sel *= ANY_CHAR_SEL;
3062                 }
3063                 else if (patt[pos] == '*' ||
3064                                  patt[pos] == '?' ||
3065                                  patt[pos] == '+')
3066                 {
3067                         /* Ought to be smarter about quantifiers... */
3068                         if (paren_depth == 0)
3069                                 sel *= PARTIAL_WILDCARD_SEL;
3070                 }
3071                 else if (patt[pos] == '{')
3072                 {
3073                         while (pos < pattlen && patt[pos] != '}')
3074                                 pos++;
3075                         if (paren_depth == 0)
3076                                 sel *= PARTIAL_WILDCARD_SEL;
3077                 }
3078                 else if (patt[pos] == '\\')
3079                 {
3080                         /* backslash quotes the next character */
3081                         pos++;
3082                         if (pos >= pattlen)
3083                                 break;
3084                         if (paren_depth == 0)
3085                                 sel *= FIXED_CHAR_SEL;
3086                 }
3087                 else
3088                 {
3089                         if (paren_depth == 0)
3090                                 sel *= FIXED_CHAR_SEL;
3091                 }
3092         }
3093         /* Could get sel > 1 if multiple wildcards */
3094         if (sel > 1.0)
3095                 sel = 1.0;
3096         return sel;
3097 }
3098
3099 static Selectivity
3100 regex_selectivity(char *patt, bool case_insensitive)
3101 {
3102         Selectivity sel;
3103         int                     pattlen = strlen(patt);
3104
3105         /* If patt doesn't end with $, consider it to have a trailing wildcard */
3106         if (pattlen > 0 && patt[pattlen - 1] == '$' &&
3107                 (pattlen == 1 || patt[pattlen - 2] != '\\'))
3108         {
3109                 /* has trailing $ */
3110                 sel = regex_selectivity_sub(patt, pattlen - 1, case_insensitive);
3111         }
3112         else
3113         {
3114                 /* no trailing $ */
3115                 sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
3116                 sel *= FULL_WILDCARD_SEL;
3117                 if (sel > 1.0)
3118                         sel = 1.0;
3119         }
3120         return sel;
3121 }
3122
3123 static Selectivity
3124 pattern_selectivity(char *patt, Pattern_Type ptype)
3125 {
3126         Selectivity result;
3127
3128         switch (ptype)
3129         {
3130                 case Pattern_Type_Like:
3131                         result = like_selectivity(patt, false);
3132                         break;
3133                 case Pattern_Type_Like_IC:
3134                         result = like_selectivity(patt, true);
3135                         break;
3136                 case Pattern_Type_Regex:
3137                         result = regex_selectivity(patt, false);
3138                         break;
3139                 case Pattern_Type_Regex_IC:
3140                         result = regex_selectivity(patt, true);
3141                         break;
3142                 default:
3143                         elog(ERROR, "pattern_selectivity: bogus ptype");
3144                         result = 1.0;           /* keep compiler quiet */
3145                         break;
3146         }
3147         return result;
3148 }
3149
3150 /*
3151  * Test whether the database's LOCALE setting is safe for LIKE/regexp index
3152  * optimization.  The key requirement here is that given a prefix string,
3153  * say "foo", we must be able to generate another string "fop" that is
3154  * greater than all strings "foobar" starting with "foo".  Unfortunately,
3155  * many non-C locales have bizarre collation rules in which "fop" > "foo"
3156  * is not sufficient to ensure "fop" > "foobar".  Until we can come up
3157  * with a more bulletproof way of generating the upper-bound string,
3158  * disable the optimization in locales where it is not known to be safe.
3159  */
3160 bool
3161 locale_is_like_safe(void)
3162 {
3163 #ifdef USE_LOCALE
3164         /* Cache result so we only have to compute it once */
3165         static int      result = -1;
3166         char       *localeptr;
3167
3168         if (result >= 0)
3169                 return (bool) result;
3170         localeptr = setlocale(LC_COLLATE, NULL);
3171         if (!localeptr)
3172                 elog(STOP, "Invalid LC_COLLATE setting");
3173
3174         /*
3175          * Currently we accept only "C" and "POSIX" (do any systems still
3176          * return "POSIX"?).  Which other locales allow safe optimization?
3177          */
3178         if (strcmp(localeptr, "C") == 0)
3179                 result = true;
3180         else if (strcmp(localeptr, "POSIX") == 0)
3181                 result = true;
3182         else
3183                 result = false;
3184         return (bool) result;
3185 #else                                                   /* not USE_LOCALE */
3186         return true;                            /* We must be in C locale, which is OK */
3187 #endif   /* USE_LOCALE */
3188 }
3189
3190 /*
3191  * Try to generate a string greater than the given string or any string it is
3192  * a prefix of.  If successful, return a palloc'd string; else return NULL.
3193  *
3194  * To work correctly in non-ASCII locales with weird collation orders,
3195  * we cannot simply increment "foo" to "fop" --- we have to check whether
3196  * we actually produced a string greater than the given one.  If not,
3197  * increment the righthand byte again and repeat.  If we max out the righthand
3198  * byte, truncate off the last character and start incrementing the next.
3199  * For example, if "z" were the last character in the sort order, then we
3200  * could produce "foo" as a string greater than "fonz".
3201  *
3202  * This could be rather slow in the worst case, but in most cases we won't
3203  * have to try more than one or two strings before succeeding.
3204  *
3205  * XXX this is actually not sufficient, since it only copes with the case
3206  * where individual characters collate in an order different from their
3207  * numeric code assignments.  It does not handle cases where there are
3208  * cross-character effects, such as specially sorted digraphs, multiple
3209  * sort passes, etc.  For now, we just shut down the whole thing in locales
3210  * that do such things :-(
3211  */
3212 char *
3213 make_greater_string(const char *str, Oid datatype)
3214 {
3215         char       *workstr;
3216         int                     len;
3217
3218         /*
3219          * Make a modifiable copy, which will be our return value if
3220          * successful
3221          */
3222         workstr = pstrdup((char *) str);
3223
3224         while ((len = strlen(workstr)) > 0)
3225         {
3226                 unsigned char *lastchar = (unsigned char *) (workstr + len - 1);
3227
3228                 /*
3229                  * Try to generate a larger string by incrementing the last byte.
3230                  */
3231                 while (*lastchar < (unsigned char) 255)
3232                 {
3233                         (*lastchar)++;
3234                         if (string_lessthan(str, workstr, datatype))
3235                                 return workstr; /* Success! */
3236                 }
3237
3238                 /*
3239                  * Truncate off the last character, which might be more than 1
3240                  * byte in MULTIBYTE case.
3241                  */
3242 #ifdef MULTIBYTE
3243                 len = pg_mbcliplen((const unsigned char *) workstr, len, len - 1);
3244                 workstr[len] = '\0';
3245 #else
3246                 *lastchar = '\0';
3247 #endif
3248         }
3249
3250         /* Failed... */
3251         pfree(workstr);
3252         return NULL;
3253 }
3254
3255 /*
3256  * Test whether two strings are "<" according to the rules of the given
3257  * datatype.  We do this the hard way, ie, actually calling the type's
3258  * "<" operator function, to ensure we get the right result...
3259  */
3260 static bool
3261 string_lessthan(const char *str1, const char *str2, Oid datatype)
3262 {
3263         Datum           datum1 = string_to_datum(str1, datatype);
3264         Datum           datum2 = string_to_datum(str2, datatype);
3265         bool            result;
3266
3267         switch (datatype)
3268         {
3269                 case TEXTOID:
3270                         result = DatumGetBool(DirectFunctionCall2(text_lt,
3271                                                                                                           datum1, datum2));
3272                         break;
3273
3274                 case BPCHAROID:
3275                         result = DatumGetBool(DirectFunctionCall2(bpcharlt,
3276                                                                                                           datum1, datum2));
3277                         break;
3278
3279                 case VARCHAROID:
3280                         result = DatumGetBool(DirectFunctionCall2(varcharlt,
3281                                                                                                           datum1, datum2));
3282                         break;
3283
3284                 case NAMEOID:
3285                         result = DatumGetBool(DirectFunctionCall2(namelt,
3286                                                                                                           datum1, datum2));
3287                         break;
3288
3289                 case BYTEAOID:
3290                         result = DatumGetBool(DirectFunctionCall2(bytealt,
3291                                                                                                           datum1, datum2));
3292                         break;
3293
3294                 default:
3295                         elog(ERROR, "string_lessthan: unexpected datatype %u", datatype);
3296                         result = false;
3297                         break;
3298         }
3299
3300         pfree(DatumGetPointer(datum1));
3301         pfree(DatumGetPointer(datum2));
3302
3303         return result;
3304 }
3305
3306 /* See if there is a binary op of the given name for the given datatype */
3307 static Oid
3308 find_operator(const char *opname, Oid datatype)
3309 {
3310         return GetSysCacheOid(OPERNAME,
3311                                                   PointerGetDatum(opname),
3312                                                   ObjectIdGetDatum(datatype),
3313                                                   ObjectIdGetDatum(datatype),
3314                                                   CharGetDatum('b'));
3315 }
3316
3317 /*
3318  * Generate a Datum of the appropriate type from a C string.
3319  * Note that all of the supported types are pass-by-ref, so the
3320  * returned value should be pfree'd if no longer needed.
3321  */
3322 static Datum
3323 string_to_datum(const char *str, Oid datatype)
3324 {
3325         /*
3326          * We cheat a little by assuming that textin() will do for bpchar and
3327          * varchar constants too...
3328          */
3329         if (datatype == NAMEOID)
3330                 return DirectFunctionCall1(namein, CStringGetDatum(str));
3331         else
3332                 return DirectFunctionCall1(textin, CStringGetDatum(str));
3333 }
3334
3335 /*
3336  * Generate a Const node of the appropriate type from a C string.
3337  */
3338 static Const *
3339 string_to_const(const char *str, Oid datatype)
3340 {
3341         Datum           conval = string_to_datum(str, datatype);
3342
3343         return makeConst(datatype, ((datatype == NAMEOID) ? NAMEDATALEN : -1),
3344                                          conval, false, false, false, false);
3345 }
3346
3347 /*-------------------------------------------------------------------------
3348  *
3349  * Index cost estimation functions
3350  *
3351  * genericcostestimate is a general-purpose estimator for use when we
3352  * don't have any better idea about how to estimate.  Index-type-specific
3353  * knowledge can be incorporated in the type-specific routines.
3354  *
3355  *-------------------------------------------------------------------------
3356  */
3357
3358 static void
3359 genericcostestimate(Query *root, RelOptInfo *rel,
3360                                         IndexOptInfo *index, List *indexQuals,
3361                                         Cost *indexStartupCost,
3362                                         Cost *indexTotalCost,
3363                                         Selectivity *indexSelectivity,
3364                                         double *indexCorrelation)
3365 {
3366         double          numIndexTuples;
3367         double          numIndexPages;
3368         List       *selectivityQuals = indexQuals;
3369
3370         /*
3371          * If the index is partial, AND the index predicate with the
3372          * explicitly given indexquals to produce a more accurate idea of the
3373          * index restriction.  This may produce redundant clauses, which we
3374          * hope that cnfify and clauselist_selectivity will deal with
3375          * intelligently.
3376          *
3377          * Note that index->indpred and indexQuals are both in implicit-AND form
3378          * to start with, which we have to make explicit to hand to
3379          * canonicalize_qual, and then we get back implicit-AND form again.
3380          */
3381         if (index->indpred != NIL)
3382         {
3383                 Expr       *andedQuals;
3384
3385                 andedQuals = make_ands_explicit(nconc(listCopy(index->indpred),
3386                                                                                           indexQuals));
3387                 selectivityQuals = canonicalize_qual(andedQuals, true);
3388         }
3389
3390         /* Estimate the fraction of main-table tuples that will be visited */
3391         *indexSelectivity = clauselist_selectivity(root, selectivityQuals,
3392                                                                                            lfirsti(rel->relids));
3393
3394         /*
3395          * Estimate the number of tuples that will be visited.  We do it in
3396          * this rather peculiar-looking way in order to get the right answer
3397          * for partial indexes.  We can bound the number of tuples by the
3398          * index size, in any case.
3399          */
3400         numIndexTuples = *indexSelectivity * rel->tuples;
3401
3402         if (numIndexTuples > index->tuples)
3403                 numIndexTuples = index->tuples;
3404
3405         /*
3406          * Always estimate at least one tuple is touched, even when
3407          * indexSelectivity estimate is tiny.
3408          */
3409         if (numIndexTuples < 1.0)
3410                 numIndexTuples = 1.0;
3411
3412         /*
3413          * Estimate the number of index pages that will be retrieved.
3414          *
3415          * For all currently-supported index types, the first page of the index
3416          * is a metadata page, and we should figure on fetching that plus a
3417          * pro-rated fraction of the remaining pages.
3418          */
3419         if (index->pages > 1 && index->tuples > 0)
3420         {
3421                 numIndexPages = (numIndexTuples / index->tuples) * (index->pages - 1);
3422                 numIndexPages += 1;             /* count the metapage too */
3423                 numIndexPages = ceil(numIndexPages);
3424         }
3425         else
3426                 numIndexPages = 1.0;
3427
3428         /*
3429          * Compute the index access cost.
3430          *
3431          * Our generic assumption is that the index pages will be read
3432          * sequentially, so they have cost 1.0 each, not random_page_cost.
3433          * Also, we charge for evaluation of the indexquals at each index
3434          * tuple. All the costs are assumed to be paid incrementally during
3435          * the scan.
3436          */
3437         *indexStartupCost = 0;
3438         *indexTotalCost = numIndexPages +
3439                 (cpu_index_tuple_cost + cost_qual_eval(indexQuals)) * numIndexTuples;
3440
3441         /*
3442          * Generic assumption about index correlation: there isn't any.
3443          */
3444         *indexCorrelation = 0.0;
3445 }
3446
3447
3448 Datum
3449 btcostestimate(PG_FUNCTION_ARGS)
3450 {
3451         Query      *root = (Query *) PG_GETARG_POINTER(0);
3452         RelOptInfo *rel = (RelOptInfo *) PG_GETARG_POINTER(1);
3453         IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(2);
3454         List       *indexQuals = (List *) PG_GETARG_POINTER(3);
3455         Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(4);
3456         Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(5);
3457         Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6);
3458         double     *indexCorrelation = (double *) PG_GETARG_POINTER(7);
3459
3460         genericcostestimate(root, rel, index, indexQuals,
3461                                                 indexStartupCost, indexTotalCost,
3462                                                 indexSelectivity, indexCorrelation);
3463
3464         /*
3465          * If it's a functional index, leave the default zero-correlation
3466          * estimate in place.  If not, and if we can get an estimate for the
3467          * first variable's ordering correlation C from pg_statistic, estimate
3468          * the index correlation as C / number-of-columns. (The idea here is
3469          * that multiple columns dilute the importance of the first column's
3470          * ordering, but don't negate it entirely.)
3471          */
3472         if (index->indproc == InvalidOid)
3473         {
3474                 Oid                     relid;
3475                 HeapTuple       tuple;
3476
3477                 relid = getrelid(lfirsti(rel->relids), root->rtable);
3478                 Assert(relid != InvalidOid);
3479                 tuple = SearchSysCache(STATRELATT,
3480                                                            ObjectIdGetDatum(relid),
3481                                                            Int16GetDatum(index->indexkeys[0]),
3482                                                            0, 0);
3483                 if (HeapTupleIsValid(tuple))
3484                 {
3485                         Oid                     typid;
3486                         int32           typmod;
3487                         float4     *numbers;
3488                         int                     nnumbers;
3489
3490                         get_atttypetypmod(relid, index->indexkeys[0],
3491                                                           &typid, &typmod);
3492                         if (get_attstatsslot(tuple, typid, typmod,
3493                                                                  STATISTIC_KIND_CORRELATION,
3494                                                                  index->ordering[0],
3495                                                                  NULL, NULL, &numbers, &nnumbers))
3496                         {
3497                                 double          varCorrelation;
3498                                 int                     nKeys;
3499
3500                                 Assert(nnumbers == 1);
3501                                 varCorrelation = numbers[0];
3502                                 for (nKeys = 1; index->indexkeys[nKeys] != 0; nKeys++)
3503                                          /* skip */ ;
3504
3505                                 *indexCorrelation = varCorrelation / nKeys;
3506
3507                                 free_attstatsslot(typid, NULL, 0, numbers, nnumbers);
3508                         }
3509                         ReleaseSysCache(tuple);
3510                 }
3511         }
3512
3513         PG_RETURN_VOID();
3514 }
3515
3516 Datum
3517 rtcostestimate(PG_FUNCTION_ARGS)
3518 {
3519         Query      *root = (Query *) PG_GETARG_POINTER(0);
3520         RelOptInfo *rel = (RelOptInfo *) PG_GETARG_POINTER(1);
3521         IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(2);
3522         List       *indexQuals = (List *) PG_GETARG_POINTER(3);
3523         Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(4);
3524         Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(5);
3525         Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6);
3526         double     *indexCorrelation = (double *) PG_GETARG_POINTER(7);
3527
3528         genericcostestimate(root, rel, index, indexQuals,
3529                                                 indexStartupCost, indexTotalCost,
3530                                                 indexSelectivity, indexCorrelation);
3531
3532         PG_RETURN_VOID();
3533 }
3534
3535 Datum
3536 hashcostestimate(PG_FUNCTION_ARGS)
3537 {
3538         Query      *root = (Query *) PG_GETARG_POINTER(0);
3539         RelOptInfo *rel = (RelOptInfo *) PG_GETARG_POINTER(1);
3540         IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(2);
3541         List       *indexQuals = (List *) PG_GETARG_POINTER(3);
3542         Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(4);
3543         Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(5);
3544         Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6);
3545         double     *indexCorrelation = (double *) PG_GETARG_POINTER(7);
3546
3547         genericcostestimate(root, rel, index, indexQuals,
3548                                                 indexStartupCost, indexTotalCost,
3549                                                 indexSelectivity, indexCorrelation);
3550
3551         PG_RETURN_VOID();
3552 }
3553
3554 Datum
3555 gistcostestimate(PG_FUNCTION_ARGS)
3556 {
3557         Query      *root = (Query *) PG_GETARG_POINTER(0);
3558         RelOptInfo *rel = (RelOptInfo *) PG_GETARG_POINTER(1);
3559         IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(2);
3560         List       *indexQuals = (List *) PG_GETARG_POINTER(3);
3561         Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(4);
3562         Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(5);
3563         Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6);
3564         double     *indexCorrelation = (double *) PG_GETARG_POINTER(7);
3565
3566         genericcostestimate(root, rel, index, indexQuals,
3567                                                 indexStartupCost, indexTotalCost,
3568                                                 indexSelectivity, indexCorrelation);
3569
3570         PG_RETURN_VOID();
3571 }