granicus.if.org Git - postgresql/blob - src/backend/utils/adt/selfuncs.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * selfuncs.c
   4  *        Selectivity functions and index cost estimation functions for
   5  *        standard operators and index access methods.
   6  *
   7  *        Selectivity routines are registered in the pg_operator catalog
   8  *        in the "oprrest" and "oprjoin" attributes.
   9  *
  10  *        Index cost functions are registered in the pg_am catalog
  11  *        in the "amcostestimate" attribute.
  12  *
  13  * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
  14  * Portions Copyright (c) 1994, Regents of the University of California
  15  *
  16  *
  17  * IDENTIFICATION
  18  *        $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.129 2003/01/24 03:58:43 tgl Exp $
  19  *
  20  *-------------------------------------------------------------------------
  21  */
  22
  23 /*----------
  24  * Operator selectivity estimation functions are called to estimate the
  25  * selectivity of WHERE clauses whose top-level operator is their operator.
  26  * We divide the problem into two cases:
  27  *              Restriction clause estimation: the clause involves vars of just
  28  *                      one relation.
  29  *              Join clause estimation: the clause involves vars of multiple rels.
  30  * Join selectivity estimation is far more difficult and usually less accurate
  31  * than restriction estimation.
  32  *
  33  * When dealing with the inner scan of a nestloop join, we consider the
  34  * join's joinclauses as restriction clauses for the inner relation, and
  35  * treat vars of the outer relation as parameters (a/k/a constants of unknown
  36  * values).  So, restriction estimators need to be able to accept an argument
  37  * telling which relation is to be treated as the variable.
  38  *
  39  * The call convention for a restriction estimator (oprrest function) is
  40  *
  41  *              Selectivity oprrest (Query *root,
  42  *                                                       Oid operator,
  43  *                                                       List *args,
  44  *                                                       int varRelid);
  45  *
  46  * root: general information about the query (rtable and RelOptInfo lists
  47  * are particularly important for the estimator).
  48  * operator: OID of the specific operator in question.
  49  * args: argument list from the operator clause.
  50  * varRelid: if not zero, the relid (rtable index) of the relation to
  51  * be treated as the variable relation.  May be zero if the args list
  52  * is known to contain vars of only one relation.
  53  *
  54  * This is represented at the SQL level (in pg_proc) as
  55  *
  56  *              float8 oprrest (internal, oid, internal, int4);
  57  *
  58  * The call convention for a join estimator (oprjoin function) is similar
  59  * except that varRelid is not needed:
  60  *
  61  *              Selectivity oprjoin (Query *root,
  62  *                                                       Oid operator,
  63  *                                                       List *args);
  64  *
  65  *              float8 oprjoin (internal, oid, internal);
  66  *----------
  67  */
  68
  69 #include "postgres.h"
  70
  71 #include <ctype.h>
  72 #include <math.h>
  73
  74 #include "access/heapam.h"
  75 #include "access/tuptoaster.h"
  76 #include "catalog/catname.h"
  77 #include "catalog/pg_namespace.h"
  78 #include "catalog/pg_operator.h"
  79 #include "catalog/pg_proc.h"
  80 #include "catalog/pg_statistic.h"
  81 #include "catalog/pg_type.h"
  82 #include "mb/pg_wchar.h"
  83 #include "nodes/makefuncs.h"
  84 #include "optimizer/clauses.h"
  85 #include "optimizer/cost.h"
  86 #include "optimizer/pathnode.h"
  87 #include "optimizer/paths.h"
  88 #include "optimizer/plancat.h"
  89 #include "optimizer/prep.h"
  90 #include "optimizer/tlist.h"
  91 #include "optimizer/var.h"
  92 #include "parser/parse_func.h"
  93 #include "parser/parse_oper.h"
  94 #include "parser/parsetree.h"
  95 #include "utils/builtins.h"
  96 #include "utils/date.h"
  97 #include "utils/datum.h"
  98 #include "utils/int8.h"
  99 #include "utils/lsyscache.h"
 100 #include "utils/pg_locale.h"
 101 #include "utils/selfuncs.h"
 102 #include "utils/syscache.h"
 103
 104
 105 /*
 106  * Note: the default selectivity estimates are not chosen entirely at random.
 107  * We want them to be small enough to ensure that indexscans will be used if
 108  * available, for typical table densities of ~100 tuples/page.  Thus, for
 109  * example, 0.01 is not quite small enough, since that makes it appear that
 110  * nearly all pages will be hit anyway.  Also, since we sometimes estimate
 111  * eqsel as 1/num_distinct, we probably want DEFAULT_NUM_DISTINCT to equal
 112  * 1/DEFAULT_EQ_SEL.
 113  */
 114
 115 /* default selectivity estimate for equalities such as "A = b" */
 116 #define DEFAULT_EQ_SEL  0.005
 117
 118 /* default selectivity estimate for inequalities such as "A < b" */
 119 #define DEFAULT_INEQ_SEL  (1.0 / 3.0)
 120
 121 /* default selectivity estimate for pattern-match operators such as LIKE */
 122 #define DEFAULT_MATCH_SEL       0.005
 123
 124 /* default number of distinct values in a table */
 125 #define DEFAULT_NUM_DISTINCT  200
 126
 127 /* default selectivity estimate for boolean and null test nodes */
 128 #define DEFAULT_UNK_SEL                 0.005
 129 #define DEFAULT_NOT_UNK_SEL             (1.0 - DEFAULT_UNK_SEL)
 130 #define DEFAULT_BOOL_SEL                0.5
 131
 132 /*
 133  * Clamp a computed probability estimate (which may suffer from roundoff or
 134  * estimation errors) to valid range.  Argument must be a float variable.
 135  */
 136 #define CLAMP_PROBABILITY(p) \
 137         do { \
 138                 if (p < 0.0) \
 139                         p = 0.0; \
 140                 else if (p > 1.0) \
 141                         p = 1.0; \
 142         } while (0)
 143
 144
 145 static bool get_var_maximum(Query *root, Var *var, Oid sortop, Datum *max);
 146 static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
 147                                   Datum lobound, Datum hibound, Oid boundstypid,
 148                                   double *scaledlobound, double *scaledhibound);
 149 static double convert_numeric_to_scalar(Datum value, Oid typid);
 150 static void convert_string_to_scalar(unsigned char *value,
 151                                                  double *scaledvalue,
 152                                                  unsigned char *lobound,
 153                                                  double *scaledlobound,
 154                                                  unsigned char *hibound,
 155                                                  double *scaledhibound);
 156 static void convert_bytea_to_scalar(Datum value,
 157                                                 double *scaledvalue,
 158                                                 Datum lobound,
 159                                                 double *scaledlobound,
 160                                                 Datum hibound,
 161                                                 double *scaledhibound);
 162 static double convert_one_string_to_scalar(unsigned char *value,
 163                                                          int rangelo, int rangehi);
 164 static double convert_one_bytea_to_scalar(unsigned char *value, int valuelen,
 165                                                         int rangelo, int rangehi);
 166 static unsigned char *convert_string_datum(Datum value, Oid typid);
 167 static double convert_timevalue_to_scalar(Datum value, Oid typid);
 168 static double get_att_numdistinct(Query *root, Var *var,
 169                                         Form_pg_statistic stats);
 170 static bool get_restriction_var(List *args, int varRelid,
 171                                         Var **var, Node **other,
 172                                         bool *varonleft);
 173 static void get_join_vars(List *args, Var **var1, Var **var2);
 174 static Selectivity prefix_selectivity(Query *root, Var *var, Const *prefix);
 175 static Selectivity pattern_selectivity(Const *patt, Pattern_Type ptype);
 176 static bool string_lessthan(const char *str1, const char *str2,
 177                                 Oid datatype);
 178 static Oid      find_operator(const char *opname, Oid datatype);
 179 static Datum string_to_datum(const char *str, Oid datatype);
 180 static Const *string_to_const(const char *str, Oid datatype);
 181
 182
 183 /*
 184  *              eqsel                   - Selectivity of "=" for any data types.
 185  *
 186  * Note: this routine is also used to estimate selectivity for some
 187  * operators that are not "=" but have comparable selectivity behavior,
 188  * such as "~=" (geometric approximate-match).  Even for "=", we must
 189  * keep in mind that the left and right datatypes may differ.
 190  */
 191 Datum
 192 eqsel(PG_FUNCTION_ARGS)
 193 {
 194         Query      *root = (Query *) PG_GETARG_POINTER(0);
 195         Oid                     operator = PG_GETARG_OID(1);
 196         List       *args = (List *) PG_GETARG_POINTER(2);
 197         int                     varRelid = PG_GETARG_INT32(3);
 198         Var                *var;
 199         Node       *other;
 200         bool            varonleft;
 201         Oid                     relid;
 202         HeapTuple       statsTuple;
 203         Datum      *values;
 204         int                     nvalues;
 205         float4     *numbers;
 206         int                     nnumbers;
 207         double          selec;
 208
 209         /*
 210          * If expression is not var = something or something = var for a
 211          * simple var of a real relation (no subqueries, for now), then punt
 212          * and return a default estimate.
 213          */
 214         if (!get_restriction_var(args, varRelid,
 215                                                          &var, &other, &varonleft))
 216                 PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
 217         relid = getrelid(var->varno, root->rtable);
 218         if (relid == InvalidOid)
 219                 PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
 220
 221         /*
 222          * If the something is a NULL constant, assume operator is strict and
 223          * return zero, ie, operator will never return TRUE.
 224          */
 225         if (IsA(other, Const) &&((Const *) other)->constisnull)
 226                 PG_RETURN_FLOAT8(0.0);
 227
 228         /* get stats for the attribute, if available */
 229         statsTuple = SearchSysCache(STATRELATT,
 230                                                                 ObjectIdGetDatum(relid),
 231                                                                 Int16GetDatum(var->varattno),
 232                                                                 0, 0);
 233         if (HeapTupleIsValid(statsTuple))
 234         {
 235                 Form_pg_statistic stats;
 236
 237                 stats = (Form_pg_statistic) GETSTRUCT(statsTuple);
 238
 239                 if (IsA(other, Const))
 240                 {
 241                         /* Var is being compared to a known non-null constant */
 242                         Datum           constval = ((Const *) other)->constvalue;
 243                         bool            match = false;
 244                         int                     i;
 245
 246                         /*
 247                          * Is the constant "=" to any of the column's most common
 248                          * values?      (Although the given operator may not really be
 249                          * "=", we will assume that seeing whether it returns TRUE is
 250                          * an appropriate test.  If you don't like this, maybe you
 251                          * shouldn't be using eqsel for your operator...)
 252                          */
 253                         if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
 254                                                                  STATISTIC_KIND_MCV, InvalidOid,
 255                                                                  &values, &nvalues,
 256                                                                  &numbers, &nnumbers))
 257                         {
 258                                 FmgrInfo        eqproc;
 259
 260                                 fmgr_info(get_opcode(operator), &eqproc);
 261
 262                                 for (i = 0; i < nvalues; i++)
 263                                 {
 264                                         /* be careful to apply operator right way 'round */
 265                                         if (varonleft)
 266                                                 match = DatumGetBool(FunctionCall2(&eqproc,
 267                                                                                                                    values[i],
 268                                                                                                                    constval));
 269                                         else
 270                                                 match = DatumGetBool(FunctionCall2(&eqproc,
 271                                                                                                                    constval,
 272                                                                                                                    values[i]));
 273                                         if (match)
 274                                                 break;
 275                                 }
 276                         }
 277                         else
 278                         {
 279                                 /* no most-common-value info available */
 280                                 values = NULL;
 281                                 numbers = NULL;
 282                                 i = nvalues = nnumbers = 0;
 283                         }
 284
 285                         if (match)
 286                         {
 287                                 /*
 288                                  * Constant is "=" to this common value.  We know
 289                                  * selectivity exactly (or as exactly as VACUUM could
 290                                  * calculate it, anyway).
 291                                  */
 292                                 selec = numbers[i];
 293                         }
 294                         else
 295                         {
 296                                 /*
 297                                  * Comparison is against a constant that is neither NULL
 298                                  * nor any of the common values.  Its selectivity cannot
 299                                  * be more than this:
 300                                  */
 301                                 double          sumcommon = 0.0;
 302                                 double          otherdistinct;
 303
 304                                 for (i = 0; i < nnumbers; i++)
 305                                         sumcommon += numbers[i];
 306                                 selec = 1.0 - sumcommon - stats->stanullfrac;
 307                                 CLAMP_PROBABILITY(selec);
 308
 309                                 /*
 310                                  * and in fact it's probably a good deal less. We
 311                                  * approximate that all the not-common values share this
 312                                  * remaining fraction equally, so we divide by the number
 313                                  * of other distinct values.
 314                                  */
 315                                 otherdistinct = get_att_numdistinct(root, var, stats)
 316                                         - nnumbers;
 317                                 if (otherdistinct > 1)
 318                                         selec /= otherdistinct;
 319
 320                                 /*
 321                                  * Another cross-check: selectivity shouldn't be estimated
 322                                  * as more than the least common "most common value".
 323                                  */
 324                                 if (nnumbers > 0 && selec > numbers[nnumbers - 1])
 325                                         selec = numbers[nnumbers - 1];
 326                         }
 327
 328                         free_attstatsslot(var->vartype, values, nvalues,
 329                                                           numbers, nnumbers);
 330                 }
 331                 else
 332                 {
 333                         double          ndistinct;
 334
 335                         /*
 336                          * Search is for a value that we do not know a priori, but we
 337                          * will assume it is not NULL.  Estimate the selectivity as
 338                          * non-null fraction divided by number of distinct values, so
 339                          * that we get a result averaged over all possible values
 340                          * whether common or uncommon.  (Essentially, we are assuming
 341                          * that the not-yet-known comparison value is equally likely
 342                          * to be any of the possible values, regardless of their
 343                          * frequency in the table.      Is that a good idea?)
 344                          */
 345                         selec = 1.0 - stats->stanullfrac;
 346                         ndistinct = get_att_numdistinct(root, var, stats);
 347                         if (ndistinct > 1)
 348                                 selec /= ndistinct;
 349
 350                         /*
 351                          * Cross-check: selectivity should never be estimated as more
 352                          * than the most common value's.
 353                          */
 354                         if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
 355                                                                  STATISTIC_KIND_MCV, InvalidOid,
 356                                                                  NULL, NULL,
 357                                                                  &numbers, &nnumbers))
 358                         {
 359                                 if (nnumbers > 0 && selec > numbers[0])
 360                                         selec = numbers[0];
 361                                 free_attstatsslot(var->vartype, NULL, 0, numbers, nnumbers);
 362                         }
 363                 }
 364
 365                 ReleaseSysCache(statsTuple);
 366         }
 367         else
 368         {
 369                 /*
 370                  * No VACUUM ANALYZE stats available, so make a guess using
 371                  * estimated number of distinct values and assuming they are
 372                  * equally common.      (The guess is unlikely to be very good, but we
 373                  * do know a few special cases.)
 374                  */
 375                 selec = 1.0 / get_att_numdistinct(root, var, NULL);
 376         }
 377
 378         /* result should be in range, but make sure... */
 379         CLAMP_PROBABILITY(selec);
 380
 381         PG_RETURN_FLOAT8((float8) selec);
 382 }
 383
 384 /*
 385  *              neqsel                  - Selectivity of "!=" for any data types.
 386  *
 387  * This routine is also used for some operators that are not "!="
 388  * but have comparable selectivity behavior.  See above comments
 389  * for eqsel().
 390  */
 391 Datum
 392 neqsel(PG_FUNCTION_ARGS)
 393 {
 394         Query      *root = (Query *) PG_GETARG_POINTER(0);
 395         Oid                     operator = PG_GETARG_OID(1);
 396         List       *args = (List *) PG_GETARG_POINTER(2);
 397         int                     varRelid = PG_GETARG_INT32(3);
 398         Oid                     eqop;
 399         float8          result;
 400
 401         /*
 402          * We want 1 - eqsel() where the equality operator is the one
 403          * associated with this != operator, that is, its negator.
 404          */
 405         eqop = get_negator(operator);
 406         if (eqop)
 407         {
 408                 result = DatumGetFloat8(DirectFunctionCall4(eqsel,
 409                                                                                                         PointerGetDatum(root),
 410                                                                                                   ObjectIdGetDatum(eqop),
 411                                                                                                         PointerGetDatum(args),
 412                                                                                            Int32GetDatum(varRelid)));
 413         }
 414         else
 415         {
 416                 /* Use default selectivity (should we raise an error instead?) */
 417                 result = DEFAULT_EQ_SEL;
 418         }
 419         result = 1.0 - result;
 420         PG_RETURN_FLOAT8(result);
 421 }
 422
 423 /*
 424  *      scalarineqsel           - Selectivity of "<", "<=", ">", ">=" for scalars.
 425  *
 426  * This is the guts of both scalarltsel and scalargtsel.  The caller has
 427  * commuted the clause, if necessary, so that we can treat the Var as
 428  * being on the left.  The caller must also make sure that the other side
 429  * of the clause is a non-null Const, and dissect same into a value and
 430  * datatype.
 431  *
 432  * This routine works for any datatype (or pair of datatypes) known to
 433  * convert_to_scalar().  If it is applied to some other datatype,
 434  * it will return a default estimate.
 435  */
 436 static double
 437 scalarineqsel(Query *root, Oid operator, bool isgt,
 438                           Var *var, Datum constval, Oid consttype)
 439 {
 440         Oid                     relid;
 441         HeapTuple       statsTuple;
 442         Form_pg_statistic stats;
 443         FmgrInfo        opproc;
 444         Datum      *values;
 445         int                     nvalues;
 446         float4     *numbers;
 447         int                     nnumbers;
 448         double          mcv_selec,
 449                                 hist_selec,
 450                                 sumcommon;
 451         double          selec;
 452         int                     i;
 453
 454         /*
 455          * If expression is not var op something or something op var for a
 456          * simple var of a real relation (no subqueries, for now), then punt
 457          * and return a default estimate.
 458          */
 459         relid = getrelid(var->varno, root->rtable);
 460         if (relid == InvalidOid)
 461                 return DEFAULT_INEQ_SEL;
 462
 463         /* get stats for the attribute */
 464         statsTuple = SearchSysCache(STATRELATT,
 465                                                                 ObjectIdGetDatum(relid),
 466                                                                 Int16GetDatum(var->varattno),
 467                                                                 0, 0);
 468         if (!HeapTupleIsValid(statsTuple))
 469         {
 470                 /* no stats available, so default result */
 471                 return DEFAULT_INEQ_SEL;
 472         }
 473         stats = (Form_pg_statistic) GETSTRUCT(statsTuple);
 474
 475         fmgr_info(get_opcode(operator), &opproc);
 476
 477         /*
 478          * If we have most-common-values info, add up the fractions of the MCV
 479          * entries that satisfy MCV OP CONST.  These fractions contribute
 480          * directly to the result selectivity.  Also add up the total fraction
 481          * represented by MCV entries.
 482          */
 483         mcv_selec = 0.0;
 484         sumcommon = 0.0;
 485
 486         if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
 487                                                  STATISTIC_KIND_MCV, InvalidOid,
 488                                                  &values, &nvalues,
 489                                                  &numbers, &nnumbers))
 490         {
 491                 for (i = 0; i < nvalues; i++)
 492                 {
 493                         if (DatumGetBool(FunctionCall2(&opproc,
 494                                                                                    values[i],
 495                                                                                    constval)))
 496                                 mcv_selec += numbers[i];
 497                         sumcommon += numbers[i];
 498                 }
 499                 free_attstatsslot(var->vartype, values, nvalues, numbers, nnumbers);
 500         }
 501
 502         /*
 503          * If there is a histogram, determine which bin the constant falls in,
 504          * and compute the resulting contribution to selectivity.
 505          *
 506          * Someday, VACUUM might store more than one histogram per rel/att,
 507          * corresponding to more than one possible sort ordering defined for
 508          * the column type.  However, to make that work we will need to figure
 509          * out which staop to search for --- it's not necessarily the one we
 510          * have at hand!  (For example, we might have a '<=' operator rather
 511          * than the '<' operator that will appear in staop.)  For now, assume
 512          * that whatever appears in pg_statistic is sorted the same way our
 513          * operator sorts, or the reverse way if isgt is TRUE.
 514          */
 515         hist_selec = 0.0;
 516
 517         if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
 518                                                  STATISTIC_KIND_HISTOGRAM, InvalidOid,
 519                                                  &values, &nvalues,
 520                                                  NULL, NULL))
 521         {
 522                 if (nvalues > 1)
 523                 {
 524                         double          histfrac;
 525                         bool            ltcmp;
 526
 527                         ltcmp = DatumGetBool(FunctionCall2(&opproc,
 528                                                                                            values[0],
 529                                                                                            constval));
 530                         if (isgt)
 531                                 ltcmp = !ltcmp;
 532                         if (!ltcmp)
 533                         {
 534                                 /* Constant is below lower histogram boundary. */
 535                                 histfrac = 0.0;
 536                         }
 537                         else
 538                         {
 539                                 /*
 540                                  * Scan to find proper location.  This could be made
 541                                  * faster by using a binary-search method, but it's
 542                                  * probably not worth the trouble for typical histogram
 543                                  * sizes.
 544                                  */
 545                                 for (i = 1; i < nvalues; i++)
 546                                 {
 547                                         ltcmp = DatumGetBool(FunctionCall2(&opproc,
 548                                                                                                            values[i],
 549                                                                                                            constval));
 550                                         if (isgt)
 551                                                 ltcmp = !ltcmp;
 552                                         if (!ltcmp)
 553                                                 break;
 554                                 }
 555                                 if (i >= nvalues)
 556                                 {
 557                                         /* Constant is above upper histogram boundary. */
 558                                         histfrac = 1.0;
 559                                 }
 560                                 else
 561                                 {
 562                                         double          val,
 563                                                                 high,
 564                                                                 low;
 565                                         double          binfrac;
 566
 567                                         /*
 568                                          * We have values[i-1] < constant < values[i].
 569                                          *
 570                                          * Convert the constant and the two nearest bin boundary
 571                                          * values to a uniform comparison scale, and do a
 572                                          * linear interpolation within this bin.
 573                                          */
 574                                         if (convert_to_scalar(constval, consttype, &val,
 575                                                                                   values[i - 1], values[i],
 576                                                                                   var->vartype,
 577                                                                                   &low, &high))
 578                                         {
 579                                                 if (high <= low)
 580                                                 {
 581                                                         /* cope if bin boundaries appear identical */
 582                                                         binfrac = 0.5;
 583                                                 }
 584                                                 else if (val <= low)
 585                                                         binfrac = 0.0;
 586                                                 else if (val >= high)
 587                                                         binfrac = 1.0;
 588                                                 else
 589                                                 {
 590                                                         binfrac = (val - low) / (high - low);
 591
 592                                                         /*
 593                                                          * Watch out for the possibility that we got a
 594                                                          * NaN or Infinity from the division.  This
 595                                                          * can happen despite the previous checks, if
 596                                                          * for example "low" is -Infinity.
 597                                                          */
 598                                                         if (isnan(binfrac) ||
 599                                                                 binfrac < 0.0 || binfrac > 1.0)
 600                                                                 binfrac = 0.5;
 601                                                 }
 602                                         }
 603                                         else
 604                                         {
 605                                                 /*
 606                                                  * Ideally we'd produce an error here, on the
 607                                                  * grounds that the given operator shouldn't have
 608                                                  * scalarXXsel registered as its selectivity func
 609                                                  * unless we can deal with its operand types.  But
 610                                                  * currently, all manner of stuff is invoking
 611                                                  * scalarXXsel, so give a default estimate until
 612                                                  * that can be fixed.
 613                                                  */
 614                                                 binfrac = 0.5;
 615                                         }
 616
 617                                         /*
 618                                          * Now, compute the overall selectivity across the
 619                                          * values represented by the histogram.  We have i-1
 620                                          * full bins and binfrac partial bin below the
 621                                          * constant.
 622                                          */
 623                                         histfrac = (double) (i - 1) + binfrac;
 624                                         histfrac /= (double) (nvalues - 1);
 625                                 }
 626                         }
 627
 628                         /*
 629                          * Now histfrac = fraction of histogram entries below the
 630                          * constant.
 631                          *
 632                          * Account for "<" vs ">"
 633                          */
 634                         hist_selec = isgt ? (1.0 - histfrac) : histfrac;
 635
 636                         /*
 637                          * The histogram boundaries are only approximate to begin
 638                          * with, and may well be out of date anyway.  Therefore, don't
 639                          * believe extremely small or large selectivity estimates.
 640                          */
 641                         if (hist_selec < 0.0001)
 642                                 hist_selec = 0.0001;
 643                         else if (hist_selec > 0.9999)
 644                                 hist_selec = 0.9999;
 645                 }
 646
 647                 free_attstatsslot(var->vartype, values, nvalues, NULL, 0);
 648         }
 649
 650         /*
 651          * Now merge the results from the MCV and histogram calculations,
 652          * realizing that the histogram covers only the non-null values that
 653          * are not listed in MCV.
 654          */
 655         selec = 1.0 - stats->stanullfrac - sumcommon;
 656
 657         if (hist_selec > 0.0)
 658                 selec *= hist_selec;
 659         else
 660         {
 661                 /*
 662                  * If no histogram but there are values not accounted for by MCV,
 663                  * arbitrarily assume half of them will match.
 664                  */
 665                 selec *= 0.5;
 666         }
 667
 668         selec += mcv_selec;
 669
 670         ReleaseSysCache(statsTuple);
 671
 672         /* result should be in range, but make sure... */
 673         CLAMP_PROBABILITY(selec);
 674
 675         return selec;
 676 }
 677
 678 /*
 679  *              scalarltsel             - Selectivity of "<" (also "<=") for scalars.
 680  */
 681 Datum
 682 scalarltsel(PG_FUNCTION_ARGS)
 683 {
 684         Query      *root = (Query *) PG_GETARG_POINTER(0);
 685         Oid                     operator = PG_GETARG_OID(1);
 686         List       *args = (List *) PG_GETARG_POINTER(2);
 687         int                     varRelid = PG_GETARG_INT32(3);
 688         Var                *var;
 689         Node       *other;
 690         Datum           constval;
 691         Oid                     consttype;
 692         bool            varonleft;
 693         bool            isgt;
 694         double          selec;
 695
 696         /*
 697          * If expression is not var op something or something op var for a
 698          * simple var of a real relation (no subqueries, for now), then punt
 699          * and return a default estimate.
 700          */
 701         if (!get_restriction_var(args, varRelid,
 702                                                          &var, &other, &varonleft))
 703                 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 704
 705         /*
 706          * Can't do anything useful if the something is not a constant,
 707          * either.
 708          */
 709         if (!IsA(other, Const))
 710                 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 711
 712         /*
 713          * If the constant is NULL, assume operator is strict and return zero,
 714          * ie, operator will never return TRUE.
 715          */
 716         if (((Const *) other)->constisnull)
 717                 PG_RETURN_FLOAT8(0.0);
 718         constval = ((Const *) other)->constvalue;
 719         consttype = ((Const *) other)->consttype;
 720
 721         /*
 722          * Force the var to be on the left to simplify logic in scalarineqsel.
 723          */
 724         if (varonleft)
 725         {
 726                 /* we have var < other */
 727                 isgt = false;
 728         }
 729         else
 730         {
 731                 /* we have other < var, commute to make var > other */
 732                 operator = get_commutator(operator);
 733                 if (!operator)
 734                 {
 735                         /* Use default selectivity (should we raise an error instead?) */
 736                         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 737                 }
 738                 isgt = true;
 739         }
 740
 741         selec = scalarineqsel(root, operator, isgt, var, constval, consttype);
 742
 743         PG_RETURN_FLOAT8((float8) selec);
 744 }
 745
 746 /*
 747  *              scalargtsel             - Selectivity of ">" (also ">=") for integers.
 748  */
 749 Datum
 750 scalargtsel(PG_FUNCTION_ARGS)
 751 {
 752         Query      *root = (Query *) PG_GETARG_POINTER(0);
 753         Oid                     operator = PG_GETARG_OID(1);
 754         List       *args = (List *) PG_GETARG_POINTER(2);
 755         int                     varRelid = PG_GETARG_INT32(3);
 756         Var                *var;
 757         Node       *other;
 758         Datum           constval;
 759         Oid                     consttype;
 760         bool            varonleft;
 761         bool            isgt;
 762         double          selec;
 763
 764         /*
 765          * If expression is not var op something or something op var for a
 766          * simple var of a real relation (no subqueries, for now), then punt
 767          * and return a default estimate.
 768          */
 769         if (!get_restriction_var(args, varRelid,
 770                                                          &var, &other, &varonleft))
 771                 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 772
 773         /*
 774          * Can't do anything useful if the something is not a constant,
 775          * either.
 776          */
 777         if (!IsA(other, Const))
 778                 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 779
 780         /*
 781          * If the constant is NULL, assume operator is strict and return zero,
 782          * ie, operator will never return TRUE.
 783          */
 784         if (((Const *) other)->constisnull)
 785                 PG_RETURN_FLOAT8(0.0);
 786         constval = ((Const *) other)->constvalue;
 787         consttype = ((Const *) other)->consttype;
 788
 789         /*
 790          * Force the var to be on the left to simplify logic in scalarineqsel.
 791          */
 792         if (varonleft)
 793         {
 794                 /* we have var > other */
 795                 isgt = true;
 796         }
 797         else
 798         {
 799                 /* we have other > var, commute to make var < other */
 800                 operator = get_commutator(operator);
 801                 if (!operator)
 802                 {
 803                         /* Use default selectivity (should we raise an error instead?) */
 804                         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 805                 }
 806                 isgt = false;
 807         }
 808
 809         selec = scalarineqsel(root, operator, isgt, var, constval, consttype);
 810
 811         PG_RETURN_FLOAT8((float8) selec);
 812 }
 813
 814 /*
 815  * patternsel                   - Generic code for pattern-match selectivity.
 816  */
 817 static double
 818 patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
 819 {
 820         Query      *root = (Query *) PG_GETARG_POINTER(0);
 821
 822 #ifdef NOT_USED
 823         Oid                     operator = PG_GETARG_OID(1);
 824 #endif
 825         List       *args = (List *) PG_GETARG_POINTER(2);
 826         int                     varRelid = PG_GETARG_INT32(3);
 827         Var                *var;
 828         Node       *other;
 829         bool            varonleft;
 830         Oid                     relid;
 831         Datum           constval;
 832         Pattern_Prefix_Status pstatus;
 833         Const      *patt = NULL;
 834         Const      *prefix = NULL;
 835         Const      *rest = NULL;
 836         double          result;
 837
 838         /*
 839          * If expression is not var op constant for a simple var of a real
 840          * relation (no subqueries, for now), then punt and return a default
 841          * estimate.
 842          */
 843         if (!get_restriction_var(args, varRelid,
 844                                                          &var, &other, &varonleft))
 845                 return DEFAULT_MATCH_SEL;
 846         if (!varonleft || !IsA(other, Const))
 847                 return DEFAULT_MATCH_SEL;
 848         relid = getrelid(var->varno, root->rtable);
 849         if (relid == InvalidOid)
 850                 return DEFAULT_MATCH_SEL;
 851
 852         /*
 853          * If the constant is NULL, assume operator is strict and return zero,
 854          * ie, operator will never return TRUE.
 855          */
 856         if (((Const *) other)->constisnull)
 857                 return 0.0;
 858         constval = ((Const *) other)->constvalue;
 859
 860         /*
 861          * the right-hand const is type text or bytea for all supported
 862          * operators
 863          */
 864         Assert(((Const *) other)->consttype == TEXTOID ||
 865                    ((Const *) other)->consttype == BYTEAOID);
 866
 867         /* divide pattern into fixed prefix and remainder */
 868         patt = (Const *) other;
 869         pstatus = pattern_fixed_prefix(patt, ptype, &prefix, &rest);
 870
 871         if (pstatus == Pattern_Prefix_Exact)
 872         {
 873                 /*
 874                  * Pattern specifies an exact match, so pretend operator is '='
 875                  */
 876                 Oid                     eqopr = find_operator("=", var->vartype);
 877                 List       *eqargs;
 878
 879                 if (eqopr == InvalidOid)
 880                         elog(ERROR, "patternsel: no = operator for type %u",
 881                                  var->vartype);
 882                 eqargs = makeList2(var, prefix);
 883                 result = DatumGetFloat8(DirectFunctionCall4(eqsel,
 884                                                                                                         PointerGetDatum(root),
 885                                                                                                  ObjectIdGetDatum(eqopr),
 886                                                                                                  PointerGetDatum(eqargs),
 887                                                                                            Int32GetDatum(varRelid)));
 888         }
 889         else
 890         {
 891                 /*
 892                  * Not exact-match pattern.  We estimate selectivity of the fixed
 893                  * prefix and remainder of pattern separately, then combine the
 894                  * two.
 895                  */
 896                 Selectivity prefixsel;
 897                 Selectivity restsel;
 898                 Selectivity selec;
 899
 900                 if (pstatus == Pattern_Prefix_Partial)
 901                         prefixsel = prefix_selectivity(root, var, prefix);
 902                 else
 903                         prefixsel = 1.0;
 904                 restsel = pattern_selectivity(rest, ptype);
 905                 selec = prefixsel * restsel;
 906                 /* result should be in range, but make sure... */
 907                 CLAMP_PROBABILITY(selec);
 908                 result = selec;
 909         }
 910
 911         if (prefix)
 912         {
 913                 pfree(DatumGetPointer(prefix->constvalue));
 914                 pfree(prefix);
 915         }
 916
 917         return result;
 918 }
 919
 920 /*
 921  *              regexeqsel              - Selectivity of regular-expression pattern match.
 922  */
 923 Datum
 924 regexeqsel(PG_FUNCTION_ARGS)
 925 {
 926         PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex));
 927 }
 928
 929 /*
 930  *              icregexeqsel    - Selectivity of case-insensitive regex match.
 931  */
 932 Datum
 933 icregexeqsel(PG_FUNCTION_ARGS)
 934 {
 935         PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC));
 936 }
 937
 938 /*
 939  *              likesel                 - Selectivity of LIKE pattern match.
 940  */
 941 Datum
 942 likesel(PG_FUNCTION_ARGS)
 943 {
 944         PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like));
 945 }
 946
 947 /*
 948  *              iclikesel                       - Selectivity of ILIKE pattern match.
 949  */
 950 Datum
 951 iclikesel(PG_FUNCTION_ARGS)
 952 {
 953         PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC));
 954 }
 955
 956 /*
 957  *              regexnesel              - Selectivity of regular-expression pattern non-match.
 958  */
 959 Datum
 960 regexnesel(PG_FUNCTION_ARGS)
 961 {
 962         double          result;
 963
 964         result = patternsel(fcinfo, Pattern_Type_Regex);
 965         result = 1.0 - result;
 966         PG_RETURN_FLOAT8(result);
 967 }
 968
 969 /*
 970  *              icregexnesel    - Selectivity of case-insensitive regex non-match.
 971  */
 972 Datum
 973 icregexnesel(PG_FUNCTION_ARGS)
 974 {
 975         double          result;
 976
 977         result = patternsel(fcinfo, Pattern_Type_Regex_IC);
 978         result = 1.0 - result;
 979         PG_RETURN_FLOAT8(result);
 980 }
 981
 982 /*
 983  *              nlikesel                - Selectivity of LIKE pattern non-match.
 984  */
 985 Datum
 986 nlikesel(PG_FUNCTION_ARGS)
 987 {
 988         double          result;
 989
 990         result = patternsel(fcinfo, Pattern_Type_Like);
 991         result = 1.0 - result;
 992         PG_RETURN_FLOAT8(result);
 993 }
 994
 995 /*
 996  *              icnlikesel              - Selectivity of ILIKE pattern non-match.
 997  */
 998 Datum
 999 icnlikesel(PG_FUNCTION_ARGS)
1000 {
1001         double          result;
1002
1003         result = patternsel(fcinfo, Pattern_Type_Like_IC);
1004         result = 1.0 - result;
1005         PG_RETURN_FLOAT8(result);
1006 }
1007
1008 /*
1009  *              booltestsel             - Selectivity of BooleanTest Node.
1010  */
1011 Selectivity
1012 booltestsel(Query *root, BoolTestType booltesttype, Node *arg, int varRelid)
1013 {
1014         Var                *var;
1015         Oid                     relid;
1016         HeapTuple       statsTuple;
1017         Datum      *values;
1018         int                     nvalues;
1019         float4     *numbers;
1020         int                     nnumbers;
1021         double          selec;
1022
1023         /*
1024          * Ignore any binary-compatible relabeling (probably unnecessary, but
1025          * can't hurt)
1026          */
1027         if (IsA(arg, RelabelType))
1028                 arg = (Node *) ((RelabelType *) arg)->arg;
1029
1030         if (IsA(arg, Var) &&(varRelid == 0 || varRelid == ((Var *) arg)->varno))
1031                 var = (Var *) arg;
1032         else
1033         {
1034                 /*
1035                  * If argument is not a Var, we can't get statistics for it, but
1036                  * perhaps clause_selectivity can do something with it.  We ignore
1037                  * the possibility of a NULL value when using clause_selectivity,
1038                  * and just assume the value is either TRUE or FALSE.
1039                  */
1040                 switch (booltesttype)
1041                 {
1042                         case IS_UNKNOWN:
1043                                 selec = DEFAULT_UNK_SEL;
1044                                 break;
1045                         case IS_NOT_UNKNOWN:
1046                                 selec = DEFAULT_NOT_UNK_SEL;
1047                                 break;
1048                         case IS_TRUE:
1049                         case IS_NOT_FALSE:
1050                                 selec = (double) clause_selectivity(root, arg, varRelid);
1051                                 break;
1052                         case IS_FALSE:
1053                         case IS_NOT_TRUE:
1054                                 selec = 1.0 - (double) clause_selectivity(root, arg, varRelid);
1055                                 break;
1056                         default:
1057                                 elog(ERROR, "booltestsel: unexpected booltesttype %d",
1058                                          (int) booltesttype);
1059                                 selec = 0.0;    /* Keep compiler quiet */
1060                                 break;
1061                 }
1062                 return (Selectivity) selec;
1063         }
1064
1065         /* get stats for the attribute, if available */
1066         relid = getrelid(var->varno, root->rtable);
1067         if (relid == InvalidOid)
1068                 statsTuple = NULL;
1069         else
1070                 statsTuple = SearchSysCache(STATRELATT,
1071                                                                         ObjectIdGetDatum(relid),
1072                                                                         Int16GetDatum(var->varattno),
1073                                                                         0, 0);
1074
1075         if (HeapTupleIsValid(statsTuple))
1076         {
1077                 Form_pg_statistic stats;
1078                 double          freq_null;
1079
1080                 stats = (Form_pg_statistic) GETSTRUCT(statsTuple);
1081
1082                 freq_null = stats->stanullfrac;
1083
1084                 if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
1085                                                          STATISTIC_KIND_MCV, InvalidOid,
1086                                                          &values, &nvalues,
1087                                                          &numbers, &nnumbers)
1088                         && nnumbers > 0)
1089                 {
1090                         double          freq_true;
1091                         double          freq_false;
1092
1093                         /*
1094                          * Get first MCV frequency and derive frequency for true.
1095                          */
1096                         if (DatumGetBool(values[0]))
1097                                 freq_true = numbers[0];
1098                         else
1099                                 freq_true = 1.0 - numbers[0] - freq_null;
1100
1101                         /*
1102                          * Next derive freqency for false. Then use these as
1103                          * appropriate to derive frequency for each case.
1104                          */
1105                         freq_false = 1.0 - freq_true - freq_null;
1106
1107                         switch (booltesttype)
1108                         {
1109                                 case IS_UNKNOWN:
1110                                         /* select only NULL values */
1111                                         selec = freq_null;
1112                                         break;
1113                                 case IS_NOT_UNKNOWN:
1114                                         /* select non-NULL values */
1115                                         selec = 1.0 - freq_null;
1116                                         break;
1117                                 case IS_TRUE:
1118                                         /* select only TRUE values */
1119                                         selec = freq_true;
1120                                         break;
1121                                 case IS_NOT_TRUE:
1122                                         /* select non-TRUE values */
1123                                         selec = 1.0 - freq_true;
1124                                         break;
1125                                 case IS_FALSE:
1126                                         /* select only FALSE values */
1127                                         selec = freq_false;
1128                                         break;
1129                                 case IS_NOT_FALSE:
1130                                         /* select non-FALSE values */
1131                                         selec = 1.0 - freq_false;
1132                                         break;
1133                                 default:
1134                                         elog(ERROR, "booltestsel: unexpected booltesttype %d",
1135                                                  (int) booltesttype);
1136                                         selec = 0.0;    /* Keep compiler quiet */
1137                                         break;
1138                         }
1139
1140                         free_attstatsslot(var->vartype, values, nvalues,
1141                                                           numbers, nnumbers);
1142                 }
1143                 else
1144                 {
1145                         /*
1146                          * No most-common-value info available. Still have null
1147                          * fraction information, so use it for IS [NOT] UNKNOWN.
1148                          * Otherwise adjust for null fraction and assume an even split
1149                          * for boolean tests.
1150                          */
1151                         switch (booltesttype)
1152                         {
1153                                 case IS_UNKNOWN:
1154
1155                                         /*
1156                                          * Use freq_null directly.
1157                                          */
1158                                         selec = freq_null;
1159                                         break;
1160                                 case IS_NOT_UNKNOWN:
1161
1162                                         /*
1163                                          * Select not unknown (not null) values. Calculate
1164                                          * from freq_null.
1165                                          */
1166                                         selec = 1.0 - freq_null;
1167                                         break;
1168                                 case IS_TRUE:
1169                                 case IS_NOT_TRUE:
1170                                 case IS_FALSE:
1171                                 case IS_NOT_FALSE:
1172                                         selec = (1.0 - freq_null) / 2.0;
1173                                         break;
1174                                 default:
1175                                         elog(ERROR, "booltestsel: unexpected booltesttype %d",
1176                                                  (int) booltesttype);
1177                                         selec = 0.0;    /* Keep compiler quiet */
1178                                         break;
1179                         }
1180                 }
1181
1182                 ReleaseSysCache(statsTuple);
1183         }
1184         else
1185         {
1186                 /*
1187                  * No VACUUM ANALYZE stats available, so use a default value.
1188                  * (Note: not much point in recursing to clause_selectivity here.)
1189                  */
1190                 switch (booltesttype)
1191                 {
1192                         case IS_UNKNOWN:
1193                                 selec = DEFAULT_UNK_SEL;
1194                                 break;
1195                         case IS_NOT_UNKNOWN:
1196                                 selec = DEFAULT_NOT_UNK_SEL;
1197                                 break;
1198                         case IS_TRUE:
1199                         case IS_NOT_TRUE:
1200                         case IS_FALSE:
1201                         case IS_NOT_FALSE:
1202                                 selec = DEFAULT_BOOL_SEL;
1203                                 break;
1204                         default:
1205                                 elog(ERROR, "booltestsel: unexpected booltesttype %d",
1206                                          (int) booltesttype);
1207                                 selec = 0.0;    /* Keep compiler quiet */
1208                                 break;
1209                 }
1210         }
1211
1212         /* result should be in range, but make sure... */
1213         CLAMP_PROBABILITY(selec);
1214
1215         return (Selectivity) selec;
1216 }
1217
1218 /*
1219  *              nulltestsel             - Selectivity of NullTest Node.
1220  */
1221 Selectivity
1222 nulltestsel(Query *root, NullTestType nulltesttype, Node *arg, int varRelid)
1223 {
1224         Var                *var;
1225         Oid                     relid;
1226         HeapTuple       statsTuple;
1227         double          selec;
1228         double          defselec;
1229         double          freq_null;
1230
1231         switch (nulltesttype)
1232         {
1233                 case IS_NULL:
1234                         defselec = DEFAULT_UNK_SEL;
1235                         break;
1236                 case IS_NOT_NULL:
1237                         defselec = DEFAULT_NOT_UNK_SEL;
1238                         break;
1239                 default:
1240                         elog(ERROR, "nulltestsel: unexpected nulltesttype %d",
1241                                  (int) nulltesttype);
1242                         return (Selectivity) 0;         /* keep compiler quiet */
1243         }
1244
1245         /*
1246          * Ignore any binary-compatible relabeling
1247          */
1248         if (IsA(arg, RelabelType))
1249                 arg = (Node *) ((RelabelType *) arg)->arg;
1250
1251         if (IsA(arg, Var) &&
1252                 (varRelid == 0 || varRelid == ((Var *) arg)->varno))
1253                 var = (Var *) arg;
1254         else
1255         {
1256                 /* punt if non-Var argument */
1257                 return (Selectivity) defselec;
1258         }
1259
1260         relid = getrelid(var->varno, root->rtable);
1261         if (relid == InvalidOid)
1262                 return (Selectivity) defselec;
1263
1264         /* get stats for the attribute, if available */
1265         statsTuple = SearchSysCache(STATRELATT,
1266                                                                 ObjectIdGetDatum(relid),
1267                                                                 Int16GetDatum(var->varattno),
1268                                                                 0, 0);
1269         if (HeapTupleIsValid(statsTuple))
1270         {
1271                 Form_pg_statistic stats;
1272
1273                 stats = (Form_pg_statistic) GETSTRUCT(statsTuple);
1274                 freq_null = stats->stanullfrac;
1275
1276                 switch (nulltesttype)
1277                 {
1278                         case IS_NULL:
1279
1280                                 /*
1281                                  * Use freq_null directly.
1282                                  */
1283                                 selec = freq_null;
1284                                 break;
1285                         case IS_NOT_NULL:
1286
1287                                 /*
1288                                  * Select not unknown (not null) values. Calculate from
1289                                  * freq_null.
1290                                  */
1291                                 selec = 1.0 - freq_null;
1292                                 break;
1293                         default:
1294                                 elog(ERROR, "nulltestsel: unexpected nulltesttype %d",
1295                                          (int) nulltesttype);
1296                                 return (Selectivity) 0; /* keep compiler quiet */
1297                 }
1298
1299                 ReleaseSysCache(statsTuple);
1300         }
1301         else
1302         {
1303                 /*
1304                  * No VACUUM ANALYZE stats available, so make a guess
1305                  */
1306                 selec = defselec;
1307         }
1308
1309         /* result should be in range, but make sure... */
1310         CLAMP_PROBABILITY(selec);
1311
1312         return (Selectivity) selec;
1313 }
1314
1315 /*
1316  *              eqjoinsel               - Join selectivity of "="
1317  */
1318 Datum
1319 eqjoinsel(PG_FUNCTION_ARGS)
1320 {
1321         Query      *root = (Query *) PG_GETARG_POINTER(0);
1322         Oid                     operator = PG_GETARG_OID(1);
1323         List       *args = (List *) PG_GETARG_POINTER(2);
1324         Var                *var1;
1325         Var                *var2;
1326         double          selec;
1327
1328         get_join_vars(args, &var1, &var2);
1329
1330         if (var1 == NULL && var2 == NULL)
1331                 selec = DEFAULT_EQ_SEL;
1332         else
1333         {
1334                 HeapTuple       statsTuple1 = NULL;
1335                 HeapTuple       statsTuple2 = NULL;
1336                 Form_pg_statistic stats1 = NULL;
1337                 Form_pg_statistic stats2 = NULL;
1338                 double          nd1 = DEFAULT_NUM_DISTINCT;
1339                 double          nd2 = DEFAULT_NUM_DISTINCT;
1340                 bool            have_mcvs1 = false;
1341                 Datum      *values1 = NULL;
1342                 int                     nvalues1 = 0;
1343                 float4     *numbers1 = NULL;
1344                 int                     nnumbers1 = 0;
1345                 bool            have_mcvs2 = false;
1346                 Datum      *values2 = NULL;
1347                 int                     nvalues2 = 0;
1348                 float4     *numbers2 = NULL;
1349                 int                     nnumbers2 = 0;
1350
1351                 if (var1 != NULL)
1352                 {
1353                         /* get stats for the attribute, if available */
1354                         Oid                     relid1 = getrelid(var1->varno, root->rtable);
1355
1356                         if (relid1 != InvalidOid)
1357                         {
1358                                 statsTuple1 = SearchSysCache(STATRELATT,
1359                                                                                          ObjectIdGetDatum(relid1),
1360                                                                                    Int16GetDatum(var1->varattno),
1361                                                                                          0, 0);
1362                                 if (HeapTupleIsValid(statsTuple1))
1363                                 {
1364                                         stats1 = (Form_pg_statistic) GETSTRUCT(statsTuple1);
1365                                         have_mcvs1 = get_attstatsslot(statsTuple1,
1366                                                                                                   var1->vartype,
1367                                                                                                   var1->vartypmod,
1368                                                                                                   STATISTIC_KIND_MCV,
1369                                                                                                   InvalidOid,
1370                                                                                                   &values1, &nvalues1,
1371                                                                                                   &numbers1, &nnumbers1);
1372                                 }
1373
1374                                 nd1 = get_att_numdistinct(root, var1, stats1);
1375                         }
1376                 }
1377
1378                 if (var2 != NULL)
1379                 {
1380                         /* get stats for the attribute, if available */
1381                         Oid                     relid2 = getrelid(var2->varno, root->rtable);
1382
1383                         if (relid2 != InvalidOid)
1384                         {
1385                                 statsTuple2 = SearchSysCache(STATRELATT,
1386                                                                                          ObjectIdGetDatum(relid2),
1387                                                                                    Int16GetDatum(var2->varattno),
1388                                                                                          0, 0);
1389                                 if (HeapTupleIsValid(statsTuple2))
1390                                 {
1391                                         stats2 = (Form_pg_statistic) GETSTRUCT(statsTuple2);
1392                                         have_mcvs2 = get_attstatsslot(statsTuple2,
1393                                                                                                   var2->vartype,
1394                                                                                                   var2->vartypmod,
1395                                                                                                   STATISTIC_KIND_MCV,
1396                                                                                                   InvalidOid,
1397                                                                                                   &values2, &nvalues2,
1398                                                                                                   &numbers2, &nnumbers2);
1399                                 }
1400
1401                                 nd2 = get_att_numdistinct(root, var2, stats2);
1402                         }
1403                 }
1404
1405                 if (have_mcvs1 && have_mcvs2)
1406                 {
1407                         /*
1408                          * We have most-common-value lists for both relations.  Run
1409                          * through the lists to see which MCVs actually join to each
1410                          * other with the given operator.  This allows us to determine
1411                          * the exact join selectivity for the portion of the relations
1412                          * represented by the MCV lists.  We still have to estimate
1413                          * for the remaining population, but in a skewed distribution
1414                          * this gives us a big leg up in accuracy.      For motivation see
1415                          * the analysis in Y. Ioannidis and S. Christodoulakis, "On
1416                          * the propagation of errors in the size of join results",
1417                          * Technical Report 1018, Computer Science Dept., University
1418                          * of Wisconsin, Madison, March 1991 (available from
1419                          * ftp.cs.wisc.edu).
1420                          */
1421                         FmgrInfo        eqproc;
1422                         bool       *hasmatch1;
1423                         bool       *hasmatch2;
1424                         double          matchprodfreq,
1425                                                 matchfreq1,
1426                                                 matchfreq2,
1427                                                 unmatchfreq1,
1428                                                 unmatchfreq2,
1429                                                 otherfreq1,
1430                                                 otherfreq2,
1431                                                 totalsel1,
1432                                                 totalsel2;
1433                         int                     i,
1434                                                 nmatches;
1435
1436                         fmgr_info(get_opcode(operator), &eqproc);
1437                         hasmatch1 = (bool *) palloc(nvalues1 * sizeof(bool));
1438                         memset(hasmatch1, 0, nvalues1 * sizeof(bool));
1439                         hasmatch2 = (bool *) palloc(nvalues2 * sizeof(bool));
1440                         memset(hasmatch2, 0, nvalues2 * sizeof(bool));
1441
1442                         /*
1443                          * Note we assume that each MCV will match at most one member
1444                          * of the other MCV list.  If the operator isn't really
1445                          * equality, there could be multiple matches --- but we don't
1446                          * look for them, both for speed and because the math wouldn't
1447                          * add up...
1448                          */
1449                         matchprodfreq = 0.0;
1450                         nmatches = 0;
1451                         for (i = 0; i < nvalues1; i++)
1452                         {
1453                                 int                     j;
1454
1455                                 for (j = 0; j < nvalues2; j++)
1456                                 {
1457                                         if (hasmatch2[j])
1458                                                 continue;
1459                                         if (DatumGetBool(FunctionCall2(&eqproc,
1460                                                                                                    values1[i],
1461                                                                                                    values2[j])))
1462                                         {
1463                                                 hasmatch1[i] = hasmatch2[j] = true;
1464                                                 matchprodfreq += numbers1[i] * numbers2[j];
1465                                                 nmatches++;
1466                                                 break;
1467                                         }
1468                                 }
1469                         }
1470                         CLAMP_PROBABILITY(matchprodfreq);
1471                         /* Sum up frequencies of matched and unmatched MCVs */
1472                         matchfreq1 = unmatchfreq1 = 0.0;
1473                         for (i = 0; i < nvalues1; i++)
1474                         {
1475                                 if (hasmatch1[i])
1476                                         matchfreq1 += numbers1[i];
1477                                 else
1478                                         unmatchfreq1 += numbers1[i];
1479                         }
1480                         CLAMP_PROBABILITY(matchfreq1);
1481                         CLAMP_PROBABILITY(unmatchfreq1);
1482                         matchfreq2 = unmatchfreq2 = 0.0;
1483                         for (i = 0; i < nvalues2; i++)
1484                         {
1485                                 if (hasmatch2[i])
1486                                         matchfreq2 += numbers2[i];
1487                                 else
1488                                         unmatchfreq2 += numbers2[i];
1489                         }
1490                         CLAMP_PROBABILITY(matchfreq2);
1491                         CLAMP_PROBABILITY(unmatchfreq2);
1492                         pfree(hasmatch1);
1493                         pfree(hasmatch2);
1494
1495                         /*
1496                          * Compute total frequency of non-null values that are not in
1497                          * the MCV lists.
1498                          */
1499                         otherfreq1 = 1.0 - stats1->stanullfrac - matchfreq1 - unmatchfreq1;
1500                         otherfreq2 = 1.0 - stats2->stanullfrac - matchfreq2 - unmatchfreq2;
1501                         CLAMP_PROBABILITY(otherfreq1);
1502                         CLAMP_PROBABILITY(otherfreq2);
1503
1504                         /*
1505                          * We can estimate the total selectivity from the point of
1506                          * view of relation 1 as: the known selectivity for matched
1507                          * MCVs, plus unmatched MCVs that are assumed to match against
1508                          * random members of relation 2's non-MCV population, plus
1509                          * non-MCV values that are assumed to match against random
1510                          * members of relation 2's unmatched MCVs plus non-MCV values.
1511                          */
1512                         totalsel1 = matchprodfreq;
1513                         if (nd2 > nvalues2)
1514                                 totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - nvalues2);
1515                         if (nd2 > nmatches)
1516                                 totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) /
1517                                         (nd2 - nmatches);
1518                         /* Same estimate from the point of view of relation 2. */
1519                         totalsel2 = matchprodfreq;
1520                         if (nd1 > nvalues1)
1521                                 totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - nvalues1);
1522                         if (nd1 > nmatches)
1523                                 totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) /
1524                                         (nd1 - nmatches);
1525
1526                         /*
1527                          * Use the smaller of the two estimates.  This can be
1528                          * justified in essentially the same terms as given below for
1529                          * the no-stats case: to a first approximation, we are
1530                          * estimating from the point of view of the relation with
1531                          * smaller nd.
1532                          */
1533                         selec = (totalsel1 < totalsel2) ? totalsel1 : totalsel2;
1534                 }
1535                 else
1536                 {
1537                         /*
1538                          * We do not have MCV lists for both sides.  Estimate the join
1539                          * selectivity as MIN(1/nd1, 1/nd2).  This is plausible if we
1540                          * assume that the values are about equally distributed: a
1541                          * given tuple of rel1 will join to either 0 or N2/nd2 rows of
1542                          * rel2, so total join rows are at most N1*N2/nd2 giving a
1543                          * join selectivity of not more than 1/nd2.  By the same logic
1544                          * it is not more than 1/nd1, so MIN(1/nd1, 1/nd2) is an upper
1545                          * bound.  Using the MIN() means we estimate from the point of
1546                          * view of the relation with smaller nd (since the larger nd
1547                          * is determining the MIN).  It is reasonable to assume that
1548                          * most tuples in this rel will have join partners, so the
1549                          * bound is probably reasonably tight and should be taken
1550                          * as-is.
1551                          *
1552                          * XXX Can we be smarter if we have an MCV list for just one
1553                          * side? It seems that if we assume equal distribution for the
1554                          * other side, we end up with the same answer anyway.
1555                          */
1556                         if (nd1 > nd2)
1557                                 selec = 1.0 / nd1;
1558                         else
1559                                 selec = 1.0 / nd2;
1560                 }
1561
1562                 if (have_mcvs1)
1563                         free_attstatsslot(var1->vartype, values1, nvalues1,
1564                                                           numbers1, nnumbers1);
1565                 if (have_mcvs2)
1566                         free_attstatsslot(var2->vartype, values2, nvalues2,
1567                                                           numbers2, nnumbers2);
1568                 if (HeapTupleIsValid(statsTuple1))
1569                         ReleaseSysCache(statsTuple1);
1570                 if (HeapTupleIsValid(statsTuple2))
1571                         ReleaseSysCache(statsTuple2);
1572         }
1573
1574         CLAMP_PROBABILITY(selec);
1575
1576         PG_RETURN_FLOAT8((float8) selec);
1577 }
1578
1579 /*
1580  *              neqjoinsel              - Join selectivity of "!="
1581  */
1582 Datum
1583 neqjoinsel(PG_FUNCTION_ARGS)
1584 {
1585         Query      *root = (Query *) PG_GETARG_POINTER(0);
1586         Oid                     operator = PG_GETARG_OID(1);
1587         List       *args = (List *) PG_GETARG_POINTER(2);
1588         Oid                     eqop;
1589         float8          result;
1590
1591         /*
1592          * We want 1 - eqjoinsel() where the equality operator is the one
1593          * associated with this != operator, that is, its negator.
1594          */
1595         eqop = get_negator(operator);
1596         if (eqop)
1597         {
1598                 result = DatumGetFloat8(DirectFunctionCall3(eqjoinsel,
1599                                                                                                         PointerGetDatum(root),
1600                                                                                                   ObjectIdGetDatum(eqop),
1601                                                                                                  PointerGetDatum(args)));
1602
1603         }
1604         else
1605         {
1606                 /* Use default selectivity (should we raise an error instead?) */
1607                 result = DEFAULT_EQ_SEL;
1608         }
1609         result = 1.0 - result;
1610         PG_RETURN_FLOAT8(result);
1611 }
1612
1613 /*
1614  *              scalarltjoinsel - Join selectivity of "<" and "<=" for scalars
1615  */
1616 Datum
1617 scalarltjoinsel(PG_FUNCTION_ARGS)
1618 {
1619         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
1620 }
1621
1622 /*
1623  *              scalargtjoinsel - Join selectivity of ">" and ">=" for scalars
1624  */
1625 Datum
1626 scalargtjoinsel(PG_FUNCTION_ARGS)
1627 {
1628         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
1629 }
1630
1631 /*
1632  *              regexeqjoinsel  - Join selectivity of regular-expression pattern match.
1633  */
1634 Datum
1635 regexeqjoinsel(PG_FUNCTION_ARGS)
1636 {
1637         PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
1638 }
1639
1640 /*
1641  *              icregexeqjoinsel        - Join selectivity of case-insensitive regex match.
1642  */
1643 Datum
1644 icregexeqjoinsel(PG_FUNCTION_ARGS)
1645 {
1646         PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
1647 }
1648
1649 /*
1650  *              likejoinsel                     - Join selectivity of LIKE pattern match.
1651  */
1652 Datum
1653 likejoinsel(PG_FUNCTION_ARGS)
1654 {
1655         PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
1656 }
1657
1658 /*
1659  *              iclikejoinsel                   - Join selectivity of ILIKE pattern match.
1660  */
1661 Datum
1662 iclikejoinsel(PG_FUNCTION_ARGS)
1663 {
1664         PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
1665 }
1666
1667 /*
1668  *              regexnejoinsel  - Join selectivity of regex non-match.
1669  */
1670 Datum
1671 regexnejoinsel(PG_FUNCTION_ARGS)
1672 {
1673         float8          result;
1674
1675         result = DatumGetFloat8(regexeqjoinsel(fcinfo));
1676         result = 1.0 - result;
1677         PG_RETURN_FLOAT8(result);
1678 }
1679
1680 /*
1681  *              icregexnejoinsel        - Join selectivity of case-insensitive regex non-match.
1682  */
1683 Datum
1684 icregexnejoinsel(PG_FUNCTION_ARGS)
1685 {
1686         float8          result;
1687
1688         result = DatumGetFloat8(icregexeqjoinsel(fcinfo));
1689         result = 1.0 - result;
1690         PG_RETURN_FLOAT8(result);
1691 }
1692
1693 /*
1694  *              nlikejoinsel            - Join selectivity of LIKE pattern non-match.
1695  */
1696 Datum
1697 nlikejoinsel(PG_FUNCTION_ARGS)
1698 {
1699         float8          result;
1700
1701         result = DatumGetFloat8(likejoinsel(fcinfo));
1702         result = 1.0 - result;
1703         PG_RETURN_FLOAT8(result);
1704 }
1705
1706 /*
1707  *              icnlikejoinsel          - Join selectivity of ILIKE pattern non-match.
1708  */
1709 Datum
1710 icnlikejoinsel(PG_FUNCTION_ARGS)
1711 {
1712         float8          result;
1713
1714         result = DatumGetFloat8(iclikejoinsel(fcinfo));
1715         result = 1.0 - result;
1716         PG_RETURN_FLOAT8(result);
1717 }
1718
1719 /*
1720  * mergejoinscansel                     - Scan selectivity of merge join.
1721  *
1722  * A merge join will stop as soon as it exhausts either input stream.
1723  * Therefore, if we can estimate the ranges of both input variables,
1724  * we can estimate how much of the input will actually be read.  This
1725  * can have a considerable impact on the cost when using indexscans.
1726  *
1727  * clause should be a clause already known to be mergejoinable.
1728  *
1729  * *leftscan is set to the fraction of the left-hand variable expected
1730  * to be scanned (0 to 1), and similarly *rightscan for the right-hand
1731  * variable.
1732  */
1733 void
1734 mergejoinscansel(Query *root, Node *clause,
1735                                  Selectivity *leftscan,
1736                                  Selectivity *rightscan)
1737 {
1738         Var                *left,
1739                            *right;
1740         Oid                     opno,
1741                                 lsortop,
1742                                 rsortop,
1743                                 ltop,
1744                                 gtop,
1745                                 leop,
1746                                 revgtop,
1747                                 revleop;
1748         Datum           leftmax,
1749                                 rightmax;
1750         double          selec;
1751
1752         /* Set default results if we can't figure anything out. */
1753         *leftscan = *rightscan = 1.0;
1754
1755         /* Deconstruct the merge clause */
1756         if (!is_opclause(clause))
1757                 return;                                 /* shouldn't happen */
1758         opno = ((OpExpr *) clause)->opno;
1759         left = (Var *) get_leftop((Expr *) clause);
1760         right = (Var *) get_rightop((Expr *) clause);
1761         if (!right)
1762                 return;                                 /* shouldn't happen */
1763
1764         /* Can't do anything if inputs are not Vars */
1765         if (!IsA(left, Var) ||
1766                 !IsA(right, Var))
1767                 return;
1768
1769         /* Verify mergejoinability and get left and right "<" operators */
1770         if (!op_mergejoinable(opno,
1771                                                   &lsortop,
1772                                                   &rsortop))
1773                 return;                                 /* shouldn't happen */
1774
1775         /* Try to get maximum values of both vars */
1776         if (!get_var_maximum(root, left, lsortop, &leftmax))
1777                 return;                                 /* no max available from stats */
1778
1779         if (!get_var_maximum(root, right, rsortop, &rightmax))
1780                 return;                                 /* no max available from stats */
1781
1782         /* Look up the "left < right" and "left > right" operators */
1783         op_mergejoin_crossops(opno, &ltop, &gtop, NULL, NULL);
1784
1785         /* Look up the "left <= right" operator */
1786         leop = get_negator(gtop);
1787         if (!OidIsValid(leop))
1788                 return;                                 /* insufficient info in catalogs */
1789
1790         /* Look up the "right > left" operator */
1791         revgtop = get_commutator(ltop);
1792         if (!OidIsValid(revgtop))
1793                 return;                                 /* insufficient info in catalogs */
1794
1795         /* Look up the "right <= left" operator */
1796         revleop = get_negator(revgtop);
1797         if (!OidIsValid(revleop))
1798                 return;                                 /* insufficient info in catalogs */
1799
1800         /*
1801          * Now, the fraction of the left variable that will be scanned is the
1802          * fraction that's <= the right-side maximum value.  But only believe
1803          * non-default estimates, else stick with our 1.0.
1804          */
1805         selec = scalarineqsel(root, leop, false, left,
1806                                                   rightmax, right->vartype);
1807         if (selec != DEFAULT_INEQ_SEL)
1808                 *leftscan = selec;
1809
1810         /* And similarly for the right variable. */
1811         selec = scalarineqsel(root, revleop, false, right,
1812                                                   leftmax, left->vartype);
1813         if (selec != DEFAULT_INEQ_SEL)
1814                 *rightscan = selec;
1815
1816         /*
1817          * Only one of the two fractions can really be less than 1.0; believe
1818          * the smaller estimate and reset the other one to exactly 1.0.  If we
1819          * get exactly equal estimates (as can easily happen with self-joins),
1820          * believe neither.
1821          */
1822         if (*leftscan > *rightscan)
1823                 *leftscan = 1.0;
1824         else if (*leftscan < *rightscan)
1825                 *rightscan = 1.0;
1826         else
1827                 *leftscan = *rightscan = 1.0;
1828 }
1829
1830 /*
1831  * estimate_num_groups          - Estimate number of groups in a grouped query
1832  *
1833  * Given a query having a GROUP BY clause, estimate how many groups there
1834  * will be --- ie, the number of distinct combinations of the GROUP BY
1835  * expressions.
1836  *
1837  * This routine is also used to estimate the number of rows emitted by
1838  * a DISTINCT filtering step; that is an isomorphic problem.  (Note:
1839  * actually, we only use it for DISTINCT when there's no grouping or
1840  * aggregation ahead of the DISTINCT.)
1841  *
1842  * Inputs:
1843  *      root - the query
1844  *      groupExprs - list of expressions being grouped by
1845  *      input_rows - number of rows estimated to arrive at the group/unique
1846  *              filter step
1847  *
1848  * Given the lack of any cross-correlation statistics in the system, it's
1849  * impossible to do anything really trustworthy with GROUP BY conditions
1850  * involving multiple Vars.  We should however avoid assuming the worst
1851  * case (all possible cross-product terms actually appear as groups) since
1852  * very often the grouped-by Vars are highly correlated.  Our current approach
1853  * is as follows:
1854  *      1.      Reduce the given expressions to a list of unique Vars used.  For
1855  *              example, GROUP BY a, a + b is treated the same as GROUP BY a, b.
1856  *              It is clearly correct not to count the same Var more than once.
1857  *              It is also reasonable to treat f(x) the same as x: f() cannot
1858  *              increase the number of distinct values (unless it is volatile,
1859  *              which we consider unlikely for grouping), but it probably won't
1860  *              reduce the number of distinct values much either.
1861  *      2.      If the list contains Vars of different relations that are known equal
1862  *              due to equijoin clauses, then drop all but one of the Vars from each
1863  *              known-equal set, keeping the one with smallest estimated # of values
1864  *              (since the extra values of the others can't appear in joined rows).
1865  *              Note the reason we only consider Vars of different relations is that
1866  *              if we considered ones of the same rel, we'd be double-counting the
1867  *              restriction selectivity of the equality in the next step.
1868  *      3.      For Vars within a single source rel, we multiply together the numbers
1869  *              of values, clamp to the number of rows in the rel, and then multiply
1870  *              by the selectivity of the restriction clauses for that rel.  The
1871  *              initial product is probably too high (it's the worst case) but since
1872  *              we can clamp to the rel's rows it won't be hugely bad.  Multiplying
1873  *              by the restriction selectivity is effectively assuming that the
1874  *              restriction clauses are independent of the grouping, which is a crummy
1875  *              assumption, but it's hard to do better.
1876  *      4.      If there are Vars from multiple rels, we repeat step 3 for each such
1877  *              rel, and multiply the results together.
1878  * Note that rels not containing grouped Vars are ignored completely, as are
1879  * join clauses other than the equijoin clauses used in step 2.  Such rels
1880  * cannot increase the number of groups, and we assume such clauses do not
1881  * reduce the number either (somewhat bogus, but we don't have the info to
1882  * do better).
1883  */
1884 double
1885 estimate_num_groups(Query *root, List *groupExprs, double input_rows)
1886 {
1887         List       *allvars = NIL;
1888         List       *varinfos = NIL;
1889         double          numdistinct;
1890         List       *l;
1891         typedef struct {                        /* varinfos is a List of these */
1892                 Var        *var;
1893                 double  ndistinct;
1894         } MyVarInfo;
1895
1896         /* We should not be called unless query has GROUP BY (or DISTINCT) */
1897         Assert(groupExprs != NIL);
1898
1899         /* Step 1: get the unique Vars used */
1900         foreach(l, groupExprs)
1901         {
1902                 Node       *groupexpr = (Node *) lfirst(l);
1903                 List       *varshere;
1904
1905                 varshere = pull_var_clause(groupexpr, false);
1906                 /*
1907                  * If we find any variable-free GROUP BY item, then either it is
1908                  * a constant (and we can ignore it) or it contains a volatile
1909                  * function; in the latter case we punt and assume that each input
1910                  * row will yield a distinct group.
1911                  */
1912                 if (varshere == NIL)
1913                 {
1914                         if (contain_volatile_functions(groupexpr))
1915                                 return input_rows;
1916                         continue;
1917                 }
1918                 allvars = nconc(allvars, varshere);
1919         }
1920
1921         /* If now no Vars, we must have an all-constant GROUP BY list. */
1922         if (allvars == NIL)
1923                 return 1.0;
1924
1925         /* Use set_union() to discard duplicates */
1926         allvars = set_union(NIL, allvars);
1927
1928         /*
1929          * Step 2: acquire statistical estimate of number of distinct values
1930          * of each Var (total in its table, without regard for filtering).
1931          * Also, detect known-equal Vars and discard the ones we don't want.
1932          */
1933         foreach(l, allvars)
1934         {
1935                 Var        *var = (Var *) lfirst(l);
1936                 Oid             relid = getrelid(var->varno, root->rtable);
1937                 HeapTuple       statsTuple = NULL;
1938                 Form_pg_statistic stats = NULL;
1939                 double ndistinct;
1940                 bool    keep = true;
1941                 List   *l2;
1942
1943                 if (OidIsValid(relid))
1944                 {
1945                         statsTuple = SearchSysCache(STATRELATT,
1946                                                                                 ObjectIdGetDatum(relid),
1947                                                                                 Int16GetDatum(var->varattno),
1948                                                                                 0, 0);
1949                         if (HeapTupleIsValid(statsTuple))
1950                                 stats = (Form_pg_statistic) GETSTRUCT(statsTuple);
1951                 }
1952                 ndistinct = get_att_numdistinct(root, var, stats);
1953                 if (HeapTupleIsValid(statsTuple))
1954                         ReleaseSysCache(statsTuple);
1955
1956                 /* cannot use foreach here because of possible lremove */
1957                 l2 = varinfos;
1958                 while (l2)
1959                 {
1960                         MyVarInfo  *varinfo = (MyVarInfo *) lfirst(l2);
1961
1962                         /* must advance l2 before lremove possibly pfree's it */
1963                         l2 = lnext(l2);
1964
1965                         if (var->varno != varinfo->var->varno &&
1966                                 exprs_known_equal(root, (Node *) var, (Node *) varinfo->var))
1967                         {
1968                                 /* Found a match */
1969                                 if (varinfo->ndistinct <= ndistinct)
1970                                 {
1971                                         /* Keep older item, forget new one */
1972                                         keep = false;
1973                                         break;
1974                                 }
1975                                 else
1976                                 {
1977                                         /* Delete the older item */
1978                                         varinfos = lremove(varinfo, varinfos);
1979                                 }
1980                         }
1981                 }
1982
1983                 if (keep)
1984                 {
1985                         MyVarInfo  *varinfo = (MyVarInfo *) palloc(sizeof(MyVarInfo));
1986
1987                         varinfo->var = var;
1988                         varinfo->ndistinct = ndistinct;
1989                         varinfos = lcons(varinfo, varinfos);
1990                 }
1991         }
1992
1993         /*
1994          * Steps 3/4: group Vars by relation and estimate total numdistinct.
1995          *
1996          * For each iteration of the outer loop, we process the frontmost
1997          * Var in varinfos, plus all other Vars in the same relation.  We
1998          * remove these Vars from the newvarinfos list for the next iteration.
1999          * This is the easiest way to group Vars of same rel together.
2000          */
2001         Assert(varinfos != NIL);
2002         numdistinct = 1.0;
2003
2004         do
2005         {
2006                 MyVarInfo  *varinfo1 = (MyVarInfo *) lfirst(varinfos);
2007                 RelOptInfo *rel = find_base_rel(root, varinfo1->var->varno);
2008                 double  reldistinct = varinfo1->ndistinct;
2009                 List   *newvarinfos = NIL;
2010
2011                 /*
2012                  * Get the largest numdistinct estimate of the Vars for this rel.
2013                  * Also, construct new varinfos list of remaining Vars.
2014                  */
2015                 foreach(l, lnext(varinfos))
2016                 {
2017                         MyVarInfo  *varinfo2 = (MyVarInfo *) lfirst(l);
2018
2019                         if (varinfo2->var->varno == varinfo1->var->varno)
2020                         {
2021                                 reldistinct *= varinfo2->ndistinct;
2022                         }
2023                         else
2024                         {
2025                                 /* not time to process varinfo2 yet */
2026                                 newvarinfos = lcons(varinfo2, newvarinfos);
2027                         }
2028                 }
2029
2030                 /*
2031                  * Clamp to size of rel, multiply by restriction selectivity.
2032                  */
2033                 Assert(rel->reloptkind == RELOPT_BASEREL);
2034                 if (reldistinct > rel->tuples)
2035                         reldistinct = rel->tuples;
2036                 reldistinct *= rel->rows / rel->tuples;
2037
2038                 /*
2039                  * Update estimate of total distinct groups.
2040                  */
2041                 numdistinct *= reldistinct;
2042
2043                 varinfos = newvarinfos;
2044         } while (varinfos != NIL);
2045
2046         /* Guard against out-of-range answers */
2047         if (numdistinct > input_rows)
2048                 numdistinct = input_rows;
2049         if (numdistinct < 1.0)
2050                 numdistinct = 1.0;
2051
2052         return numdistinct;
2053 }
2054
2055
2056 /*-------------------------------------------------------------------------
2057  *
2058  * Support routines
2059  *
2060  *-------------------------------------------------------------------------
2061  */
2062
2063 /*
2064  * get_var_maximum
2065  *              Estimate the maximum value of the specified variable.
2066  *              If successful, store value in *max and return TRUE.
2067  *              If no data available, return FALSE.
2068  *
2069  * sortop is the "<" comparison operator to use.  (To extract the
2070  * minimum instead of the maximum, just pass the ">" operator instead.)
2071  */
2072 static bool
2073 get_var_maximum(Query *root, Var *var, Oid sortop, Datum *max)
2074 {
2075         Datum           tmax = 0;
2076         bool            have_max = false;
2077         Oid                     relid;
2078         HeapTuple       statsTuple;
2079         Form_pg_statistic stats;
2080         int16           typLen;
2081         bool            typByVal;
2082         Datum      *values;
2083         int                     nvalues;
2084         int                     i;
2085
2086         relid = getrelid(var->varno, root->rtable);
2087         if (relid == InvalidOid)
2088                 return false;
2089
2090         /* get stats for the attribute */
2091         statsTuple = SearchSysCache(STATRELATT,
2092                                                                 ObjectIdGetDatum(relid),
2093                                                                 Int16GetDatum(var->varattno),
2094                                                                 0, 0);
2095         if (!HeapTupleIsValid(statsTuple))
2096         {
2097                 /* no stats available, so default result */
2098                 return false;
2099         }
2100         stats = (Form_pg_statistic) GETSTRUCT(statsTuple);
2101
2102         get_typlenbyval(var->vartype, &typLen, &typByVal);
2103
2104         /*
2105          * If there is a histogram, grab the last or first value as
2106          * appropriate.
2107          *
2108          * If there is a histogram that is sorted with some other operator than
2109          * the one we want, fail --- this suggests that there is data we can't
2110          * use.
2111          */
2112         if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
2113                                                  STATISTIC_KIND_HISTOGRAM, sortop,
2114                                                  &values, &nvalues,
2115                                                  NULL, NULL))
2116         {
2117                 if (nvalues > 0)
2118                 {
2119                         tmax = datumCopy(values[nvalues - 1], typByVal, typLen);
2120                         have_max = true;
2121                 }
2122                 free_attstatsslot(var->vartype, values, nvalues, NULL, 0);
2123         }
2124         else
2125         {
2126                 Oid                     rsortop = get_commutator(sortop);
2127
2128                 if (OidIsValid(rsortop) &&
2129                         get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
2130                                                          STATISTIC_KIND_HISTOGRAM, rsortop,
2131                                                          &values, &nvalues,
2132                                                          NULL, NULL))
2133                 {
2134                         if (nvalues > 0)
2135                         {
2136                                 tmax = datumCopy(values[0], typByVal, typLen);
2137                                 have_max = true;
2138                         }
2139                         free_attstatsslot(var->vartype, values, nvalues, NULL, 0);
2140                 }
2141                 else if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
2142                                                                   STATISTIC_KIND_HISTOGRAM, InvalidOid,
2143                                                                   &values, &nvalues,
2144                                                                   NULL, NULL))
2145                 {
2146                         free_attstatsslot(var->vartype, values, nvalues, NULL, 0);
2147                         ReleaseSysCache(statsTuple);
2148                         return false;
2149                 }
2150         }
2151
2152         /*
2153          * If we have most-common-values info, look for a large MCV.  This is
2154          * needed even if we also have a histogram, since the histogram
2155          * excludes the MCVs.  However, usually the MCVs will not be the
2156          * extreme values, so avoid unnecessary data copying.
2157          */
2158         if (get_attstatsslot(statsTuple, var->vartype, var->vartypmod,
2159                                                  STATISTIC_KIND_MCV, InvalidOid,
2160                                                  &values, &nvalues,
2161                                                  NULL, NULL))
2162         {
2163                 bool            large_mcv = false;
2164                 FmgrInfo        opproc;
2165
2166                 fmgr_info(get_opcode(sortop), &opproc);
2167
2168                 for (i = 0; i < nvalues; i++)
2169                 {
2170                         if (!have_max)
2171                         {
2172                                 tmax = values[i];
2173                                 large_mcv = have_max = true;
2174                         }
2175                         else if (DatumGetBool(FunctionCall2(&opproc, tmax, values[i])))
2176                         {
2177                                 tmax = values[i];
2178                                 large_mcv = true;
2179                         }
2180                 }
2181                 if (large_mcv)
2182                         tmax = datumCopy(tmax, typByVal, typLen);
2183                 free_attstatsslot(var->vartype, values, nvalues, NULL, 0);
2184         }
2185
2186         ReleaseSysCache(statsTuple);
2187
2188         *max = tmax;
2189         return have_max;
2190 }
2191
2192 /*
2193  * convert_to_scalar
2194  *        Convert non-NULL values of the indicated types to the comparison
2195  *        scale needed by scalarltsel()/scalargtsel().
2196  *        Returns "true" if successful.
2197  *
2198  * XXX this routine is a hack: ideally we should look up the conversion
2199  * subroutines in pg_type.
2200  *
2201  * All numeric datatypes are simply converted to their equivalent
2202  * "double" values.  (NUMERIC values that are outside the range of "double"
2203  * are clamped to +/- HUGE_VAL.)
2204  *
2205  * String datatypes are converted by convert_string_to_scalar(),
2206  * which is explained below.  The reason why this routine deals with
2207  * three values at a time, not just one, is that we need it for strings.
2208  *
2209  * The bytea datatype is just enough different from strings that it has
2210  * to be treated separately.
2211  *
2212  * The several datatypes representing absolute times are all converted
2213  * to Timestamp, which is actually a double, and then we just use that
2214  * double value.  Note this will give correct results even for the "special"
2215  * values of Timestamp, since those are chosen to compare correctly;
2216  * see timestamp_cmp.
2217  *
2218  * The several datatypes representing relative times (intervals) are all
2219  * converted to measurements expressed in seconds.
2220  */
2221 static bool
2222 convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
2223                                   Datum lobound, Datum hibound, Oid boundstypid,
2224                                   double *scaledlobound, double *scaledhibound)
2225 {
2226         switch (valuetypid)
2227         {
2228                         /*
2229                          * Built-in numeric types
2230                          */
2231                 case BOOLOID:
2232                 case INT2OID:
2233                 case INT4OID:
2234                 case INT8OID:
2235                 case FLOAT4OID:
2236                 case FLOAT8OID:
2237                 case NUMERICOID:
2238                 case OIDOID:
2239                 case REGPROCOID:
2240                 case REGPROCEDUREOID:
2241                 case REGOPEROID:
2242                 case REGOPERATOROID:
2243                 case REGCLASSOID:
2244                 case REGTYPEOID:
2245                         *scaledvalue = convert_numeric_to_scalar(value, valuetypid);
2246                         *scaledlobound = convert_numeric_to_scalar(lobound, boundstypid);
2247                         *scaledhibound = convert_numeric_to_scalar(hibound, boundstypid);
2248                         return true;
2249
2250                         /*
2251                          * Built-in string types
2252                          */
2253                 case CHAROID:
2254                 case BPCHAROID:
2255                 case VARCHAROID:
2256                 case TEXTOID:
2257                 case NAMEOID:
2258                         {
2259                                 unsigned char *valstr = convert_string_datum(value, valuetypid);
2260                                 unsigned char *lostr = convert_string_datum(lobound, boundstypid);
2261                                 unsigned char *histr = convert_string_datum(hibound, boundstypid);
2262
2263                                 convert_string_to_scalar(valstr, scaledvalue,
2264                                                                                  lostr, scaledlobound,
2265                                                                                  histr, scaledhibound);
2266                                 pfree(valstr);
2267                                 pfree(lostr);
2268                                 pfree(histr);
2269                                 return true;
2270                         }
2271
2272                         /*
2273                          * Built-in bytea type
2274                          */
2275                 case BYTEAOID:
2276                         {
2277                                 convert_bytea_to_scalar(value, scaledvalue,
2278                                                                                 lobound, scaledlobound,
2279                                                                                 hibound, scaledhibound);
2280                                 return true;
2281                         }
2282
2283                         /*
2284                          * Built-in time types
2285                          */
2286                 case TIMESTAMPOID:
2287                 case TIMESTAMPTZOID:
2288                 case ABSTIMEOID:
2289                 case DATEOID:
2290                 case INTERVALOID:
2291                 case RELTIMEOID:
2292                 case TINTERVALOID:
2293                 case TIMEOID:
2294                 case TIMETZOID:
2295                         *scaledvalue = convert_timevalue_to_scalar(value, valuetypid);
2296                         *scaledlobound = convert_timevalue_to_scalar(lobound, boundstypid);
2297                         *scaledhibound = convert_timevalue_to_scalar(hibound, boundstypid);
2298                         return true;
2299
2300                         /*
2301                          * Built-in network types
2302                          */
2303                 case INETOID:
2304                 case CIDROID:
2305                 case MACADDROID:
2306                         *scaledvalue = convert_network_to_scalar(value, valuetypid);
2307                         *scaledlobound = convert_network_to_scalar(lobound, boundstypid);
2308                         *scaledhibound = convert_network_to_scalar(hibound, boundstypid);
2309                         return true;
2310         }
2311         /* Don't know how to convert */
2312         return false;
2313 }
2314
2315 /*
2316  * Do convert_to_scalar()'s work for any numeric data type.
2317  */
2318 static double
2319 convert_numeric_to_scalar(Datum value, Oid typid)
2320 {
2321         switch (typid)
2322         {
2323                 case BOOLOID:
2324                         return (double) DatumGetBool(value);
2325                 case INT2OID:
2326                         return (double) DatumGetInt16(value);
2327                 case INT4OID:
2328                         return (double) DatumGetInt32(value);
2329                 case INT8OID:
2330                         return (double) DatumGetInt64(value);
2331                 case FLOAT4OID:
2332                         return (double) DatumGetFloat4(value);
2333                 case FLOAT8OID:
2334                         return (double) DatumGetFloat8(value);
2335                 case NUMERICOID:
2336                         /* Note: out-of-range values will be clamped to +-HUGE_VAL */
2337                         return (double)
2338                                 DatumGetFloat8(DirectFunctionCall1(numeric_float8_no_overflow,
2339                                                                                                    value));
2340                 case OIDOID:
2341                 case REGPROCOID:
2342                 case REGPROCEDUREOID:
2343                 case REGOPEROID:
2344                 case REGOPERATOROID:
2345                 case REGCLASSOID:
2346                 case REGTYPEOID:
2347                         /* we can treat OIDs as integers... */
2348                         return (double) DatumGetObjectId(value);
2349         }
2350
2351         /*
2352          * Can't get here unless someone tries to use scalarltsel/scalargtsel
2353          * on an operator with one numeric and one non-numeric operand.
2354          */
2355         elog(ERROR, "convert_numeric_to_scalar: unsupported type %u", typid);
2356         return 0;
2357 }
2358
2359 /*
2360  * Do convert_to_scalar()'s work for any character-string data type.
2361  *
2362  * String datatypes are converted to a scale that ranges from 0 to 1,
2363  * where we visualize the bytes of the string as fractional digits.
2364  *
2365  * We do not want the base to be 256, however, since that tends to
2366  * generate inflated selectivity estimates; few databases will have
2367  * occurrences of all 256 possible byte values at each position.
2368  * Instead, use the smallest and largest byte values seen in the bounds
2369  * as the estimated range for each byte, after some fudging to deal with
2370  * the fact that we probably aren't going to see the full range that way.
2371  *
2372  * An additional refinement is that we discard any common prefix of the
2373  * three strings before computing the scaled values.  This allows us to
2374  * "zoom in" when we encounter a narrow data range.  An example is a phone
2375  * number database where all the values begin with the same area code.
2376  * (Actually, the bounds will be adjacent histogram-bin-boundary values,
2377  * so this is more likely to happen than you might think.)
2378  */
2379 static void
2380 convert_string_to_scalar(unsigned char *value,
2381                                                  double *scaledvalue,
2382                                                  unsigned char *lobound,
2383                                                  double *scaledlobound,
2384                                                  unsigned char *hibound,
2385                                                  double *scaledhibound)
2386 {
2387         int                     rangelo,
2388                                 rangehi;
2389         unsigned char *sptr;
2390
2391         rangelo = rangehi = hibound[0];
2392         for (sptr = lobound; *sptr; sptr++)
2393         {
2394                 if (rangelo > *sptr)
2395                         rangelo = *sptr;
2396                 if (rangehi < *sptr)
2397                         rangehi = *sptr;
2398         }
2399         for (sptr = hibound; *sptr; sptr++)
2400         {
2401                 if (rangelo > *sptr)
2402                         rangelo = *sptr;
2403                 if (rangehi < *sptr)
2404                         rangehi = *sptr;
2405         }
2406         /* If range includes any upper-case ASCII chars, make it include all */
2407         if (rangelo <= 'Z' && rangehi >= 'A')
2408         {
2409                 if (rangelo > 'A')
2410                         rangelo = 'A';
2411                 if (rangehi < 'Z')
2412                         rangehi = 'Z';
2413         }
2414         /* Ditto lower-case */
2415         if (rangelo <= 'z' && rangehi >= 'a')
2416         {
2417                 if (rangelo > 'a')
2418                         rangelo = 'a';
2419                 if (rangehi < 'z')
2420                         rangehi = 'z';
2421         }
2422         /* Ditto digits */
2423         if (rangelo <= '9' && rangehi >= '0')
2424         {
2425                 if (rangelo > '0')
2426                         rangelo = '0';
2427                 if (rangehi < '9')
2428                         rangehi = '9';
2429         }
2430
2431         /*
2432          * If range includes less than 10 chars, assume we have not got enough
2433          * data, and make it include regular ASCII set.
2434          */
2435         if (rangehi - rangelo < 9)
2436         {
2437                 rangelo = ' ';
2438                 rangehi = 127;
2439         }
2440
2441         /*
2442          * Now strip any common prefix of the three strings.
2443          */
2444         while (*lobound)
2445         {
2446                 if (*lobound != *hibound || *lobound != *value)
2447                         break;
2448                 lobound++, hibound++, value++;
2449         }
2450
2451         /*
2452          * Now we can do the conversions.
2453          */
2454         *scaledvalue = convert_one_string_to_scalar(value, rangelo, rangehi);
2455         *scaledlobound = convert_one_string_to_scalar(lobound, rangelo, rangehi);
2456         *scaledhibound = convert_one_string_to_scalar(hibound, rangelo, rangehi);
2457 }
2458
2459 static double
2460 convert_one_string_to_scalar(unsigned char *value, int rangelo, int rangehi)
2461 {
2462         int                     slen = strlen((char *) value);
2463         double          num,
2464                                 denom,
2465                                 base;
2466
2467         if (slen <= 0)
2468                 return 0.0;                             /* empty string has scalar value 0 */
2469
2470         /*
2471          * Since base is at least 10, need not consider more than about 20
2472          * chars
2473          */
2474         if (slen > 20)
2475                 slen = 20;
2476
2477         /* Convert initial characters to fraction */
2478         base = rangehi - rangelo + 1;
2479         num = 0.0;
2480         denom = base;
2481         while (slen-- > 0)
2482         {
2483                 int                     ch = *value++;
2484
2485                 if (ch < rangelo)
2486                         ch = rangelo - 1;
2487                 else if (ch > rangehi)
2488                         ch = rangehi + 1;
2489                 num += ((double) (ch - rangelo)) / denom;
2490                 denom *= base;
2491         }
2492
2493         return num;
2494 }
2495
2496 /*
2497  * Convert a string-type Datum into a palloc'd, null-terminated string.
2498  *
2499  * When using a non-C locale, we must pass the string through strxfrm()
2500  * before continuing, so as to generate correct locale-specific results.
2501  */
2502 static unsigned char *
2503 convert_string_datum(Datum value, Oid typid)
2504 {
2505         char       *val;
2506         char       *xfrmstr;
2507         size_t          xfrmsize;
2508         size_t          xfrmlen;
2509
2510         switch (typid)
2511         {
2512                 case CHAROID:
2513                         val = (char *) palloc(2);
2514                         val[0] = DatumGetChar(value);
2515                         val[1] = '\0';
2516                         break;
2517                 case BPCHAROID:
2518                 case VARCHAROID:
2519                 case TEXTOID:
2520                         {
2521                                 char       *str = (char *) VARDATA(DatumGetPointer(value));
2522                                 int                     strlength = VARSIZE(DatumGetPointer(value)) - VARHDRSZ;
2523
2524                                 val = (char *) palloc(strlength + 1);
2525                                 memcpy(val, str, strlength);
2526                                 val[strlength] = '\0';
2527                                 break;
2528                         }
2529                 case NAMEOID:
2530                         {
2531                                 NameData   *nm = (NameData *) DatumGetPointer(value);
2532
2533                                 val = pstrdup(NameStr(*nm));
2534                                 break;
2535                         }
2536                 default:
2537
2538                         /*
2539                          * Can't get here unless someone tries to use scalarltsel on
2540                          * an operator with one string and one non-string operand.
2541                          */
2542                         elog(ERROR, "convert_string_datum: unsupported type %u", typid);
2543                         return NULL;
2544         }
2545
2546         if (!lc_collate_is_c())
2547         {
2548                 /* Guess that transformed string is not much bigger than original */
2549                 xfrmsize = strlen(val) + 32;    /* arbitrary pad value here... */
2550                 xfrmstr = (char *) palloc(xfrmsize);
2551                 xfrmlen = strxfrm(xfrmstr, val, xfrmsize);
2552                 if (xfrmlen >= xfrmsize)
2553                 {
2554                         /* Oops, didn't make it */
2555                         pfree(xfrmstr);
2556                         xfrmstr = (char *) palloc(xfrmlen + 1);
2557                         xfrmlen = strxfrm(xfrmstr, val, xfrmlen + 1);
2558                 }
2559                 pfree(val);
2560                 val = xfrmstr;
2561         }
2562
2563         return (unsigned char *) val;
2564 }
2565
2566 /*
2567  * Do convert_to_scalar()'s work for any bytea data type.
2568  *
2569  * Very similar to convert_string_to_scalar except we can't assume
2570  * null-termination and therefore pass explicit lengths around.
2571  *
2572  * Also, assumptions about likely "normal" ranges of characters have been
2573  * removed - a data range of 0..255 is always used, for now.  (Perhaps
2574  * someday we will add information about actual byte data range to
2575  * pg_statistic.)
2576  */
2577 static void
2578 convert_bytea_to_scalar(Datum value,
2579                                                 double *scaledvalue,
2580                                                 Datum lobound,
2581                                                 double *scaledlobound,
2582                                                 Datum hibound,
2583                                                 double *scaledhibound)
2584 {
2585         int                     rangelo,
2586                                 rangehi,
2587                                 valuelen = VARSIZE(DatumGetPointer(value)) - VARHDRSZ,
2588                                 loboundlen = VARSIZE(DatumGetPointer(lobound)) - VARHDRSZ,
2589                                 hiboundlen = VARSIZE(DatumGetPointer(hibound)) - VARHDRSZ,
2590                                 i,
2591                                 minlen;
2592         unsigned char *valstr = (unsigned char *) VARDATA(DatumGetPointer(value)),
2593                            *lostr = (unsigned char *) VARDATA(DatumGetPointer(lobound)),
2594                            *histr = (unsigned char *) VARDATA(DatumGetPointer(hibound));
2595
2596         /*
2597          * Assume bytea data is uniformly distributed across all byte values.
2598          */
2599         rangelo = 0;
2600         rangehi = 255;
2601
2602         /*
2603          * Now strip any common prefix of the three strings.
2604          */
2605         minlen = Min(Min(valuelen, loboundlen), hiboundlen);
2606         for (i = 0; i < minlen; i++)
2607         {
2608                 if (*lostr != *histr || *lostr != *valstr)
2609                         break;
2610                 lostr++, histr++, valstr++;
2611                 loboundlen--, hiboundlen--, valuelen--;
2612         }
2613
2614         /*
2615          * Now we can do the conversions.
2616          */
2617         *scaledvalue = convert_one_bytea_to_scalar(valstr, valuelen, rangelo, rangehi);
2618         *scaledlobound = convert_one_bytea_to_scalar(lostr, loboundlen, rangelo, rangehi);
2619         *scaledhibound = convert_one_bytea_to_scalar(histr, hiboundlen, rangelo, rangehi);
2620 }
2621
2622 static double
2623 convert_one_bytea_to_scalar(unsigned char *value, int valuelen,
2624                                                         int rangelo, int rangehi)
2625 {
2626         double          num,
2627                                 denom,
2628                                 base;
2629
2630         if (valuelen <= 0)
2631                 return 0.0;                             /* empty string has scalar value 0 */
2632
2633         /*
2634          * Since base is 256, need not consider more than about 10 chars (even
2635          * this many seems like overkill)
2636          */
2637         if (valuelen > 10)
2638                 valuelen = 10;
2639
2640         /* Convert initial characters to fraction */
2641         base = rangehi - rangelo + 1;
2642         num = 0.0;
2643         denom = base;
2644         while (valuelen-- > 0)
2645         {
2646                 int                     ch = *value++;
2647
2648                 if (ch < rangelo)
2649                         ch = rangelo - 1;
2650                 else if (ch > rangehi)
2651                         ch = rangehi + 1;
2652                 num += ((double) (ch - rangelo)) / denom;
2653                 denom *= base;
2654         }
2655
2656         return num;
2657 }
2658
2659 /*
2660  * Do convert_to_scalar()'s work for any timevalue data type.
2661  */
2662 static double
2663 convert_timevalue_to_scalar(Datum value, Oid typid)
2664 {
2665         switch (typid)
2666         {
2667                 case TIMESTAMPOID:
2668                         return DatumGetTimestamp(value);
2669                 case TIMESTAMPTZOID:
2670                         return DatumGetTimestampTz(value);
2671                 case ABSTIMEOID:
2672                         return DatumGetTimestamp(DirectFunctionCall1(abstime_timestamp,
2673                                                                                                                  value));
2674                 case DATEOID:
2675                         return DatumGetTimestamp(DirectFunctionCall1(date_timestamp,
2676                                                                                                                  value));
2677                 case INTERVALOID:
2678                         {
2679                                 Interval   *interval = DatumGetIntervalP(value);
2680
2681                                 /*
2682                                  * Convert the month part of Interval to days using
2683                                  * assumed average month length of 365.25/12.0 days.  Not
2684                                  * too accurate, but plenty good enough for our purposes.
2685                                  */
2686 #ifdef HAVE_INT64_TIMESTAMP
2687                                 return (interval->time + (interval->month * ((365.25 / 12.0) * 86400000000.0)));
2688 #else
2689                                 return interval->time +
2690                                         interval->month * (365.25 / 12.0 * 24.0 * 60.0 * 60.0);
2691 #endif
2692                         }
2693                 case RELTIMEOID:
2694 #ifdef HAVE_INT64_TIMESTAMP
2695                         return (DatumGetRelativeTime(value) * 1000000.0);
2696 #else
2697                         return DatumGetRelativeTime(value);
2698 #endif
2699                 case TINTERVALOID:
2700                         {
2701                                 TimeInterval interval = DatumGetTimeInterval(value);
2702
2703 #ifdef HAVE_INT64_TIMESTAMP
2704                                 if (interval->status != 0)
2705                                         return ((interval->data[1] - interval->data[0]) * 1000000.0);
2706 #else
2707                                 if (interval->status != 0)
2708                                         return interval->data[1] - interval->data[0];
2709 #endif
2710                                 return 0;               /* for lack of a better idea */
2711                         }
2712                 case TIMEOID:
2713                         return DatumGetTimeADT(value);
2714                 case TIMETZOID:
2715                         {
2716                                 TimeTzADT  *timetz = DatumGetTimeTzADTP(value);
2717
2718                                 /* use GMT-equivalent time */
2719 #ifdef HAVE_INT64_TIMESTAMP
2720                                 return (double) (timetz->time + (timetz->zone * 1000000.0));
2721 #else
2722                                 return (double) (timetz->time + timetz->zone);
2723 #endif
2724                         }
2725         }
2726
2727         /*
2728          * Can't get here unless someone tries to use scalarltsel/scalargtsel
2729          * on an operator with one timevalue and one non-timevalue operand.
2730          */
2731         elog(ERROR, "convert_timevalue_to_scalar: unsupported type %u", typid);
2732         return 0;
2733 }
2734
2735
2736 /*
2737  * get_att_numdistinct
2738  *        Estimate the number of distinct values of an attribute.
2739  *
2740  * var: identifies the attribute to examine.
2741  * stats: pg_statistic tuple for attribute, or NULL if not available.
2742  *
2743  * NB: be careful to produce an integral result, since callers may compare
2744  * the result to exact integer counts.
2745  */
2746 static double
2747 get_att_numdistinct(Query *root, Var *var, Form_pg_statistic stats)
2748 {
2749         RelOptInfo *rel;
2750         double          ntuples;
2751
2752         /*
2753          * Special-case boolean columns: presumably, two distinct values.
2754          *
2755          * Are there any other cases we should wire in special estimates for?
2756          */
2757         if (var->vartype == BOOLOID)
2758                 return 2.0;
2759
2760         /*
2761          * Otherwise we need to get the relation size.
2762          */
2763         rel = find_base_rel(root, var->varno);
2764         ntuples = rel->tuples;
2765
2766         if (ntuples <= 0.0)
2767                 return DEFAULT_NUM_DISTINCT;    /* no data available; return a
2768                                                                                  * default */
2769
2770         /*
2771          * Look to see if there is a unique index on the attribute. If so, we
2772          * assume it's distinct, ignoring pg_statistic info which could be out
2773          * of date.
2774          */
2775         if (has_unique_index(rel, var->varattno))
2776                 return ntuples;
2777
2778         /*
2779          * If ANALYZE determined a fixed or scaled estimate, use it.
2780          */
2781         if (stats)
2782         {
2783                 if (stats->stadistinct > 0.0)
2784                         return stats->stadistinct;
2785                 if (stats->stadistinct < 0.0)
2786                         return floor((-stats->stadistinct * ntuples) + 0.5);
2787         }
2788
2789         /*
2790          * ANALYZE does not compute stats for system attributes, but some of
2791          * them can reasonably be assumed unique anyway.
2792          */
2793         switch (var->varattno)
2794         {
2795                 case ObjectIdAttributeNumber:
2796                 case SelfItemPointerAttributeNumber:
2797                         return ntuples;
2798                 case TableOidAttributeNumber:
2799                         return 1.0;
2800         }
2801
2802         /*
2803          * Estimate ndistinct = ntuples if the table is small, else use
2804          * default.
2805          */
2806         if (ntuples < DEFAULT_NUM_DISTINCT)
2807                 return ntuples;
2808
2809         return DEFAULT_NUM_DISTINCT;
2810 }
2811
2812 /*
2813  * get_restriction_var
2814  *              Examine the args of a restriction clause to see if it's of the
2815  *              form (var op something) or (something op var).  If so, extract
2816  *              and return the var and the other argument.
2817  *
2818  * Inputs:
2819  *      args: clause argument list
2820  *      varRelid: see specs for restriction selectivity functions
2821  *
2822  * Outputs: (these are set only if TRUE is returned)
2823  *      *var: gets Var node
2824  *      *other: gets other clause argument
2825  *      *varonleft: set TRUE if var is on the left, FALSE if on the right
2826  *
2827  * Returns TRUE if a Var is identified, otherwise FALSE.
2828  */
2829 static bool
2830 get_restriction_var(List *args,
2831                                         int varRelid,
2832                                         Var **var,
2833                                         Node **other,
2834                                         bool *varonleft)
2835 {
2836         Node       *left,
2837                            *right;
2838
2839         if (length(args) != 2)
2840                 return false;
2841
2842         left = (Node *) lfirst(args);
2843         right = (Node *) lsecond(args);
2844
2845         /* Ignore any binary-compatible relabeling */
2846
2847         if (IsA(left, RelabelType))
2848                 left = (Node *) ((RelabelType *) left)->arg;
2849         if (IsA(right, RelabelType))
2850                 right = (Node *) ((RelabelType *) right)->arg;
2851
2852         /* Look for the var */
2853
2854         if (IsA(left, Var) &&
2855                 (varRelid == 0 || varRelid == ((Var *) left)->varno))
2856         {
2857                 *var = (Var *) left;
2858                 *other = right;
2859                 *varonleft = true;
2860         }
2861         else if (IsA(right, Var) &&
2862                          (varRelid == 0 || varRelid == ((Var *) right)->varno))
2863         {
2864                 *var = (Var *) right;
2865                 *other = left;
2866                 *varonleft = false;
2867         }
2868         else
2869         {
2870                 /* Duh, it's too complicated for me... */
2871                 return false;
2872         }
2873
2874         return true;
2875 }
2876
2877 /*
2878  * get_join_vars
2879  *
2880  * Extract the two Vars from a join clause's argument list.  Returns
2881  * NULL for arguments that are not simple vars.
2882  */
2883 static void
2884 get_join_vars(List *args, Var **var1, Var **var2)
2885 {
2886         Node       *left,
2887                            *right;
2888
2889         if (length(args) != 2)
2890         {
2891                 *var1 = NULL;
2892                 *var2 = NULL;
2893                 return;
2894         }
2895
2896         left = (Node *) lfirst(args);
2897         right = (Node *) lsecond(args);
2898
2899         /* Ignore any binary-compatible relabeling */
2900         if (IsA(left, RelabelType))
2901                 left = (Node *) ((RelabelType *) left)->arg;
2902         if (IsA(right, RelabelType))
2903                 right = (Node *) ((RelabelType *) right)->arg;
2904
2905         if (IsA(left, Var))
2906                 *var1 = (Var *) left;
2907         else
2908                 *var1 = NULL;
2909
2910         if (IsA(right, Var))
2911                 *var2 = (Var *) right;
2912         else
2913                 *var2 = NULL;
2914 }
2915
2916 /*-------------------------------------------------------------------------
2917  *
2918  * Pattern analysis functions
2919  *
2920  * These routines support analysis of LIKE and regular-expression patterns
2921  * by the planner/optimizer.  It's important that they agree with the
2922  * regular-expression code in backend/regex/ and the LIKE code in
2923  * backend/utils/adt/like.c.
2924  *
2925  * Note that the prefix-analysis functions are called from
2926  * backend/optimizer/path/indxpath.c as well as from routines in this file.
2927  *
2928  *-------------------------------------------------------------------------
2929  */
2930
2931 /*
2932  * Extract the fixed prefix, if any, for a pattern.
2933  * *prefix is set to a palloc'd prefix string,
2934  * or to NULL if no fixed prefix exists for the pattern.
2935  * *rest is set to point to the remainder of the pattern after the
2936  * portion describing the fixed prefix.
2937  * The return value distinguishes no fixed prefix, a partial prefix,
2938  * or an exact-match-only pattern.
2939  */
2940
2941 static Pattern_Prefix_Status
2942 like_fixed_prefix(Const *patt_const, bool case_insensitive,
2943                                   Const **prefix_const, Const **rest_const)
2944 {
2945         char       *match;
2946         char       *patt;
2947         int                     pattlen;
2948         char       *prefix;
2949         char       *rest;
2950         Oid                     typeid = patt_const->consttype;
2951         int                     pos,
2952                                 match_pos;
2953
2954         /* the right-hand const is type text or bytea */
2955         Assert(typeid == BYTEAOID || typeid == TEXTOID);
2956
2957         if (typeid == BYTEAOID && case_insensitive)
2958                 elog(ERROR, "Cannot perform case insensitive matching on type BYTEA");
2959
2960         if (typeid != BYTEAOID)
2961         {
2962                 patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
2963                 pattlen = strlen(patt);
2964         }
2965         else
2966         {
2967                 patt = DatumGetCString(DirectFunctionCall1(byteaout, patt_const->constvalue));
2968                 pattlen = toast_raw_datum_size(patt_const->constvalue) - VARHDRSZ;
2969         }
2970
2971         prefix = match = palloc(pattlen + 1);
2972         match_pos = 0;
2973
2974         for (pos = 0; pos < pattlen; pos++)
2975         {
2976                 /* % and _ are wildcard characters in LIKE */
2977                 if (patt[pos] == '%' ||
2978                         patt[pos] == '_')
2979                         break;
2980                 /* Backslash quotes the next character */
2981                 if (patt[pos] == '\\')
2982                 {
2983                         pos++;
2984                         if (patt[pos] == '\0' && typeid != BYTEAOID)
2985                                 break;
2986                 }
2987
2988                 /*
2989                  * XXX I suspect isalpha() is not an adequately locale-sensitive
2990                  * test for characters that can vary under case folding?
2991                  */
2992                 if (case_insensitive && isalpha((unsigned char) patt[pos]))
2993                         break;
2994
2995                 /*
2996                  * NOTE: this code used to think that %% meant a literal %, but
2997                  * textlike() itself does not think that, and the SQL92 spec
2998                  * doesn't say any such thing either.
2999                  */
3000                 match[match_pos++] = patt[pos];
3001         }
3002
3003         match[match_pos] = '\0';
3004         rest = &patt[pos];
3005
3006         *prefix_const = string_to_const(prefix, typeid);
3007         *rest_const = string_to_const(rest, typeid);
3008
3009         pfree(patt);
3010         pfree(match);
3011         prefix = NULL;
3012
3013         /* in LIKE, an empty pattern is an exact match! */
3014         if (pos == pattlen)
3015                 return Pattern_Prefix_Exact;    /* reached end of pattern, so
3016                                                                                  * exact */
3017
3018         if (match_pos > 0)
3019                 return Pattern_Prefix_Partial;
3020
3021         return Pattern_Prefix_None;
3022 }
3023
3024 static Pattern_Prefix_Status
3025 regex_fixed_prefix(Const *patt_const, bool case_insensitive,
3026                                    Const **prefix_const, Const **rest_const)
3027 {
3028         char       *match;
3029         int                     pos,
3030                                 match_pos,
3031                                 paren_depth;
3032         char       *patt;
3033         char       *prefix;
3034         char       *rest;
3035         Oid                     typeid = patt_const->consttype;
3036
3037         /*
3038          * Should be unnecessary, there are no bytea regex operators defined.
3039          * As such, it should be noted that the rest of this function has *not*
3040          * been made safe for binary (possibly NULL containing) strings.
3041          */
3042         if (typeid == BYTEAOID)
3043                 elog(ERROR, "Regex matching not supported on type BYTEA");
3044
3045         /* the right-hand const is type text for all of these */
3046         patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
3047
3048         /* Pattern must be anchored left */
3049         if (patt[0] != '^')
3050         {
3051                 rest = patt;
3052
3053                 *prefix_const = NULL;
3054                 *rest_const = string_to_const(rest, typeid);
3055
3056                 return Pattern_Prefix_None;
3057         }
3058
3059         /*
3060          * If unquoted | is present at paren level 0 in pattern, then there
3061          * are multiple alternatives for the start of the string.
3062          */
3063         paren_depth = 0;
3064         for (pos = 1; patt[pos]; pos++)
3065         {
3066                 if (patt[pos] == '|' && paren_depth == 0)
3067                 {
3068                         rest = patt;
3069
3070                         *prefix_const = NULL;
3071                         *rest_const = string_to_const(rest, typeid);
3072
3073                         return Pattern_Prefix_None;
3074                 }
3075                 else if (patt[pos] == '(')
3076                         paren_depth++;
3077                 else if (patt[pos] == ')' && paren_depth > 0)
3078                         paren_depth--;
3079                 else if (patt[pos] == '\\')
3080                 {
3081                         /* backslash quotes the next character */
3082                         pos++;
3083                         if (patt[pos] == '\0')
3084                                 break;
3085                 }
3086         }
3087
3088         /* OK, allocate space for pattern */
3089         prefix = match = palloc(strlen(patt) + 1);
3090         match_pos = 0;
3091
3092         /* note start at pos 1 to skip leading ^ */
3093         for (pos = 1; patt[pos]; pos++)
3094         {
3095                 /*
3096                  * Check for characters that indicate multiple possible matches
3097                  * here. XXX I suspect isalpha() is not an adequately
3098                  * locale-sensitive test for characters that can vary under case
3099                  * folding?
3100                  */
3101                 if (patt[pos] == '.' ||
3102                         patt[pos] == '(' ||
3103                         patt[pos] == '[' ||
3104                         patt[pos] == '$' ||
3105                         (case_insensitive && isalpha((unsigned char) patt[pos])))
3106                         break;
3107
3108                 /*
3109                  * Check for quantifiers.  Except for +, this means the preceding
3110                  * character is optional, so we must remove it from the prefix
3111                  * too!
3112                  */
3113                 if (patt[pos] == '*' ||
3114                         patt[pos] == '?' ||
3115                         patt[pos] == '{')
3116                 {
3117                         if (match_pos > 0)
3118                                 match_pos--;
3119                         pos--;
3120                         break;
3121                 }
3122                 if (patt[pos] == '+')
3123                 {
3124                         pos--;
3125                         break;
3126                 }
3127                 if (patt[pos] == '\\')
3128                 {
3129                         /* backslash quotes the next character */
3130                         pos++;
3131                         if (patt[pos] == '\0')
3132                                 break;
3133                 }
3134                 match[match_pos++] = patt[pos];
3135         }
3136
3137         match[match_pos] = '\0';
3138         rest = &patt[pos];
3139
3140         if (patt[pos] == '$' && patt[pos + 1] == '\0')
3141         {
3142                 rest = &patt[pos + 1];
3143
3144                 *prefix_const = string_to_const(prefix, typeid);
3145                 *rest_const = string_to_const(rest, typeid);
3146
3147                 return Pattern_Prefix_Exact;    /* pattern specifies exact match */
3148         }
3149
3150         *prefix_const = string_to_const(prefix, typeid);
3151         *rest_const = string_to_const(rest, typeid);
3152
3153         pfree(patt);
3154         pfree(match);
3155         prefix = NULL;
3156
3157         if (match_pos > 0)
3158                 return Pattern_Prefix_Partial;
3159
3160         return Pattern_Prefix_None;
3161 }
3162
3163 Pattern_Prefix_Status
3164 pattern_fixed_prefix(Const *patt, Pattern_Type ptype,
3165                                          Const **prefix, Const **rest)
3166 {
3167         Pattern_Prefix_Status result;
3168
3169         switch (ptype)
3170         {
3171                 case Pattern_Type_Like:
3172                         result = like_fixed_prefix(patt, false, prefix, rest);
3173                         break;
3174                 case Pattern_Type_Like_IC:
3175                         result = like_fixed_prefix(patt, true, prefix, rest);
3176                         break;
3177                 case Pattern_Type_Regex:
3178                         result = regex_fixed_prefix(patt, false, prefix, rest);
3179                         break;
3180                 case Pattern_Type_Regex_IC:
3181                         result = regex_fixed_prefix(patt, true, prefix, rest);
3182                         break;
3183                 default:
3184                         elog(ERROR, "pattern_fixed_prefix: bogus ptype");
3185                         result = Pattern_Prefix_None;           /* keep compiler quiet */
3186                         break;
3187         }
3188         return result;
3189 }
3190
3191 /*
3192  * Estimate the selectivity of a fixed prefix for a pattern match.
3193  *
3194  * A fixed prefix "foo" is estimated as the selectivity of the expression
3195  * "var >= 'foo' AND var < 'fop'" (see also indxqual.c).
3196  *
3197  * XXX Note: we make use of the upper bound to estimate operator selectivity
3198  * even if the locale is such that we cannot rely on the upper-bound string.
3199  * The selectivity only needs to be approximately right anyway, so it seems
3200  * more useful to use the upper-bound code than not.
3201  */
3202 static Selectivity
3203 prefix_selectivity(Query *root, Var *var, Const *prefixcon)
3204 {
3205         Selectivity prefixsel;
3206         Oid                     cmpopr;
3207         char       *prefix;
3208         List       *cmpargs;
3209         Const      *greaterstrcon;
3210
3211         cmpopr = find_operator(">=", var->vartype);
3212         if (cmpopr == InvalidOid)
3213                 elog(ERROR, "prefix_selectivity: no >= operator for type %u",
3214                          var->vartype);
3215         if (prefixcon->consttype != BYTEAOID)
3216                 prefix = DatumGetCString(DirectFunctionCall1(textout, prefixcon->constvalue));
3217         else
3218                 prefix = DatumGetCString(DirectFunctionCall1(byteaout, prefixcon->constvalue));
3219
3220         /* If var is type NAME, must adjust type of comparison constant */
3221         if (var->vartype == NAMEOID)
3222                 prefixcon = string_to_const(prefix, NAMEOID);
3223
3224         cmpargs = makeList2(var, prefixcon);
3225         /* Assume scalargtsel is appropriate for all supported types */
3226         prefixsel = DatumGetFloat8(DirectFunctionCall4(scalargtsel,
3227                                                                                                    PointerGetDatum(root),
3228                                                                                                 ObjectIdGetDatum(cmpopr),
3229                                                                                                 PointerGetDatum(cmpargs),
3230                                                                                                    Int32GetDatum(0)));
3231
3232         /*-------
3233          * If we can create a string larger than the prefix, say
3234          *      "x < greaterstr".
3235          *-------
3236          */
3237         greaterstrcon = make_greater_string(prefixcon);
3238         if (greaterstrcon)
3239         {
3240                 Selectivity topsel;
3241
3242                 cmpopr = find_operator("<", var->vartype);
3243                 if (cmpopr == InvalidOid)
3244                         elog(ERROR, "prefix_selectivity: no < operator for type %u",
3245                                  var->vartype);
3246                 cmpargs = makeList2(var, greaterstrcon);
3247                 /* Assume scalarltsel is appropriate for all supported types */
3248                 topsel = DatumGetFloat8(DirectFunctionCall4(scalarltsel,
3249                                                                                                         PointerGetDatum(root),
3250                                                                                                 ObjectIdGetDatum(cmpopr),
3251                                                                                                 PointerGetDatum(cmpargs),
3252                                                                                                         Int32GetDatum(0)));
3253
3254                 /*
3255                  * Merge the two selectivities in the same way as for a range
3256                  * query (see clauselist_selectivity()).
3257                  */
3258                 prefixsel = topsel + prefixsel - 1.0;
3259
3260                 /* Adjust for double-exclusion of NULLs */
3261                 prefixsel += nulltestsel(root, IS_NULL, (Node *) var, var->varno);
3262
3263                 /*
3264                  * A zero or slightly negative prefixsel should be converted into
3265                  * a small positive value; we probably are dealing with a very
3266                  * tight range and got a bogus result due to roundoff errors.
3267                  * However, if prefixsel is very negative, then we probably have
3268                  * default selectivity estimates on one or both sides of the
3269                  * range.  In that case, insert a not-so-wildly-optimistic default
3270                  * estimate.
3271                  */
3272                 if (prefixsel <= 0.0)
3273                 {
3274                         if (prefixsel < -0.01)
3275                         {
3276                                 /*
3277                                  * No data available --- use a default estimate that is
3278                                  * small, but not real small.
3279                                  */
3280                                 prefixsel = 0.005;
3281                         }
3282                         else
3283                         {
3284                                 /*
3285                                  * It's just roundoff error; use a small positive value
3286                                  */
3287                                 prefixsel = 1.0e-10;
3288                         }
3289                 }
3290         }
3291
3292         return prefixsel;
3293 }
3294
3295
3296 /*
3297  * Estimate the selectivity of a pattern of the specified type.
3298  * Note that any fixed prefix of the pattern will have been removed already.
3299  *
3300  * For now, we use a very simplistic approach: fixed characters reduce the
3301  * selectivity a good deal, character ranges reduce it a little,
3302  * wildcards (such as % for LIKE or .* for regex) increase it.
3303  */
3304
3305 #define FIXED_CHAR_SEL  0.20    /* about 1/5 */
3306 #define CHAR_RANGE_SEL  0.25
3307 #define ANY_CHAR_SEL    0.9             /* not 1, since it won't match
3308                                                                  * end-of-string */
3309 #define FULL_WILDCARD_SEL 5.0
3310 #define PARTIAL_WILDCARD_SEL 2.0
3311
3312 static Selectivity
3313 like_selectivity(Const *patt_const, bool case_insensitive)
3314 {
3315         Selectivity sel = 1.0;
3316         int                     pos;
3317         int                     start;
3318         Oid                     typeid = patt_const->consttype;
3319         char       *patt;
3320         int                     pattlen;
3321
3322         /* the right-hand const is type text or bytea */
3323         Assert(typeid == BYTEAOID || typeid == TEXTOID);
3324
3325         if (typeid == BYTEAOID && case_insensitive)
3326                 elog(ERROR, "Cannot perform case insensitive matching on type BYTEA");
3327
3328         if (typeid != BYTEAOID)
3329         {
3330                 patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
3331                 pattlen = strlen(patt);
3332         }
3333         else
3334         {
3335                 patt = DatumGetCString(DirectFunctionCall1(byteaout, patt_const->constvalue));
3336                 pattlen = toast_raw_datum_size(patt_const->constvalue) - VARHDRSZ;
3337         }
3338
3339         /* Skip any leading %; it's already factored into initial sel */
3340         start = (*patt == '%') ? 1 : 0;
3341         for (pos = start; pos < pattlen; pos++)
3342         {
3343                 /* % and _ are wildcard characters in LIKE */
3344                 if (patt[pos] == '%')
3345                         sel *= FULL_WILDCARD_SEL;
3346                 else if (patt[pos] == '_')
3347                         sel *= ANY_CHAR_SEL;
3348                 else if (patt[pos] == '\\')
3349                 {
3350                         /* Backslash quotes the next character */
3351                         pos++;
3352                         if (patt[pos] == '\0' && typeid != BYTEAOID)
3353                                 break;
3354                         sel *= FIXED_CHAR_SEL;
3355                 }
3356                 else
3357                         sel *= FIXED_CHAR_SEL;
3358         }
3359         /* Could get sel > 1 if multiple wildcards */
3360         if (sel > 1.0)
3361                 sel = 1.0;
3362         return sel;
3363 }
3364
3365 static Selectivity
3366 regex_selectivity_sub(char *patt, int pattlen, bool case_insensitive)
3367 {
3368         Selectivity sel = 1.0;
3369         int                     paren_depth = 0;
3370         int                     paren_pos = 0;  /* dummy init to keep compiler quiet */
3371         int                     pos;
3372
3373         for (pos = 0; pos < pattlen; pos++)
3374         {
3375                 if (patt[pos] == '(')
3376                 {
3377                         if (paren_depth == 0)
3378                                 paren_pos = pos;        /* remember start of parenthesized item */
3379                         paren_depth++;
3380                 }
3381                 else if (patt[pos] == ')' && paren_depth > 0)
3382                 {
3383                         paren_depth--;
3384                         if (paren_depth == 0)
3385                                 sel *= regex_selectivity_sub(patt + (paren_pos + 1),
3386                                                                                          pos - (paren_pos + 1),
3387                                                                                          case_insensitive);
3388                 }
3389                 else if (patt[pos] == '|' && paren_depth == 0)
3390                 {
3391                         /*
3392                          * If unquoted | is present at paren level 0 in pattern, we
3393                          * have multiple alternatives; sum their probabilities.
3394                          */
3395                         sel += regex_selectivity_sub(patt + (pos + 1),
3396                                                                                  pattlen - (pos + 1),
3397                                                                                  case_insensitive);
3398                         break;                          /* rest of pattern is now processed */
3399                 }
3400                 else if (patt[pos] == '[')
3401                 {
3402                         bool            negclass = false;
3403
3404                         if (patt[++pos] == '^')
3405                         {
3406                                 negclass = true;
3407                                 pos++;
3408                         }
3409                         if (patt[pos] == ']')           /* ']' at start of class is not
3410                                                                                  * special */
3411                                 pos++;
3412                         while (pos < pattlen && patt[pos] != ']')
3413                                 pos++;
3414                         if (paren_depth == 0)
3415                                 sel *= (negclass ? (1.0 - CHAR_RANGE_SEL) : CHAR_RANGE_SEL);
3416                 }
3417                 else if (patt[pos] == '.')
3418                 {
3419                         if (paren_depth == 0)
3420                                 sel *= ANY_CHAR_SEL;
3421                 }
3422                 else if (patt[pos] == '*' ||
3423                                  patt[pos] == '?' ||
3424                                  patt[pos] == '+')
3425                 {
3426                         /* Ought to be smarter about quantifiers... */
3427                         if (paren_depth == 0)
3428                                 sel *= PARTIAL_WILDCARD_SEL;
3429                 }
3430                 else if (patt[pos] == '{')
3431                 {
3432                         while (pos < pattlen && patt[pos] != '}')
3433                                 pos++;
3434                         if (paren_depth == 0)
3435                                 sel *= PARTIAL_WILDCARD_SEL;
3436                 }
3437                 else if (patt[pos] == '\\')
3438                 {
3439                         /* backslash quotes the next character */
3440                         pos++;
3441                         if (pos >= pattlen)
3442                                 break;
3443                         if (paren_depth == 0)
3444                                 sel *= FIXED_CHAR_SEL;
3445                 }
3446                 else
3447                 {
3448                         if (paren_depth == 0)
3449                                 sel *= FIXED_CHAR_SEL;
3450                 }
3451         }
3452         /* Could get sel > 1 if multiple wildcards */
3453         if (sel > 1.0)
3454                 sel = 1.0;
3455         return sel;
3456 }
3457
3458 static Selectivity
3459 regex_selectivity(Const *patt_const, bool case_insensitive)
3460 {
3461         Selectivity sel;
3462         char       *patt;
3463         int                     pattlen;
3464         Oid                     typeid = patt_const->consttype;
3465
3466         /*
3467          * Should be unnecessary, there are no bytea regex operators defined.
3468          * As such, it should be noted that the rest of this function has *not*
3469          * been made safe for binary (possibly NULL containing) strings.
3470          */
3471         if (typeid == BYTEAOID)
3472                 elog(ERROR, "Regex matching not supported on type BYTEA");
3473
3474         /* the right-hand const is type text for all of these */
3475         patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
3476         pattlen = strlen(patt);
3477
3478         /* If patt doesn't end with $, consider it to have a trailing wildcard */
3479         if (pattlen > 0 && patt[pattlen - 1] == '$' &&
3480                 (pattlen == 1 || patt[pattlen - 2] != '\\'))
3481         {
3482                 /* has trailing $ */
3483                 sel = regex_selectivity_sub(patt, pattlen - 1, case_insensitive);
3484         }
3485         else
3486         {
3487                 /* no trailing $ */
3488                 sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
3489                 sel *= FULL_WILDCARD_SEL;
3490                 if (sel > 1.0)
3491                         sel = 1.0;
3492         }
3493         return sel;
3494 }
3495
3496 static Selectivity
3497 pattern_selectivity(Const *patt, Pattern_Type ptype)
3498 {
3499         Selectivity result;
3500
3501         switch (ptype)
3502         {
3503                 case Pattern_Type_Like:
3504                         result = like_selectivity(patt, false);
3505                         break;
3506                 case Pattern_Type_Like_IC:
3507                         result = like_selectivity(patt, true);
3508                         break;
3509                 case Pattern_Type_Regex:
3510                         result = regex_selectivity(patt, false);
3511                         break;
3512                 case Pattern_Type_Regex_IC:
3513                         result = regex_selectivity(patt, true);
3514                         break;
3515                 default:
3516                         elog(ERROR, "pattern_selectivity: bogus ptype");
3517                         result = 1.0;           /* keep compiler quiet */
3518                         break;
3519         }
3520         return result;
3521 }
3522
3523
3524 /*
3525  * We want to test whether the database's LC_COLLATE setting is safe for
3526  * LIKE/regexp index optimization.
3527  *
3528  * The key requirement here is that given a prefix string, say "foo",
3529  * we must be able to generate another string "fop" that is greater
3530  * than all strings "foobar" starting with "foo".  Unfortunately, a
3531  * non-C locale may have arbitrary collation rules in which "fop" >
3532  * "foo" is not sufficient to ensure "fop" > "foobar".  Until we can
3533  * come up with a more bulletproof way of generating the upper-bound
3534  * string, the optimization is disabled in all non-C locales.
3535  *
3536  * (In theory, locales other than C may be LIKE-safe so this function
3537  * could be different from lc_collate_is_c(), but in a different
3538  * theory, non-C locales are completely unpredictable so it's unlikely
3539  * to happen.)
3540  *
3541  * Be sure to maintain the correspondence with the code in initdb.
3542  */
3543 bool
3544 locale_is_like_safe(void)
3545 {
3546         return lc_collate_is_c();
3547 }
3548
3549 /*
3550  * Try to generate a string greater than the given string or any string it is
3551  * a prefix of.  If successful, return a palloc'd string; else return NULL.
3552  *
3553  * To work correctly in non-ASCII locales with weird collation orders,
3554  * we cannot simply increment "foo" to "fop" --- we have to check whether
3555  * we actually produced a string greater than the given one.  If not,
3556  * increment the righthand byte again and repeat.  If we max out the righthand
3557  * byte, truncate off the last character and start incrementing the next.
3558  * For example, if "z" were the last character in the sort order, then we
3559  * could produce "foo" as a string greater than "fonz".
3560  *
3561  * This could be rather slow in the worst case, but in most cases we won't
3562  * have to try more than one or two strings before succeeding.
3563  *
3564  * XXX this is actually not sufficient, since it only copes with the case
3565  * where individual characters collate in an order different from their
3566  * numeric code assignments.  It does not handle cases where there are
3567  * cross-character effects, such as specially sorted digraphs, multiple
3568  * sort passes, etc.  For now, we just shut down the whole thing in locales
3569  * that do such things :-(
3570  */
3571 Const *
3572 make_greater_string(const Const *str_const)
3573 {
3574         Oid                     datatype = str_const->consttype;
3575         char       *str;
3576         char       *workstr;
3577         int                     len;
3578
3579         /* Get the string and a modifiable copy */
3580         if (datatype == NAMEOID)
3581         {
3582                 str = DatumGetCString(DirectFunctionCall1(nameout, str_const->constvalue));
3583                 len = strlen(str);
3584         }
3585         else if (datatype == BYTEAOID)
3586         {
3587                 str = DatumGetCString(DirectFunctionCall1(byteaout, str_const->constvalue));
3588                 len = toast_raw_datum_size(str_const->constvalue) - VARHDRSZ;
3589         }
3590         else
3591         {
3592                 str = DatumGetCString(DirectFunctionCall1(textout, str_const->constvalue));
3593                 len = strlen(str);
3594         }
3595         workstr = pstrdup(str);
3596
3597         while (len > 0)
3598         {
3599                 unsigned char *lastchar = (unsigned char *) (workstr + len - 1);
3600                 unsigned char savelastchar = *lastchar;
3601
3602                 /*
3603                  * Try to generate a larger string by incrementing the last byte.
3604                  */
3605                 while (*lastchar < (unsigned char) 255)
3606                 {
3607                         (*lastchar)++;
3608                         if (string_lessthan(str, workstr, datatype))
3609                         {
3610                                 /* Success! */
3611                                 Const      *workstr_const = string_to_const(workstr, datatype);
3612
3613                                 pfree(str);
3614                                 pfree(workstr);
3615                                 return workstr_const;
3616                         }
3617                 }
3618
3619                 /* restore last byte so we don't confuse pg_mbcliplen */
3620                 *lastchar = savelastchar;
3621
3622                 /*
3623                  * Truncate off the last character, which might be more than 1
3624                  * byte, depending on the character encoding.
3625                  */
3626                 if (datatype != BYTEAOID && pg_database_encoding_max_length() > 1)
3627                         len = pg_mbcliplen((const unsigned char *) workstr, len, len - 1);
3628                 else
3629                         len -= 1;
3630
3631                 if (datatype != BYTEAOID)
3632                         workstr[len] = '\0';
3633         }
3634
3635         /* Failed... */
3636         pfree(str);
3637         pfree(workstr);
3638
3639         return (Const *) NULL;
3640 }
3641
3642 /*
3643  * Test whether two strings are "<" according to the rules of the given
3644  * datatype.  We do this the hard way, ie, actually calling the type's
3645  * "<" operator function, to ensure we get the right result...
3646  */
3647 static bool
3648 string_lessthan(const char *str1, const char *str2, Oid datatype)
3649 {
3650         Datum           datum1 = string_to_datum(str1, datatype);
3651         Datum           datum2 = string_to_datum(str2, datatype);
3652         bool            result;
3653
3654         switch (datatype)
3655         {
3656                 case TEXTOID:
3657                         result = DatumGetBool(DirectFunctionCall2(text_lt,
3658                                                                                                           datum1, datum2));
3659                         break;
3660
3661                 case BPCHAROID:
3662                         result = DatumGetBool(DirectFunctionCall2(bpcharlt,
3663                                                                                                           datum1, datum2));
3664                         break;
3665
3666                 case VARCHAROID:
3667                         result = DatumGetBool(DirectFunctionCall2(varcharlt,
3668                                                                                                           datum1, datum2));
3669                         break;
3670
3671                 case NAMEOID:
3672                         result = DatumGetBool(DirectFunctionCall2(namelt,
3673                                                                                                           datum1, datum2));
3674                         break;
3675
3676                 case BYTEAOID:
3677                         result = DatumGetBool(DirectFunctionCall2(bytealt,
3678                                                                                                           datum1, datum2));
3679                         break;
3680
3681                 default:
3682                         elog(ERROR, "string_lessthan: unexpected datatype %u", datatype);
3683                         result = false;
3684                         break;
3685         }
3686
3687         pfree(DatumGetPointer(datum1));
3688         pfree(DatumGetPointer(datum2));
3689
3690         return result;
3691 }
3692
3693 /* See if there is a binary op of the given name for the given datatype */
3694 /* NB: we assume that only built-in system operators are searched for */
3695 static Oid
3696 find_operator(const char *opname, Oid datatype)
3697 {
3698         return GetSysCacheOid(OPERNAMENSP,
3699                                                   PointerGetDatum(opname),
3700                                                   ObjectIdGetDatum(datatype),
3701                                                   ObjectIdGetDatum(datatype),
3702                                                   ObjectIdGetDatum(PG_CATALOG_NAMESPACE));
3703 }
3704
3705 /*
3706  * Generate a Datum of the appropriate type from a C string.
3707  * Note that all of the supported types are pass-by-ref, so the
3708  * returned value should be pfree'd if no longer needed.
3709  */
3710 static Datum
3711 string_to_datum(const char *str, Oid datatype)
3712 {
3713         Assert(str != NULL);
3714
3715         /*
3716          * We cheat a little by assuming that textin() will do for bpchar and
3717          * varchar constants too...
3718          */
3719         if (datatype == NAMEOID)
3720                 return DirectFunctionCall1(namein, CStringGetDatum(str));
3721         else if (datatype == BYTEAOID)
3722                 return DirectFunctionCall1(byteain, CStringGetDatum(str));
3723         else
3724                 return DirectFunctionCall1(textin, CStringGetDatum(str));
3725 }
3726
3727 /*
3728  * Generate a Const node of the appropriate type from a C string.
3729  */
3730 static Const *
3731 string_to_const(const char *str, Oid datatype)
3732 {
3733         Datum           conval = string_to_datum(str, datatype);
3734
3735         return makeConst(datatype, ((datatype == NAMEOID) ? NAMEDATALEN : -1),
3736                                          conval, false, false);
3737 }
3738
3739 /*-------------------------------------------------------------------------
3740  *
3741  * Index cost estimation functions
3742  *
3743  * genericcostestimate is a general-purpose estimator for use when we
3744  * don't have any better idea about how to estimate.  Index-type-specific
3745  * knowledge can be incorporated in the type-specific routines.
3746  *
3747  *-------------------------------------------------------------------------
3748  */
3749
3750 static void
3751 genericcostestimate(Query *root, RelOptInfo *rel,
3752                                         IndexOptInfo *index, List *indexQuals,
3753                                         Cost *indexStartupCost,
3754                                         Cost *indexTotalCost,
3755                                         Selectivity *indexSelectivity,
3756                                         double *indexCorrelation)
3757 {
3758         double          numIndexTuples;
3759         double          numIndexPages;
3760         QualCost        index_qual_cost;
3761         List       *selectivityQuals = indexQuals;
3762
3763         /*
3764          * If the index is partial, AND the index predicate with the
3765          * explicitly given indexquals to produce a more accurate idea of the
3766          * index restriction.  This may produce redundant clauses, which we
3767          * hope that cnfify and clauselist_selectivity will deal with
3768          * intelligently.
3769          *
3770          * Note that index->indpred and indexQuals are both in implicit-AND form
3771          * to start with, which we have to make explicit to hand to
3772          * canonicalize_qual, and then we get back implicit-AND form again.
3773          */
3774         if (index->indpred != NIL)
3775         {
3776                 Expr       *andedQuals;
3777
3778                 andedQuals = make_ands_explicit(nconc(listCopy(index->indpred),
3779                                                                                           indexQuals));
3780                 selectivityQuals = canonicalize_qual(andedQuals, true);
3781         }
3782
3783         /* Estimate the fraction of main-table tuples that will be visited */
3784         *indexSelectivity = clauselist_selectivity(root, selectivityQuals,
3785                                                                                            lfirsti(rel->relids));
3786
3787         /*
3788          * Estimate the number of tuples that will be visited.  We do it in
3789          * this rather peculiar-looking way in order to get the right answer
3790          * for partial indexes.  We can bound the number of tuples by the
3791          * index size, in any case.
3792          */
3793         numIndexTuples = *indexSelectivity * rel->tuples;
3794
3795         if (numIndexTuples > index->tuples)
3796                 numIndexTuples = index->tuples;
3797
3798         /*
3799          * Always estimate at least one tuple is touched, even when
3800          * indexSelectivity estimate is tiny.
3801          */
3802         if (numIndexTuples < 1.0)
3803                 numIndexTuples = 1.0;
3804
3805         /*
3806          * Estimate the number of index pages that will be retrieved.
3807          *
3808          * For all currently-supported index types, the first page of the index
3809          * is a metadata page, and we should figure on fetching that plus a
3810          * pro-rated fraction of the remaining pages.
3811          */
3812         if (index->pages > 1 && index->tuples > 0)
3813         {
3814                 numIndexPages = (numIndexTuples / index->tuples) * (index->pages - 1);
3815                 numIndexPages += 1;             /* count the metapage too */
3816                 numIndexPages = ceil(numIndexPages);
3817         }
3818         else
3819                 numIndexPages = 1.0;
3820
3821         /*
3822          * Compute the index access cost.
3823          *
3824          * Our generic assumption is that the index pages will be read
3825          * sequentially, so they have cost 1.0 each, not random_page_cost.
3826          * Also, we charge for evaluation of the indexquals at each index
3827          * tuple. All the costs are assumed to be paid incrementally during
3828          * the scan.
3829          */
3830         cost_qual_eval(&index_qual_cost, indexQuals);
3831         *indexStartupCost = index_qual_cost.startup;
3832         *indexTotalCost = numIndexPages +
3833                 (cpu_index_tuple_cost + index_qual_cost.per_tuple) * numIndexTuples;
3834
3835         /*
3836          * Generic assumption about index correlation: there isn't any.
3837          */
3838         *indexCorrelation = 0.0;
3839 }
3840
3841
3842 Datum
3843 btcostestimate(PG_FUNCTION_ARGS)
3844 {
3845         Query      *root = (Query *) PG_GETARG_POINTER(0);
3846         RelOptInfo *rel = (RelOptInfo *) PG_GETARG_POINTER(1);
3847         IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(2);
3848         List       *indexQuals = (List *) PG_GETARG_POINTER(3);
3849         Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(4);
3850         Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(5);
3851         Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6);
3852         double     *indexCorrelation = (double *) PG_GETARG_POINTER(7);
3853
3854         genericcostestimate(root, rel, index, indexQuals,
3855                                                 indexStartupCost, indexTotalCost,
3856                                                 indexSelectivity, indexCorrelation);
3857
3858         /*
3859          * If it's a functional index, leave the default zero-correlation
3860          * estimate in place.  If not, and if we can get an estimate for the
3861          * first variable's ordering correlation C from pg_statistic, estimate
3862          * the index correlation as C / number-of-columns. (The idea here is
3863          * that multiple columns dilute the importance of the first column's
3864          * ordering, but don't negate it entirely.)
3865          */
3866         if (index->indproc == InvalidOid)
3867         {
3868                 Oid                     relid;
3869                 HeapTuple       tuple;
3870
3871                 relid = getrelid(lfirsti(rel->relids), root->rtable);
3872                 Assert(relid != InvalidOid);
3873                 tuple = SearchSysCache(STATRELATT,
3874                                                            ObjectIdGetDatum(relid),
3875                                                            Int16GetDatum(index->indexkeys[0]),
3876                                                            0, 0);
3877                 if (HeapTupleIsValid(tuple))
3878                 {
3879                         Oid                     typid;
3880                         int32           typmod;
3881                         float4     *numbers;
3882                         int                     nnumbers;
3883
3884                         get_atttypetypmod(relid, index->indexkeys[0],
3885                                                           &typid, &typmod);
3886                         if (get_attstatsslot(tuple, typid, typmod,
3887                                                                  STATISTIC_KIND_CORRELATION,
3888                                                                  index->ordering[0],
3889                                                                  NULL, NULL, &numbers, &nnumbers))
3890                         {
3891                                 double          varCorrelation;
3892                                 int                     nKeys;
3893
3894                                 Assert(nnumbers == 1);
3895                                 varCorrelation = numbers[0];
3896                                 for (nKeys = 1; index->indexkeys[nKeys] != 0; nKeys++)
3897                                          /* skip */ ;
3898
3899                                 *indexCorrelation = varCorrelation / nKeys;
3900
3901                                 free_attstatsslot(typid, NULL, 0, numbers, nnumbers);
3902                         }
3903                         ReleaseSysCache(tuple);
3904                 }
3905         }
3906
3907         PG_RETURN_VOID();
3908 }
3909
3910 Datum
3911 rtcostestimate(PG_FUNCTION_ARGS)
3912 {
3913         Query      *root = (Query *) PG_GETARG_POINTER(0);
3914         RelOptInfo *rel = (RelOptInfo *) PG_GETARG_POINTER(1);
3915         IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(2);
3916         List       *indexQuals = (List *) PG_GETARG_POINTER(3);
3917         Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(4);
3918         Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(5);
3919         Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6);
3920         double     *indexCorrelation = (double *) PG_GETARG_POINTER(7);
3921
3922         genericcostestimate(root, rel, index, indexQuals,
3923                                                 indexStartupCost, indexTotalCost,
3924                                                 indexSelectivity, indexCorrelation);
3925
3926         PG_RETURN_VOID();
3927 }
3928
3929 Datum
3930 hashcostestimate(PG_FUNCTION_ARGS)
3931 {
3932         Query      *root = (Query *) PG_GETARG_POINTER(0);
3933         RelOptInfo *rel = (RelOptInfo *) PG_GETARG_POINTER(1);
3934         IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(2);
3935         List       *indexQuals = (List *) PG_GETARG_POINTER(3);
3936         Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(4);
3937         Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(5);
3938         Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6);
3939         double     *indexCorrelation = (double *) PG_GETARG_POINTER(7);
3940
3941         genericcostestimate(root, rel, index, indexQuals,
3942                                                 indexStartupCost, indexTotalCost,
3943                                                 indexSelectivity, indexCorrelation);
3944
3945         PG_RETURN_VOID();
3946 }
3947
3948 Datum
3949 gistcostestimate(PG_FUNCTION_ARGS)
3950 {
3951         Query      *root = (Query *) PG_GETARG_POINTER(0);
3952         RelOptInfo *rel = (RelOptInfo *) PG_GETARG_POINTER(1);
3953         IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(2);
3954         List       *indexQuals = (List *) PG_GETARG_POINTER(3);
3955         Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(4);
3956         Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(5);
3957         Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6);
3958         double     *indexCorrelation = (double *) PG_GETARG_POINTER(7);
3959
3960         genericcostestimate(root, rel, index, indexQuals,
3961                                                 indexStartupCost, indexTotalCost,
3962                                                 indexSelectivity, indexCorrelation);
3963
3964         PG_RETURN_VOID();
3965 }