granicus.if.org Git - postgresql/blob - src/backend/utils/adt/selfuncs.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * selfuncs.c
   4  *        Selectivity functions and index cost estimation functions for
   5  *        standard operators and index access methods.
   6  *
   7  *        Selectivity routines are registered in the pg_operator catalog
   8  *        in the "oprrest" and "oprjoin" attributes.
   9  *
  10  *        Index cost functions are registered in the pg_am catalog
  11  *        in the "amcostestimate" attribute.
  12  *
  13  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
  14  * Portions Copyright (c) 1994, Regents of the University of California
  15  *
  16  *
  17  * IDENTIFICATION
  18  *        $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.187 2005/07/21 04:41:43 momjian Exp $
  19  *
  20  *-------------------------------------------------------------------------
  21  */
  22
  23 /*----------
  24  * Operator selectivity estimation functions are called to estimate the
  25  * selectivity of WHERE clauses whose top-level operator is their operator.
  26  * We divide the problem into two cases:
  27  *              Restriction clause estimation: the clause involves vars of just
  28  *                      one relation.
  29  *              Join clause estimation: the clause involves vars of multiple rels.
  30  * Join selectivity estimation is far more difficult and usually less accurate
  31  * than restriction estimation.
  32  *
  33  * When dealing with the inner scan of a nestloop join, we consider the
  34  * join's joinclauses as restriction clauses for the inner relation, and
  35  * treat vars of the outer relation as parameters (a/k/a constants of unknown
  36  * values).  So, restriction estimators need to be able to accept an argument
  37  * telling which relation is to be treated as the variable.
  38  *
  39  * The call convention for a restriction estimator (oprrest function) is
  40  *
  41  *              Selectivity oprrest (PlannerInfo *root,
  42  *                                                       Oid operator,
  43  *                                                       List *args,
  44  *                                                       int varRelid);
  45  *
  46  * root: general information about the query (rtable and RelOptInfo lists
  47  * are particularly important for the estimator).
  48  * operator: OID of the specific operator in question.
  49  * args: argument list from the operator clause.
  50  * varRelid: if not zero, the relid (rtable index) of the relation to
  51  * be treated as the variable relation.  May be zero if the args list
  52  * is known to contain vars of only one relation.
  53  *
  54  * This is represented at the SQL level (in pg_proc) as
  55  *
  56  *              float8 oprrest (internal, oid, internal, int4);
  57  *
  58  * The call convention for a join estimator (oprjoin function) is similar
  59  * except that varRelid is not needed, and instead the join type is
  60  * supplied:
  61  *
  62  *              Selectivity oprjoin (PlannerInfo *root,
  63  *                                                       Oid operator,
  64  *                                                       List *args,
  65  *                                                       JoinType jointype);
  66  *
  67  *              float8 oprjoin (internal, oid, internal, int2);
  68  *
  69  * (We deliberately make the SQL signature different to facilitate
  70  * catching errors.)
  71  *----------
  72  */
  73
  74 #include "postgres.h"
  75
  76 #include <ctype.h>
  77 #include <math.h>
  78
  79 #include "access/heapam.h"
  80 #include "access/nbtree.h"
  81 #include "access/tuptoaster.h"
  82 #include "catalog/pg_namespace.h"
  83 #include "catalog/pg_opclass.h"
  84 #include "catalog/pg_operator.h"
  85 #include "catalog/pg_proc.h"
  86 #include "catalog/pg_statistic.h"
  87 #include "catalog/pg_type.h"
  88 #include "mb/pg_wchar.h"
  89 #include "nodes/makefuncs.h"
  90 #include "optimizer/clauses.h"
  91 #include "optimizer/cost.h"
  92 #include "optimizer/pathnode.h"
  93 #include "optimizer/paths.h"
  94 #include "optimizer/plancat.h"
  95 #include "optimizer/prep.h"
  96 #include "optimizer/restrictinfo.h"
  97 #include "optimizer/tlist.h"
  98 #include "optimizer/var.h"
  99 #include "parser/parse_expr.h"
 100 #include "parser/parse_func.h"
 101 #include "parser/parse_oper.h"
 102 #include "parser/parsetree.h"
 103 #include "utils/builtins.h"
 104 #include "utils/date.h"
 105 #include "utils/datum.h"
 106 #include "utils/int8.h"
 107 #include "utils/lsyscache.h"
 108 #include "utils/nabstime.h"
 109 #include "utils/pg_locale.h"
 110 #include "utils/selfuncs.h"
 111 #include "utils/syscache.h"
 112
 113
 114 /* Return data from examine_variable and friends */
 115 typedef struct
 116 {
 117         Node       *var;                        /* the Var or expression tree */
 118         RelOptInfo *rel;                        /* Relation, or NULL if not identifiable */
 119         HeapTuple       statsTuple;             /* pg_statistic tuple, or NULL if none */
 120         /* NB: if statsTuple!=NULL, it must be freed when caller is done */
 121         Oid                     vartype;                /* exposed type of expression */
 122         Oid                     atttype;                /* type to pass to get_attstatsslot */
 123         int32           atttypmod;              /* typmod to pass to get_attstatsslot */
 124         bool            isunique;               /* true if matched to a unique index */
 125 } VariableStatData;
 126
 127 #define ReleaseVariableStats(vardata)  \
 128         do { \
 129                 if (HeapTupleIsValid((vardata).statsTuple)) \
 130                         ReleaseSysCache((vardata).statsTuple); \
 131         } while(0)
 132
 133
 134 static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
 135                                   Datum lobound, Datum hibound, Oid boundstypid,
 136                                   double *scaledlobound, double *scaledhibound);
 137 static double convert_numeric_to_scalar(Datum value, Oid typid);
 138 static void convert_string_to_scalar(unsigned char *value,
 139                                                  double *scaledvalue,
 140                                                  unsigned char *lobound,
 141                                                  double *scaledlobound,
 142                                                  unsigned char *hibound,
 143                                                  double *scaledhibound);
 144 static void convert_bytea_to_scalar(Datum value,
 145                                                 double *scaledvalue,
 146                                                 Datum lobound,
 147                                                 double *scaledlobound,
 148                                                 Datum hibound,
 149                                                 double *scaledhibound);
 150 static double convert_one_string_to_scalar(unsigned char *value,
 151                                                          int rangelo, int rangehi);
 152 static double convert_one_bytea_to_scalar(unsigned char *value, int valuelen,
 153                                                         int rangelo, int rangehi);
 154 static unsigned char *convert_string_datum(Datum value, Oid typid);
 155 static double convert_timevalue_to_scalar(Datum value, Oid typid);
 156 static bool get_restriction_variable(PlannerInfo *root, List *args, int varRelid,
 157                                                  VariableStatData *vardata, Node **other,
 158                                                  bool *varonleft);
 159 static void get_join_variables(PlannerInfo *root, List *args,
 160                                    VariableStatData *vardata1,
 161                                    VariableStatData *vardata2);
 162 static void examine_variable(PlannerInfo *root, Node *node, int varRelid,
 163                                  VariableStatData *vardata);
 164 static double get_variable_numdistinct(VariableStatData *vardata);
 165 static bool get_variable_maximum(PlannerInfo *root, VariableStatData *vardata,
 166                                          Oid sortop, Datum *max);
 167 static Selectivity prefix_selectivity(PlannerInfo *root, Node *variable,
 168                                    Oid opclass, Const *prefix);
 169 static Selectivity pattern_selectivity(Const *patt, Pattern_Type ptype);
 170 static Datum string_to_datum(const char *str, Oid datatype);
 171 static Const *string_to_const(const char *str, Oid datatype);
 172 static Const *string_to_bytea_const(const char *str, size_t str_len);
 173
 174
 175 /*
 176  *              eqsel                   - Selectivity of "=" for any data types.
 177  *
 178  * Note: this routine is also used to estimate selectivity for some
 179  * operators that are not "=" but have comparable selectivity behavior,
 180  * such as "~=" (geometric approximate-match).  Even for "=", we must
 181  * keep in mind that the left and right datatypes may differ.
 182  */
 183 Datum
 184 eqsel(PG_FUNCTION_ARGS)
 185 {
 186         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 187         Oid                     operator = PG_GETARG_OID(1);
 188         List       *args = (List *) PG_GETARG_POINTER(2);
 189         int                     varRelid = PG_GETARG_INT32(3);
 190         VariableStatData vardata;
 191         Node       *other;
 192         bool            varonleft;
 193         Datum      *values;
 194         int                     nvalues;
 195         float4     *numbers;
 196         int                     nnumbers;
 197         double          selec;
 198
 199         /*
 200          * If expression is not variable = something or something = variable,
 201          * then punt and return a default estimate.
 202          */
 203         if (!get_restriction_variable(root, args, varRelid,
 204                                                                   &vardata, &other, &varonleft))
 205                 PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
 206
 207         /*
 208          * If the something is a NULL constant, assume operator is strict and
 209          * return zero, ie, operator will never return TRUE.
 210          */
 211         if (IsA(other, Const) &&
 212                 ((Const *) other)->constisnull)
 213         {
 214                 ReleaseVariableStats(vardata);
 215                 PG_RETURN_FLOAT8(0.0);
 216         }
 217
 218         if (HeapTupleIsValid(vardata.statsTuple))
 219         {
 220                 Form_pg_statistic stats;
 221
 222                 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
 223
 224                 if (IsA(other, Const))
 225                 {
 226                         /* Variable is being compared to a known non-null constant */
 227                         Datum           constval = ((Const *) other)->constvalue;
 228                         bool            match = false;
 229                         int                     i;
 230
 231                         /*
 232                          * Is the constant "=" to any of the column's most common
 233                          * values?      (Although the given operator may not really be
 234                          * "=", we will assume that seeing whether it returns TRUE is
 235                          * an appropriate test.  If you don't like this, maybe you
 236                          * shouldn't be using eqsel for your operator...)
 237                          */
 238                         if (get_attstatsslot(vardata.statsTuple,
 239                                                                  vardata.atttype, vardata.atttypmod,
 240                                                                  STATISTIC_KIND_MCV, InvalidOid,
 241                                                                  &values, &nvalues,
 242                                                                  &numbers, &nnumbers))
 243                         {
 244                                 FmgrInfo        eqproc;
 245
 246                                 fmgr_info(get_opcode(operator), &eqproc);
 247
 248                                 for (i = 0; i < nvalues; i++)
 249                                 {
 250                                         /* be careful to apply operator right way 'round */
 251                                         if (varonleft)
 252                                                 match = DatumGetBool(FunctionCall2(&eqproc,
 253                                                                                                                    values[i],
 254                                                                                                                    constval));
 255                                         else
 256                                                 match = DatumGetBool(FunctionCall2(&eqproc,
 257                                                                                                                    constval,
 258                                                                                                                    values[i]));
 259                                         if (match)
 260                                                 break;
 261                                 }
 262                         }
 263                         else
 264                         {
 265                                 /* no most-common-value info available */
 266                                 values = NULL;
 267                                 numbers = NULL;
 268                                 i = nvalues = nnumbers = 0;
 269                         }
 270
 271                         if (match)
 272                         {
 273                                 /*
 274                                  * Constant is "=" to this common value.  We know
 275                                  * selectivity exactly (or as exactly as VACUUM could
 276                                  * calculate it, anyway).
 277                                  */
 278                                 selec = numbers[i];
 279                         }
 280                         else
 281                         {
 282                                 /*
 283                                  * Comparison is against a constant that is neither NULL
 284                                  * nor any of the common values.  Its selectivity cannot
 285                                  * be more than this:
 286                                  */
 287                                 double          sumcommon = 0.0;
 288                                 double          otherdistinct;
 289
 290                                 for (i = 0; i < nnumbers; i++)
 291                                         sumcommon += numbers[i];
 292                                 selec = 1.0 - sumcommon - stats->stanullfrac;
 293                                 CLAMP_PROBABILITY(selec);
 294
 295                                 /*
 296                                  * and in fact it's probably a good deal less. We
 297                                  * approximate that all the not-common values share this
 298                                  * remaining fraction equally, so we divide by the number
 299                                  * of other distinct values.
 300                                  */
 301                                 otherdistinct = get_variable_numdistinct(&vardata)
 302                                         - nnumbers;
 303                                 if (otherdistinct > 1)
 304                                         selec /= otherdistinct;
 305
 306                                 /*
 307                                  * Another cross-check: selectivity shouldn't be estimated
 308                                  * as more than the least common "most common value".
 309                                  */
 310                                 if (nnumbers > 0 && selec > numbers[nnumbers - 1])
 311                                         selec = numbers[nnumbers - 1];
 312                         }
 313
 314                         free_attstatsslot(vardata.atttype, values, nvalues,
 315                                                           numbers, nnumbers);
 316                 }
 317                 else
 318                 {
 319                         double          ndistinct;
 320
 321                         /*
 322                          * Search is for a value that we do not know a priori, but we
 323                          * will assume it is not NULL.  Estimate the selectivity as
 324                          * non-null fraction divided by number of distinct values, so
 325                          * that we get a result averaged over all possible values
 326                          * whether common or uncommon.  (Essentially, we are assuming
 327                          * that the not-yet-known comparison value is equally likely
 328                          * to be any of the possible values, regardless of their
 329                          * frequency in the table.      Is that a good idea?)
 330                          */
 331                         selec = 1.0 - stats->stanullfrac;
 332                         ndistinct = get_variable_numdistinct(&vardata);
 333                         if (ndistinct > 1)
 334                                 selec /= ndistinct;
 335
 336                         /*
 337                          * Cross-check: selectivity should never be estimated as more
 338                          * than the most common value's.
 339                          */
 340                         if (get_attstatsslot(vardata.statsTuple,
 341                                                                  vardata.atttype, vardata.atttypmod,
 342                                                                  STATISTIC_KIND_MCV, InvalidOid,
 343                                                                  NULL, NULL,
 344                                                                  &numbers, &nnumbers))
 345                         {
 346                                 if (nnumbers > 0 && selec > numbers[0])
 347                                         selec = numbers[0];
 348                                 free_attstatsslot(vardata.atttype, NULL, 0, numbers, nnumbers);
 349                         }
 350                 }
 351         }
 352         else
 353         {
 354                 /*
 355                  * No VACUUM ANALYZE stats available, so make a guess using
 356                  * estimated number of distinct values and assuming they are
 357                  * equally common.      (The guess is unlikely to be very good, but we
 358                  * do know a few special cases.)
 359                  */
 360                 selec = 1.0 / get_variable_numdistinct(&vardata);
 361         }
 362
 363         ReleaseVariableStats(vardata);
 364
 365         /* result should be in range, but make sure... */
 366         CLAMP_PROBABILITY(selec);
 367
 368         PG_RETURN_FLOAT8((float8) selec);
 369 }
 370
 371 /*
 372  *              neqsel                  - Selectivity of "!=" for any data types.
 373  *
 374  * This routine is also used for some operators that are not "!="
 375  * but have comparable selectivity behavior.  See above comments
 376  * for eqsel().
 377  */
 378 Datum
 379 neqsel(PG_FUNCTION_ARGS)
 380 {
 381         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 382         Oid                     operator = PG_GETARG_OID(1);
 383         List       *args = (List *) PG_GETARG_POINTER(2);
 384         int                     varRelid = PG_GETARG_INT32(3);
 385         Oid                     eqop;
 386         float8          result;
 387
 388         /*
 389          * We want 1 - eqsel() where the equality operator is the one
 390          * associated with this != operator, that is, its negator.
 391          */
 392         eqop = get_negator(operator);
 393         if (eqop)
 394         {
 395                 result = DatumGetFloat8(DirectFunctionCall4(eqsel,
 396                                                                                                         PointerGetDatum(root),
 397                                                                                                   ObjectIdGetDatum(eqop),
 398                                                                                                         PointerGetDatum(args),
 399                                                                                            Int32GetDatum(varRelid)));
 400         }
 401         else
 402         {
 403                 /* Use default selectivity (should we raise an error instead?) */
 404                 result = DEFAULT_EQ_SEL;
 405         }
 406         result = 1.0 - result;
 407         PG_RETURN_FLOAT8(result);
 408 }
 409
 410 /*
 411  *      scalarineqsel           - Selectivity of "<", "<=", ">", ">=" for scalars.
 412  *
 413  * This is the guts of both scalarltsel and scalargtsel.  The caller has
 414  * commuted the clause, if necessary, so that we can treat the variable as
 415  * being on the left.  The caller must also make sure that the other side
 416  * of the clause is a non-null Const, and dissect same into a value and
 417  * datatype.
 418  *
 419  * This routine works for any datatype (or pair of datatypes) known to
 420  * convert_to_scalar().  If it is applied to some other datatype,
 421  * it will return a default estimate.
 422  */
 423 static double
 424 scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
 425                           VariableStatData *vardata, Datum constval, Oid consttype)
 426 {
 427         Form_pg_statistic stats;
 428         FmgrInfo        opproc;
 429         Datum      *values;
 430         int                     nvalues;
 431         float4     *numbers;
 432         int                     nnumbers;
 433         double          mcv_selec,
 434                                 hist_selec,
 435                                 sumcommon;
 436         double          selec;
 437         int                     i;
 438
 439         if (!HeapTupleIsValid(vardata->statsTuple))
 440         {
 441                 /* no stats available, so default result */
 442                 return DEFAULT_INEQ_SEL;
 443         }
 444         stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
 445
 446         fmgr_info(get_opcode(operator), &opproc);
 447
 448         /*
 449          * If we have most-common-values info, add up the fractions of the MCV
 450          * entries that satisfy MCV OP CONST.  These fractions contribute
 451          * directly to the result selectivity.  Also add up the total fraction
 452          * represented by MCV entries.
 453          */
 454         mcv_selec = 0.0;
 455         sumcommon = 0.0;
 456
 457         if (get_attstatsslot(vardata->statsTuple,
 458                                                  vardata->atttype, vardata->atttypmod,
 459                                                  STATISTIC_KIND_MCV, InvalidOid,
 460                                                  &values, &nvalues,
 461                                                  &numbers, &nnumbers))
 462         {
 463                 for (i = 0; i < nvalues; i++)
 464                 {
 465                         if (DatumGetBool(FunctionCall2(&opproc,
 466                                                                                    values[i],
 467                                                                                    constval)))
 468                                 mcv_selec += numbers[i];
 469                         sumcommon += numbers[i];
 470                 }
 471                 free_attstatsslot(vardata->atttype, values, nvalues,
 472                                                   numbers, nnumbers);
 473         }
 474
 475         /*
 476          * If there is a histogram, determine which bin the constant falls in,
 477          * and compute the resulting contribution to selectivity.
 478          *
 479          * Someday, VACUUM might store more than one histogram per rel/att,
 480          * corresponding to more than one possible sort ordering defined for
 481          * the column type.  However, to make that work we will need to figure
 482          * out which staop to search for --- it's not necessarily the one we
 483          * have at hand!  (For example, we might have a '<=' operator rather
 484          * than the '<' operator that will appear in staop.)  For now, assume
 485          * that whatever appears in pg_statistic is sorted the same way our
 486          * operator sorts, or the reverse way if isgt is TRUE.
 487          */
 488         hist_selec = 0.0;
 489
 490         if (get_attstatsslot(vardata->statsTuple,
 491                                                  vardata->atttype, vardata->atttypmod,
 492                                                  STATISTIC_KIND_HISTOGRAM, InvalidOid,
 493                                                  &values, &nvalues,
 494                                                  NULL, NULL))
 495         {
 496                 if (nvalues > 1)
 497                 {
 498                         double          histfrac;
 499                         bool            ltcmp;
 500
 501                         ltcmp = DatumGetBool(FunctionCall2(&opproc,
 502                                                                                            values[0],
 503                                                                                            constval));
 504                         if (isgt)
 505                                 ltcmp = !ltcmp;
 506                         if (!ltcmp)
 507                         {
 508                                 /* Constant is below lower histogram boundary. */
 509                                 histfrac = 0.0;
 510                         }
 511                         else
 512                         {
 513                                 /*
 514                                  * Scan to find proper location.  This could be made
 515                                  * faster by using a binary-search method, but it's
 516                                  * probably not worth the trouble for typical histogram
 517                                  * sizes.
 518                                  */
 519                                 for (i = 1; i < nvalues; i++)
 520                                 {
 521                                         ltcmp = DatumGetBool(FunctionCall2(&opproc,
 522                                                                                                            values[i],
 523                                                                                                            constval));
 524                                         if (isgt)
 525                                                 ltcmp = !ltcmp;
 526                                         if (!ltcmp)
 527                                                 break;
 528                                 }
 529                                 if (i >= nvalues)
 530                                 {
 531                                         /* Constant is above upper histogram boundary. */
 532                                         histfrac = 1.0;
 533                                 }
 534                                 else
 535                                 {
 536                                         double          val,
 537                                                                 high,
 538                                                                 low;
 539                                         double          binfrac;
 540
 541                                         /*
 542                                          * We have values[i-1] < constant < values[i].
 543                                          *
 544                                          * Convert the constant and the two nearest bin boundary
 545                                          * values to a uniform comparison scale, and do a
 546                                          * linear interpolation within this bin.
 547                                          */
 548                                         if (convert_to_scalar(constval, consttype, &val,
 549                                                                                   values[i - 1], values[i],
 550                                                                                   vardata->vartype,
 551                                                                                   &low, &high))
 552                                         {
 553                                                 if (high <= low)
 554                                                 {
 555                                                         /* cope if bin boundaries appear identical */
 556                                                         binfrac = 0.5;
 557                                                 }
 558                                                 else if (val <= low)
 559                                                         binfrac = 0.0;
 560                                                 else if (val >= high)
 561                                                         binfrac = 1.0;
 562                                                 else
 563                                                 {
 564                                                         binfrac = (val - low) / (high - low);
 565
 566                                                         /*
 567                                                          * Watch out for the possibility that we got a
 568                                                          * NaN or Infinity from the division.  This
 569                                                          * can happen despite the previous checks, if
 570                                                          * for example "low" is -Infinity.
 571                                                          */
 572                                                         if (isnan(binfrac) ||
 573                                                                 binfrac < 0.0 || binfrac > 1.0)
 574                                                                 binfrac = 0.5;
 575                                                 }
 576                                         }
 577                                         else
 578                                         {
 579                                                 /*
 580                                                  * Ideally we'd produce an error here, on the
 581                                                  * grounds that the given operator shouldn't have
 582                                                  * scalarXXsel registered as its selectivity func
 583                                                  * unless we can deal with its operand types.  But
 584                                                  * currently, all manner of stuff is invoking
 585                                                  * scalarXXsel, so give a default estimate until
 586                                                  * that can be fixed.
 587                                                  */
 588                                                 binfrac = 0.5;
 589                                         }
 590
 591                                         /*
 592                                          * Now, compute the overall selectivity across the
 593                                          * values represented by the histogram.  We have i-1
 594                                          * full bins and binfrac partial bin below the
 595                                          * constant.
 596                                          */
 597                                         histfrac = (double) (i - 1) + binfrac;
 598                                         histfrac /= (double) (nvalues - 1);
 599                                 }
 600                         }
 601
 602                         /*
 603                          * Now histfrac = fraction of histogram entries below the
 604                          * constant.
 605                          *
 606                          * Account for "<" vs ">"
 607                          */
 608                         hist_selec = isgt ? (1.0 - histfrac) : histfrac;
 609
 610                         /*
 611                          * The histogram boundaries are only approximate to begin
 612                          * with, and may well be out of date anyway.  Therefore, don't
 613                          * believe extremely small or large selectivity estimates.
 614                          */
 615                         if (hist_selec < 0.0001)
 616                                 hist_selec = 0.0001;
 617                         else if (hist_selec > 0.9999)
 618                                 hist_selec = 0.9999;
 619                 }
 620
 621                 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
 622         }
 623
 624         /*
 625          * Now merge the results from the MCV and histogram calculations,
 626          * realizing that the histogram covers only the non-null values that
 627          * are not listed in MCV.
 628          */
 629         selec = 1.0 - stats->stanullfrac - sumcommon;
 630
 631         if (hist_selec > 0.0)
 632                 selec *= hist_selec;
 633         else
 634         {
 635                 /*
 636                  * If no histogram but there are values not accounted for by MCV,
 637                  * arbitrarily assume half of them will match.
 638                  */
 639                 selec *= 0.5;
 640         }
 641
 642         selec += mcv_selec;
 643
 644         /* result should be in range, but make sure... */
 645         CLAMP_PROBABILITY(selec);
 646
 647         return selec;
 648 }
 649
 650 /*
 651  *              scalarltsel             - Selectivity of "<" (also "<=") for scalars.
 652  */
 653 Datum
 654 scalarltsel(PG_FUNCTION_ARGS)
 655 {
 656         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 657         Oid                     operator = PG_GETARG_OID(1);
 658         List       *args = (List *) PG_GETARG_POINTER(2);
 659         int                     varRelid = PG_GETARG_INT32(3);
 660         VariableStatData vardata;
 661         Node       *other;
 662         bool            varonleft;
 663         Datum           constval;
 664         Oid                     consttype;
 665         bool            isgt;
 666         double          selec;
 667
 668         /*
 669          * If expression is not variable op something or something op
 670          * variable, then punt and return a default estimate.
 671          */
 672         if (!get_restriction_variable(root, args, varRelid,
 673                                                                   &vardata, &other, &varonleft))
 674                 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 675
 676         /*
 677          * Can't do anything useful if the something is not a constant,
 678          * either.
 679          */
 680         if (!IsA(other, Const))
 681         {
 682                 ReleaseVariableStats(vardata);
 683                 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 684         }
 685
 686         /*
 687          * If the constant is NULL, assume operator is strict and return zero,
 688          * ie, operator will never return TRUE.
 689          */
 690         if (((Const *) other)->constisnull)
 691         {
 692                 ReleaseVariableStats(vardata);
 693                 PG_RETURN_FLOAT8(0.0);
 694         }
 695         constval = ((Const *) other)->constvalue;
 696         consttype = ((Const *) other)->consttype;
 697
 698         /*
 699          * Force the var to be on the left to simplify logic in scalarineqsel.
 700          */
 701         if (varonleft)
 702         {
 703                 /* we have var < other */
 704                 isgt = false;
 705         }
 706         else
 707         {
 708                 /* we have other < var, commute to make var > other */
 709                 operator = get_commutator(operator);
 710                 if (!operator)
 711                 {
 712                         /* Use default selectivity (should we raise an error instead?) */
 713                         ReleaseVariableStats(vardata);
 714                         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 715                 }
 716                 isgt = true;
 717         }
 718
 719         selec = scalarineqsel(root, operator, isgt, &vardata, constval, consttype);
 720
 721         ReleaseVariableStats(vardata);
 722
 723         PG_RETURN_FLOAT8((float8) selec);
 724 }
 725
 726 /*
 727  *              scalargtsel             - Selectivity of ">" (also ">=") for integers.
 728  */
 729 Datum
 730 scalargtsel(PG_FUNCTION_ARGS)
 731 {
 732         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 733         Oid                     operator = PG_GETARG_OID(1);
 734         List       *args = (List *) PG_GETARG_POINTER(2);
 735         int                     varRelid = PG_GETARG_INT32(3);
 736         VariableStatData vardata;
 737         Node       *other;
 738         bool            varonleft;
 739         Datum           constval;
 740         Oid                     consttype;
 741         bool            isgt;
 742         double          selec;
 743
 744         /*
 745          * If expression is not variable op something or something op
 746          * variable, then punt and return a default estimate.
 747          */
 748         if (!get_restriction_variable(root, args, varRelid,
 749                                                                   &vardata, &other, &varonleft))
 750                 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 751
 752         /*
 753          * Can't do anything useful if the something is not a constant,
 754          * either.
 755          */
 756         if (!IsA(other, Const))
 757         {
 758                 ReleaseVariableStats(vardata);
 759                 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 760         }
 761
 762         /*
 763          * If the constant is NULL, assume operator is strict and return zero,
 764          * ie, operator will never return TRUE.
 765          */
 766         if (((Const *) other)->constisnull)
 767         {
 768                 ReleaseVariableStats(vardata);
 769                 PG_RETURN_FLOAT8(0.0);
 770         }
 771         constval = ((Const *) other)->constvalue;
 772         consttype = ((Const *) other)->consttype;
 773
 774         /*
 775          * Force the var to be on the left to simplify logic in scalarineqsel.
 776          */
 777         if (varonleft)
 778         {
 779                 /* we have var > other */
 780                 isgt = true;
 781         }
 782         else
 783         {
 784                 /* we have other > var, commute to make var < other */
 785                 operator = get_commutator(operator);
 786                 if (!operator)
 787                 {
 788                         /* Use default selectivity (should we raise an error instead?) */
 789                         ReleaseVariableStats(vardata);
 790                         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 791                 }
 792                 isgt = false;
 793         }
 794
 795         selec = scalarineqsel(root, operator, isgt, &vardata, constval, consttype);
 796
 797         ReleaseVariableStats(vardata);
 798
 799         PG_RETURN_FLOAT8((float8) selec);
 800 }
 801
 802 /*
 803  * patternsel                   - Generic code for pattern-match selectivity.
 804  */
 805 static double
 806 patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
 807 {
 808         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 809
 810 #ifdef NOT_USED
 811         Oid                     operator = PG_GETARG_OID(1);
 812 #endif
 813         List       *args = (List *) PG_GETARG_POINTER(2);
 814         int                     varRelid = PG_GETARG_INT32(3);
 815         VariableStatData vardata;
 816         Node       *variable;
 817         Node       *other;
 818         bool            varonleft;
 819         Datum           constval;
 820         Oid                     consttype;
 821         Oid                     vartype;
 822         Oid                     opclass;
 823         Pattern_Prefix_Status pstatus;
 824         Const      *patt = NULL;
 825         Const      *prefix = NULL;
 826         Const      *rest = NULL;
 827         double          result;
 828
 829         /*
 830          * If expression is not variable op constant, then punt and return a
 831          * default estimate.
 832          */
 833         if (!get_restriction_variable(root, args, varRelid,
 834                                                                   &vardata, &other, &varonleft))
 835                 return DEFAULT_MATCH_SEL;
 836         if (!varonleft || !IsA(other, Const))
 837         {
 838                 ReleaseVariableStats(vardata);
 839                 return DEFAULT_MATCH_SEL;
 840         }
 841         variable = (Node *) linitial(args);
 842
 843         /*
 844          * If the constant is NULL, assume operator is strict and return zero,
 845          * ie, operator will never return TRUE.
 846          */
 847         if (((Const *) other)->constisnull)
 848         {
 849                 ReleaseVariableStats(vardata);
 850                 return 0.0;
 851         }
 852         constval = ((Const *) other)->constvalue;
 853         consttype = ((Const *) other)->consttype;
 854
 855         /*
 856          * The right-hand const is type text or bytea for all supported
 857          * operators.  We do not expect to see binary-compatible types here,
 858          * since const-folding should have relabeled the const to exactly
 859          * match the operator's declared type.
 860          */
 861         if (consttype != TEXTOID && consttype != BYTEAOID)
 862         {
 863                 ReleaseVariableStats(vardata);
 864                 return DEFAULT_MATCH_SEL;
 865         }
 866
 867         /*
 868          * Similarly, the exposed type of the left-hand side should be one
 869          * of those we know.  (Do not look at vardata.atttype, which might be
 870          * something binary-compatible but different.)  We can use it to choose
 871          * the index opclass from which we must draw the comparison operators.
 872          *
 873          * NOTE: It would be more correct to use the PATTERN opclasses than the
 874          * simple ones, but at the moment ANALYZE will not generate statistics
 875          * for the PATTERN operators.  But our results are so approximate
 876          * anyway that it probably hardly matters.
 877          */
 878         vartype = vardata.vartype;
 879
 880         switch (vartype)
 881         {
 882                 case TEXTOID:
 883                         opclass = TEXT_BTREE_OPS_OID;
 884                         break;
 885                 case VARCHAROID:
 886                         opclass = VARCHAR_BTREE_OPS_OID;
 887                         break;
 888                 case BPCHAROID:
 889                         opclass = BPCHAR_BTREE_OPS_OID;
 890                         break;
 891                 case NAMEOID:
 892                         opclass = NAME_BTREE_OPS_OID;
 893                         break;
 894                 case BYTEAOID:
 895                         opclass = BYTEA_BTREE_OPS_OID;
 896                         break;
 897                 default:
 898                         ReleaseVariableStats(vardata);
 899                         return DEFAULT_MATCH_SEL;
 900         }
 901
 902         /* divide pattern into fixed prefix and remainder */
 903         patt = (Const *) other;
 904         pstatus = pattern_fixed_prefix(patt, ptype, &prefix, &rest);
 905
 906         /*
 907          * If necessary, coerce the prefix constant to the right type. (The
 908          * "rest" constant need not be changed.)
 909          */
 910         if (prefix && prefix->consttype != vartype)
 911         {
 912                 char       *prefixstr;
 913
 914                 switch (prefix->consttype)
 915                 {
 916                         case TEXTOID:
 917                                 prefixstr = DatumGetCString(DirectFunctionCall1(textout,
 918                                                                                                         prefix->constvalue));
 919                                 break;
 920                         case BYTEAOID:
 921                                 prefixstr = DatumGetCString(DirectFunctionCall1(byteaout,
 922                                                                                                         prefix->constvalue));
 923                                 break;
 924                         default:
 925                                 elog(ERROR, "unrecognized consttype: %u",
 926                                          prefix->consttype);
 927                                 ReleaseVariableStats(vardata);
 928                                 return DEFAULT_MATCH_SEL;
 929                 }
 930                 prefix = string_to_const(prefixstr, vartype);
 931                 pfree(prefixstr);
 932         }
 933
 934         if (pstatus == Pattern_Prefix_Exact)
 935         {
 936                 /*
 937                  * Pattern specifies an exact match, so pretend operator is '='
 938                  */
 939                 Oid                     eqopr = get_opclass_member(opclass, InvalidOid,
 940                                                                                            BTEqualStrategyNumber);
 941                 List       *eqargs;
 942
 943                 if (eqopr == InvalidOid)
 944                         elog(ERROR, "no = operator for opclass %u", opclass);
 945                 eqargs = list_make2(variable, prefix);
 946                 result = DatumGetFloat8(DirectFunctionCall4(eqsel,
 947                                                                                                         PointerGetDatum(root),
 948                                                                                                  ObjectIdGetDatum(eqopr),
 949                                                                                                  PointerGetDatum(eqargs),
 950                                                                                            Int32GetDatum(varRelid)));
 951         }
 952         else
 953         {
 954                 /*
 955                  * Not exact-match pattern.  We estimate selectivity of the fixed
 956                  * prefix and remainder of pattern separately, then combine the
 957                  * two.
 958                  */
 959                 Selectivity prefixsel;
 960                 Selectivity restsel;
 961                 Selectivity selec;
 962
 963                 if (pstatus == Pattern_Prefix_Partial)
 964                         prefixsel = prefix_selectivity(root, variable, opclass, prefix);
 965                 else
 966                         prefixsel = 1.0;
 967                 restsel = pattern_selectivity(rest, ptype);
 968                 selec = prefixsel * restsel;
 969                 /* result should be in range, but make sure... */
 970                 CLAMP_PROBABILITY(selec);
 971                 result = selec;
 972         }
 973
 974         if (prefix)
 975         {
 976                 pfree(DatumGetPointer(prefix->constvalue));
 977                 pfree(prefix);
 978         }
 979
 980         ReleaseVariableStats(vardata);
 981
 982         return result;
 983 }
 984
 985 /*
 986  *              regexeqsel              - Selectivity of regular-expression pattern match.
 987  */
 988 Datum
 989 regexeqsel(PG_FUNCTION_ARGS)
 990 {
 991         PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex));
 992 }
 993
 994 /*
 995  *              icregexeqsel    - Selectivity of case-insensitive regex match.
 996  */
 997 Datum
 998 icregexeqsel(PG_FUNCTION_ARGS)
 999 {
1000         PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC));
1001 }
1002
1003 /*
1004  *              likesel                 - Selectivity of LIKE pattern match.
1005  */
1006 Datum
1007 likesel(PG_FUNCTION_ARGS)
1008 {
1009         PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like));
1010 }
1011
1012 /*
1013  *              iclikesel                       - Selectivity of ILIKE pattern match.
1014  */
1015 Datum
1016 iclikesel(PG_FUNCTION_ARGS)
1017 {
1018         PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC));
1019 }
1020
1021 /*
1022  *              regexnesel              - Selectivity of regular-expression pattern non-match.
1023  */
1024 Datum
1025 regexnesel(PG_FUNCTION_ARGS)
1026 {
1027         double          result;
1028
1029         result = patternsel(fcinfo, Pattern_Type_Regex);
1030         result = 1.0 - result;
1031         PG_RETURN_FLOAT8(result);
1032 }
1033
1034 /*
1035  *              icregexnesel    - Selectivity of case-insensitive regex non-match.
1036  */
1037 Datum
1038 icregexnesel(PG_FUNCTION_ARGS)
1039 {
1040         double          result;
1041
1042         result = patternsel(fcinfo, Pattern_Type_Regex_IC);
1043         result = 1.0 - result;
1044         PG_RETURN_FLOAT8(result);
1045 }
1046
1047 /*
1048  *              nlikesel                - Selectivity of LIKE pattern non-match.
1049  */
1050 Datum
1051 nlikesel(PG_FUNCTION_ARGS)
1052 {
1053         double          result;
1054
1055         result = patternsel(fcinfo, Pattern_Type_Like);
1056         result = 1.0 - result;
1057         PG_RETURN_FLOAT8(result);
1058 }
1059
1060 /*
1061  *              icnlikesel              - Selectivity of ILIKE pattern non-match.
1062  */
1063 Datum
1064 icnlikesel(PG_FUNCTION_ARGS)
1065 {
1066         double          result;
1067
1068         result = patternsel(fcinfo, Pattern_Type_Like_IC);
1069         result = 1.0 - result;
1070         PG_RETURN_FLOAT8(result);
1071 }
1072
1073 /*
1074  *              booltestsel             - Selectivity of BooleanTest Node.
1075  */
1076 Selectivity
1077 booltestsel(PlannerInfo *root, BoolTestType booltesttype, Node *arg,
1078                         int varRelid, JoinType jointype)
1079 {
1080         VariableStatData vardata;
1081         double          selec;
1082
1083         examine_variable(root, arg, varRelid, &vardata);
1084
1085         if (HeapTupleIsValid(vardata.statsTuple))
1086         {
1087                 Form_pg_statistic stats;
1088                 double          freq_null;
1089                 Datum      *values;
1090                 int                     nvalues;
1091                 float4     *numbers;
1092                 int                     nnumbers;
1093
1094                 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
1095                 freq_null = stats->stanullfrac;
1096
1097                 if (get_attstatsslot(vardata.statsTuple,
1098                                                          vardata.atttype, vardata.atttypmod,
1099                                                          STATISTIC_KIND_MCV, InvalidOid,
1100                                                          &values, &nvalues,
1101                                                          &numbers, &nnumbers)
1102                         && nnumbers > 0)
1103                 {
1104                         double          freq_true;
1105                         double          freq_false;
1106
1107                         /*
1108                          * Get first MCV frequency and derive frequency for true.
1109                          */
1110                         if (DatumGetBool(values[0]))
1111                                 freq_true = numbers[0];
1112                         else
1113                                 freq_true = 1.0 - numbers[0] - freq_null;
1114
1115                         /*
1116                          * Next derive frequency for false. Then use these as
1117                          * appropriate to derive frequency for each case.
1118                          */
1119                         freq_false = 1.0 - freq_true - freq_null;
1120
1121                         switch (booltesttype)
1122                         {
1123                                 case IS_UNKNOWN:
1124                                         /* select only NULL values */
1125                                         selec = freq_null;
1126                                         break;
1127                                 case IS_NOT_UNKNOWN:
1128                                         /* select non-NULL values */
1129                                         selec = 1.0 - freq_null;
1130                                         break;
1131                                 case IS_TRUE:
1132                                         /* select only TRUE values */
1133                                         selec = freq_true;
1134                                         break;
1135                                 case IS_NOT_TRUE:
1136                                         /* select non-TRUE values */
1137                                         selec = 1.0 - freq_true;
1138                                         break;
1139                                 case IS_FALSE:
1140                                         /* select only FALSE values */
1141                                         selec = freq_false;
1142                                         break;
1143                                 case IS_NOT_FALSE:
1144                                         /* select non-FALSE values */
1145                                         selec = 1.0 - freq_false;
1146                                         break;
1147                                 default:
1148                                         elog(ERROR, "unrecognized booltesttype: %d",
1149                                                  (int) booltesttype);
1150                                         selec = 0.0;    /* Keep compiler quiet */
1151                                         break;
1152                         }
1153
1154                         free_attstatsslot(vardata.atttype, values, nvalues,
1155                                                           numbers, nnumbers);
1156                 }
1157                 else
1158                 {
1159                         /*
1160                          * No most-common-value info available. Still have null
1161                          * fraction information, so use it for IS [NOT] UNKNOWN.
1162                          * Otherwise adjust for null fraction and assume an even split
1163                          * for boolean tests.
1164                          */
1165                         switch (booltesttype)
1166                         {
1167                                 case IS_UNKNOWN:
1168
1169                                         /*
1170                                          * Use freq_null directly.
1171                                          */
1172                                         selec = freq_null;
1173                                         break;
1174                                 case IS_NOT_UNKNOWN:
1175
1176                                         /*
1177                                          * Select not unknown (not null) values. Calculate
1178                                          * from freq_null.
1179                                          */
1180                                         selec = 1.0 - freq_null;
1181                                         break;
1182                                 case IS_TRUE:
1183                                 case IS_NOT_TRUE:
1184                                 case IS_FALSE:
1185                                 case IS_NOT_FALSE:
1186                                         selec = (1.0 - freq_null) / 2.0;
1187                                         break;
1188                                 default:
1189                                         elog(ERROR, "unrecognized booltesttype: %d",
1190                                                  (int) booltesttype);
1191                                         selec = 0.0;    /* Keep compiler quiet */
1192                                         break;
1193                         }
1194                 }
1195         }
1196         else
1197         {
1198                 /*
1199                  * If we can't get variable statistics for the argument, perhaps
1200                  * clause_selectivity can do something with it.  We ignore the
1201                  * possibility of a NULL value when using clause_selectivity, and
1202                  * just assume the value is either TRUE or FALSE.
1203                  */
1204                 switch (booltesttype)
1205                 {
1206                         case IS_UNKNOWN:
1207                                 selec = DEFAULT_UNK_SEL;
1208                                 break;
1209                         case IS_NOT_UNKNOWN:
1210                                 selec = DEFAULT_NOT_UNK_SEL;
1211                                 break;
1212                         case IS_TRUE:
1213                         case IS_NOT_FALSE:
1214                                 selec = (double) clause_selectivity(root, arg,
1215                                                                                                         varRelid, jointype);
1216                                 break;
1217                         case IS_FALSE:
1218                         case IS_NOT_TRUE:
1219                                 selec = 1.0 - (double) clause_selectivity(root, arg,
1220                                                                                                          varRelid, jointype);
1221                                 break;
1222                         default:
1223                                 elog(ERROR, "unrecognized booltesttype: %d",
1224                                          (int) booltesttype);
1225                                 selec = 0.0;    /* Keep compiler quiet */
1226                                 break;
1227                 }
1228         }
1229
1230         ReleaseVariableStats(vardata);
1231
1232         /* result should be in range, but make sure... */
1233         CLAMP_PROBABILITY(selec);
1234
1235         return (Selectivity) selec;
1236 }
1237
1238 /*
1239  *              nulltestsel             - Selectivity of NullTest Node.
1240  */
1241 Selectivity
1242 nulltestsel(PlannerInfo *root, NullTestType nulltesttype,
1243                         Node *arg, int varRelid)
1244 {
1245         VariableStatData vardata;
1246         double          selec;
1247
1248         examine_variable(root, arg, varRelid, &vardata);
1249
1250         if (HeapTupleIsValid(vardata.statsTuple))
1251         {
1252                 Form_pg_statistic stats;
1253                 double          freq_null;
1254
1255                 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
1256                 freq_null = stats->stanullfrac;
1257
1258                 switch (nulltesttype)
1259                 {
1260                         case IS_NULL:
1261
1262                                 /*
1263                                  * Use freq_null directly.
1264                                  */
1265                                 selec = freq_null;
1266                                 break;
1267                         case IS_NOT_NULL:
1268
1269                                 /*
1270                                  * Select not unknown (not null) values. Calculate from
1271                                  * freq_null.
1272                                  */
1273                                 selec = 1.0 - freq_null;
1274                                 break;
1275                         default:
1276                                 elog(ERROR, "unrecognized nulltesttype: %d",
1277                                          (int) nulltesttype);
1278                                 return (Selectivity) 0; /* keep compiler quiet */
1279                 }
1280         }
1281         else
1282         {
1283                 /*
1284                  * No VACUUM ANALYZE stats available, so make a guess
1285                  */
1286                 switch (nulltesttype)
1287                 {
1288                         case IS_NULL:
1289                                 selec = DEFAULT_UNK_SEL;
1290                                 break;
1291                         case IS_NOT_NULL:
1292                                 selec = DEFAULT_NOT_UNK_SEL;
1293                                 break;
1294                         default:
1295                                 elog(ERROR, "unrecognized nulltesttype: %d",
1296                                          (int) nulltesttype);
1297                                 return (Selectivity) 0; /* keep compiler quiet */
1298                 }
1299         }
1300
1301         ReleaseVariableStats(vardata);
1302
1303         /* result should be in range, but make sure... */
1304         CLAMP_PROBABILITY(selec);
1305
1306         return (Selectivity) selec;
1307 }
1308
1309 /*
1310  *              eqjoinsel               - Join selectivity of "="
1311  */
1312 Datum
1313 eqjoinsel(PG_FUNCTION_ARGS)
1314 {
1315         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
1316         Oid                     operator = PG_GETARG_OID(1);
1317         List       *args = (List *) PG_GETARG_POINTER(2);
1318         JoinType        jointype = (JoinType) PG_GETARG_INT16(3);
1319         double          selec;
1320         VariableStatData vardata1;
1321         VariableStatData vardata2;
1322         double          nd1;
1323         double          nd2;
1324         Form_pg_statistic stats1 = NULL;
1325         Form_pg_statistic stats2 = NULL;
1326         bool            have_mcvs1 = false;
1327         Datum      *values1 = NULL;
1328         int                     nvalues1 = 0;
1329         float4     *numbers1 = NULL;
1330         int                     nnumbers1 = 0;
1331         bool            have_mcvs2 = false;
1332         Datum      *values2 = NULL;
1333         int                     nvalues2 = 0;
1334         float4     *numbers2 = NULL;
1335         int                     nnumbers2 = 0;
1336
1337         get_join_variables(root, args, &vardata1, &vardata2);
1338
1339         nd1 = get_variable_numdistinct(&vardata1);
1340         nd2 = get_variable_numdistinct(&vardata2);
1341
1342         if (HeapTupleIsValid(vardata1.statsTuple))
1343         {
1344                 stats1 = (Form_pg_statistic) GETSTRUCT(vardata1.statsTuple);
1345                 have_mcvs1 = get_attstatsslot(vardata1.statsTuple,
1346                                                                           vardata1.atttype,
1347                                                                           vardata1.atttypmod,
1348                                                                           STATISTIC_KIND_MCV,
1349                                                                           InvalidOid,
1350                                                                           &values1, &nvalues1,
1351                                                                           &numbers1, &nnumbers1);
1352         }
1353
1354         if (HeapTupleIsValid(vardata2.statsTuple))
1355         {
1356                 stats2 = (Form_pg_statistic) GETSTRUCT(vardata2.statsTuple);
1357                 have_mcvs2 = get_attstatsslot(vardata2.statsTuple,
1358                                                                           vardata2.atttype,
1359                                                                           vardata2.atttypmod,
1360                                                                           STATISTIC_KIND_MCV,
1361                                                                           InvalidOid,
1362                                                                           &values2, &nvalues2,
1363                                                                           &numbers2, &nnumbers2);
1364         }
1365
1366         if (have_mcvs1 && have_mcvs2)
1367         {
1368                 /*
1369                  * We have most-common-value lists for both relations.  Run
1370                  * through the lists to see which MCVs actually join to each other
1371                  * with the given operator.  This allows us to determine the exact
1372                  * join selectivity for the portion of the relations represented
1373                  * by the MCV lists.  We still have to estimate for the remaining
1374                  * population, but in a skewed distribution this gives us a big
1375                  * leg up in accuracy.  For motivation see the analysis in Y.
1376                  * Ioannidis and S. Christodoulakis, "On the propagation of errors
1377                  * in the size of join results", Technical Report 1018, Computer
1378                  * Science Dept., University of Wisconsin, Madison, March 1991
1379                  * (available from ftp.cs.wisc.edu).
1380                  */
1381                 FmgrInfo        eqproc;
1382                 bool       *hasmatch1;
1383                 bool       *hasmatch2;
1384                 double          nullfrac1 = stats1->stanullfrac;
1385                 double          nullfrac2 = stats2->stanullfrac;
1386                 double          matchprodfreq,
1387                                         matchfreq1,
1388                                         matchfreq2,
1389                                         unmatchfreq1,
1390                                         unmatchfreq2,
1391                                         otherfreq1,
1392                                         otherfreq2,
1393                                         totalsel1,
1394                                         totalsel2;
1395                 int                     i,
1396                                         nmatches;
1397
1398                 fmgr_info(get_opcode(operator), &eqproc);
1399                 hasmatch1 = (bool *) palloc0(nvalues1 * sizeof(bool));
1400                 hasmatch2 = (bool *) palloc0(nvalues2 * sizeof(bool));
1401
1402                 /*
1403                  * If we are doing any variant of JOIN_IN, pretend all the values
1404                  * of the righthand relation are unique (ie, act as if it's been
1405                  * DISTINCT'd).
1406                  *
1407                  * NOTE: it might seem that we should unique-ify the lefthand input
1408                  * when considering JOIN_REVERSE_IN.  But this is not so, because
1409                  * the join clause we've been handed has not been commuted from
1410                  * the way the parser originally wrote it.      We know that the
1411                  * unique side of the IN clause is *always* on the right.
1412                  *
1413                  * NOTE: it would be dangerous to try to be smart about JOIN_LEFT or
1414                  * JOIN_RIGHT here, because we do not have enough information to
1415                  * determine which var is really on which side of the join.
1416                  * Perhaps someday we should pass in more information.
1417                  */
1418                 if (jointype == JOIN_IN ||
1419                         jointype == JOIN_REVERSE_IN ||
1420                         jointype == JOIN_UNIQUE_INNER ||
1421                         jointype == JOIN_UNIQUE_OUTER)
1422                 {
1423                         float4          oneovern = 1.0 / nd2;
1424
1425                         for (i = 0; i < nvalues2; i++)
1426                                 numbers2[i] = oneovern;
1427                         nullfrac2 = oneovern;
1428                 }
1429
1430                 /*
1431                  * Note we assume that each MCV will match at most one member of
1432                  * the other MCV list.  If the operator isn't really equality,
1433                  * there could be multiple matches --- but we don't look for them,
1434                  * both for speed and because the math wouldn't add up...
1435                  */
1436                 matchprodfreq = 0.0;
1437                 nmatches = 0;
1438                 for (i = 0; i < nvalues1; i++)
1439                 {
1440                         int                     j;
1441
1442                         for (j = 0; j < nvalues2; j++)
1443                         {
1444                                 if (hasmatch2[j])
1445                                         continue;
1446                                 if (DatumGetBool(FunctionCall2(&eqproc,
1447                                                                                            values1[i],
1448                                                                                            values2[j])))
1449                                 {
1450                                         hasmatch1[i] = hasmatch2[j] = true;
1451                                         matchprodfreq += numbers1[i] * numbers2[j];
1452                                         nmatches++;
1453                                         break;
1454                                 }
1455                         }
1456                 }
1457                 CLAMP_PROBABILITY(matchprodfreq);
1458                 /* Sum up frequencies of matched and unmatched MCVs */
1459                 matchfreq1 = unmatchfreq1 = 0.0;
1460                 for (i = 0; i < nvalues1; i++)
1461                 {
1462                         if (hasmatch1[i])
1463                                 matchfreq1 += numbers1[i];
1464                         else
1465                                 unmatchfreq1 += numbers1[i];
1466                 }
1467                 CLAMP_PROBABILITY(matchfreq1);
1468                 CLAMP_PROBABILITY(unmatchfreq1);
1469                 matchfreq2 = unmatchfreq2 = 0.0;
1470                 for (i = 0; i < nvalues2; i++)
1471                 {
1472                         if (hasmatch2[i])
1473                                 matchfreq2 += numbers2[i];
1474                         else
1475                                 unmatchfreq2 += numbers2[i];
1476                 }
1477                 CLAMP_PROBABILITY(matchfreq2);
1478                 CLAMP_PROBABILITY(unmatchfreq2);
1479                 pfree(hasmatch1);
1480                 pfree(hasmatch2);
1481
1482                 /*
1483                  * Compute total frequency of non-null values that are not in the
1484                  * MCV lists.
1485                  */
1486                 otherfreq1 = 1.0 - nullfrac1 - matchfreq1 - unmatchfreq1;
1487                 otherfreq2 = 1.0 - nullfrac2 - matchfreq2 - unmatchfreq2;
1488                 CLAMP_PROBABILITY(otherfreq1);
1489                 CLAMP_PROBABILITY(otherfreq2);
1490
1491                 /*
1492                  * We can estimate the total selectivity from the point of view of
1493                  * relation 1 as: the known selectivity for matched MCVs, plus
1494                  * unmatched MCVs that are assumed to match against random members
1495                  * of relation 2's non-MCV population, plus non-MCV values that
1496                  * are assumed to match against random members of relation 2's
1497                  * unmatched MCVs plus non-MCV values.
1498                  */
1499                 totalsel1 = matchprodfreq;
1500                 if (nd2 > nvalues2)
1501                         totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - nvalues2);
1502                 if (nd2 > nmatches)
1503                         totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) /
1504                                 (nd2 - nmatches);
1505                 /* Same estimate from the point of view of relation 2. */
1506                 totalsel2 = matchprodfreq;
1507                 if (nd1 > nvalues1)
1508                         totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - nvalues1);
1509                 if (nd1 > nmatches)
1510                         totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) /
1511                                 (nd1 - nmatches);
1512
1513                 /*
1514                  * Use the smaller of the two estimates.  This can be justified in
1515                  * essentially the same terms as given below for the no-stats
1516                  * case: to a first approximation, we are estimating from the
1517                  * point of view of the relation with smaller nd.
1518                  */
1519                 selec = (totalsel1 < totalsel2) ? totalsel1 : totalsel2;
1520         }
1521         else
1522         {
1523                 /*
1524                  * We do not have MCV lists for both sides.  Estimate the join
1525                  * selectivity as MIN(1/nd1,1/nd2)*(1-nullfrac1)*(1-nullfrac2).
1526                  * This is plausible if we assume that the join operator is strict
1527                  * and the non-null values are about equally distributed: a given
1528                  * non-null tuple of rel1 will join to either zero or
1529                  * N2*(1-nullfrac2)/nd2 rows of rel2, so total join rows are at
1530                  * most N1*(1-nullfrac1)*N2*(1-nullfrac2)/nd2 giving a join
1531                  * selectivity of not more than (1-nullfrac1)*(1-nullfrac2)/nd2.
1532                  * By the same logic it is not more than
1533                  * (1-nullfrac1)*(1-nullfrac2)/nd1, so the expression with MIN()
1534                  * is an upper bound.  Using the MIN() means we estimate from the
1535                  * point of view of the relation with smaller nd (since the larger
1536                  * nd is determining the MIN).  It is reasonable to assume that
1537                  * most tuples in this rel will have join partners, so the bound
1538                  * is probably reasonably tight and should be taken as-is.
1539                  *
1540                  * XXX Can we be smarter if we have an MCV list for just one side? It
1541                  * seems that if we assume equal distribution for the other side,
1542                  * we end up with the same answer anyway.
1543                  */
1544                 double          nullfrac1 = stats1 ? stats1->stanullfrac : 0.0;
1545                 double          nullfrac2 = stats2 ? stats2->stanullfrac : 0.0;
1546
1547                 selec = (1.0 - nullfrac1) * (1.0 - nullfrac2);
1548                 if (nd1 > nd2)
1549                         selec /= nd1;
1550                 else
1551                         selec /= nd2;
1552         }
1553
1554         if (have_mcvs1)
1555                 free_attstatsslot(vardata1.atttype, values1, nvalues1,
1556                                                   numbers1, nnumbers1);
1557         if (have_mcvs2)
1558                 free_attstatsslot(vardata2.atttype, values2, nvalues2,
1559                                                   numbers2, nnumbers2);
1560
1561         ReleaseVariableStats(vardata1);
1562         ReleaseVariableStats(vardata2);
1563
1564         CLAMP_PROBABILITY(selec);
1565
1566         PG_RETURN_FLOAT8((float8) selec);
1567 }
1568
1569 /*
1570  *              neqjoinsel              - Join selectivity of "!="
1571  */
1572 Datum
1573 neqjoinsel(PG_FUNCTION_ARGS)
1574 {
1575         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
1576         Oid                     operator = PG_GETARG_OID(1);
1577         List       *args = (List *) PG_GETARG_POINTER(2);
1578         JoinType        jointype = (JoinType) PG_GETARG_INT16(3);
1579         Oid                     eqop;
1580         float8          result;
1581
1582         /*
1583          * We want 1 - eqjoinsel() where the equality operator is the one
1584          * associated with this != operator, that is, its negator.
1585          */
1586         eqop = get_negator(operator);
1587         if (eqop)
1588         {
1589                 result = DatumGetFloat8(DirectFunctionCall4(eqjoinsel,
1590                                                                                                         PointerGetDatum(root),
1591                                                                                                   ObjectIdGetDatum(eqop),
1592                                                                                                         PointerGetDatum(args),
1593                                                                                            Int16GetDatum(jointype)));
1594         }
1595         else
1596         {
1597                 /* Use default selectivity (should we raise an error instead?) */
1598                 result = DEFAULT_EQ_SEL;
1599         }
1600         result = 1.0 - result;
1601         PG_RETURN_FLOAT8(result);
1602 }
1603
1604 /*
1605  *              scalarltjoinsel - Join selectivity of "<" and "<=" for scalars
1606  */
1607 Datum
1608 scalarltjoinsel(PG_FUNCTION_ARGS)
1609 {
1610         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
1611 }
1612
1613 /*
1614  *              scalargtjoinsel - Join selectivity of ">" and ">=" for scalars
1615  */
1616 Datum
1617 scalargtjoinsel(PG_FUNCTION_ARGS)
1618 {
1619         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
1620 }
1621
1622 /*
1623  *              regexeqjoinsel  - Join selectivity of regular-expression pattern match.
1624  */
1625 Datum
1626 regexeqjoinsel(PG_FUNCTION_ARGS)
1627 {
1628         PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
1629 }
1630
1631 /*
1632  *              icregexeqjoinsel        - Join selectivity of case-insensitive regex match.
1633  */
1634 Datum
1635 icregexeqjoinsel(PG_FUNCTION_ARGS)
1636 {
1637         PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
1638 }
1639
1640 /*
1641  *              likejoinsel                     - Join selectivity of LIKE pattern match.
1642  */
1643 Datum
1644 likejoinsel(PG_FUNCTION_ARGS)
1645 {
1646         PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
1647 }
1648
1649 /*
1650  *              iclikejoinsel                   - Join selectivity of ILIKE pattern match.
1651  */
1652 Datum
1653 iclikejoinsel(PG_FUNCTION_ARGS)
1654 {
1655         PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
1656 }
1657
1658 /*
1659  *              regexnejoinsel  - Join selectivity of regex non-match.
1660  */
1661 Datum
1662 regexnejoinsel(PG_FUNCTION_ARGS)
1663 {
1664         float8          result;
1665
1666         result = DatumGetFloat8(regexeqjoinsel(fcinfo));
1667         result = 1.0 - result;
1668         PG_RETURN_FLOAT8(result);
1669 }
1670
1671 /*
1672  *              icregexnejoinsel        - Join selectivity of case-insensitive regex non-match.
1673  */
1674 Datum
1675 icregexnejoinsel(PG_FUNCTION_ARGS)
1676 {
1677         float8          result;
1678
1679         result = DatumGetFloat8(icregexeqjoinsel(fcinfo));
1680         result = 1.0 - result;
1681         PG_RETURN_FLOAT8(result);
1682 }
1683
1684 /*
1685  *              nlikejoinsel            - Join selectivity of LIKE pattern non-match.
1686  */
1687 Datum
1688 nlikejoinsel(PG_FUNCTION_ARGS)
1689 {
1690         float8          result;
1691
1692         result = DatumGetFloat8(likejoinsel(fcinfo));
1693         result = 1.0 - result;
1694         PG_RETURN_FLOAT8(result);
1695 }
1696
1697 /*
1698  *              icnlikejoinsel          - Join selectivity of ILIKE pattern non-match.
1699  */
1700 Datum
1701 icnlikejoinsel(PG_FUNCTION_ARGS)
1702 {
1703         float8          result;
1704
1705         result = DatumGetFloat8(iclikejoinsel(fcinfo));
1706         result = 1.0 - result;
1707         PG_RETURN_FLOAT8(result);
1708 }
1709
1710 /*
1711  * mergejoinscansel                     - Scan selectivity of merge join.
1712  *
1713  * A merge join will stop as soon as it exhausts either input stream.
1714  * Therefore, if we can estimate the ranges of both input variables,
1715  * we can estimate how much of the input will actually be read.  This
1716  * can have a considerable impact on the cost when using indexscans.
1717  *
1718  * clause should be a clause already known to be mergejoinable.
1719  *
1720  * *leftscan is set to the fraction of the left-hand variable expected
1721  * to be scanned (0 to 1), and similarly *rightscan for the right-hand
1722  * variable.
1723  */
1724 void
1725 mergejoinscansel(PlannerInfo *root, Node *clause,
1726                                  Selectivity *leftscan,
1727                                  Selectivity *rightscan)
1728 {
1729         Node       *left,
1730                            *right;
1731         VariableStatData leftvar,
1732                                 rightvar;
1733         Oid                     lefttype,
1734                                 righttype;
1735         Oid                     opno,
1736                                 lsortop,
1737                                 rsortop,
1738                                 ltop,
1739                                 gtop,
1740                                 leop,
1741                                 revgtop,
1742                                 revleop;
1743         Datum           leftmax,
1744                                 rightmax;
1745         double          selec;
1746
1747         /* Set default results if we can't figure anything out. */
1748         *leftscan = *rightscan = 1.0;
1749
1750         /* Deconstruct the merge clause */
1751         if (!is_opclause(clause))
1752                 return;                                 /* shouldn't happen */
1753         opno = ((OpExpr *) clause)->opno;
1754         left = get_leftop((Expr *) clause);
1755         right = get_rightop((Expr *) clause);
1756         if (!right)
1757                 return;                                 /* shouldn't happen */
1758
1759         /* Look for stats for the inputs */
1760         examine_variable(root, left, 0, &leftvar);
1761         examine_variable(root, right, 0, &rightvar);
1762
1763         /* Get the direct input types of the operator */
1764         lefttype = exprType(left);
1765         righttype = exprType(right);
1766
1767         /* Verify mergejoinability and get left and right "<" operators */
1768         if (!op_mergejoinable(opno,
1769                                                   &lsortop,
1770                                                   &rsortop))
1771                 goto fail;                              /* shouldn't happen */
1772
1773         /* Try to get maximum values of both inputs */
1774         if (!get_variable_maximum(root, &leftvar, lsortop, &leftmax))
1775                 goto fail;                              /* no max available from stats */
1776
1777         if (!get_variable_maximum(root, &rightvar, rsortop, &rightmax))
1778                 goto fail;                              /* no max available from stats */
1779
1780         /* Look up the "left < right" and "left > right" operators */
1781         op_mergejoin_crossops(opno, &ltop, &gtop, NULL, NULL);
1782
1783         /* Look up the "left <= right" operator */
1784         leop = get_negator(gtop);
1785         if (!OidIsValid(leop))
1786                 goto fail;                              /* insufficient info in catalogs */
1787
1788         /* Look up the "right > left" operator */
1789         revgtop = get_commutator(ltop);
1790         if (!OidIsValid(revgtop))
1791                 goto fail;                              /* insufficient info in catalogs */
1792
1793         /* Look up the "right <= left" operator */
1794         revleop = get_negator(revgtop);
1795         if (!OidIsValid(revleop))
1796                 goto fail;                              /* insufficient info in catalogs */
1797
1798         /*
1799          * Now, the fraction of the left variable that will be scanned is the
1800          * fraction that's <= the right-side maximum value.  But only believe
1801          * non-default estimates, else stick with our 1.0.
1802          */
1803         selec = scalarineqsel(root, leop, false, &leftvar,
1804                                                   rightmax, righttype);
1805         if (selec != DEFAULT_INEQ_SEL)
1806                 *leftscan = selec;
1807
1808         /* And similarly for the right variable. */
1809         selec = scalarineqsel(root, revleop, false, &rightvar,
1810                                                   leftmax, lefttype);
1811         if (selec != DEFAULT_INEQ_SEL)
1812                 *rightscan = selec;
1813
1814         /*
1815          * Only one of the two fractions can really be less than 1.0; believe
1816          * the smaller estimate and reset the other one to exactly 1.0.  If we
1817          * get exactly equal estimates (as can easily happen with self-joins),
1818          * believe neither.
1819          */
1820         if (*leftscan > *rightscan)
1821                 *leftscan = 1.0;
1822         else if (*leftscan < *rightscan)
1823                 *rightscan = 1.0;
1824         else
1825                 *leftscan = *rightscan = 1.0;
1826
1827 fail:
1828         ReleaseVariableStats(leftvar);
1829         ReleaseVariableStats(rightvar);
1830 }
1831
1832
1833 /*
1834  * Helper routine for estimate_num_groups: add an item to a list of
1835  * GroupVarInfos, but only if it's not known equal to any of the existing
1836  * entries.
1837  */
1838 typedef struct
1839 {
1840         Node       *var;                /* might be an expression, not just a Var */
1841         RelOptInfo *rel;                /* relation it belongs to */
1842         double          ndistinct;      /* # distinct values */
1843 } GroupVarInfo;
1844
1845 static List *
1846 add_unique_group_var(PlannerInfo *root, List *varinfos,
1847                                          Node *var, VariableStatData *vardata)
1848 {
1849         GroupVarInfo *varinfo;
1850         double          ndistinct;
1851         ListCell   *lc;
1852
1853         ndistinct = get_variable_numdistinct(vardata);
1854
1855         /* cannot use foreach here because of possible list_delete */
1856         lc = list_head(varinfos);
1857         while (lc)
1858         {
1859                 varinfo = (GroupVarInfo *) lfirst(lc);
1860
1861                 /* must advance lc before list_delete possibly pfree's it */
1862                 lc = lnext(lc);
1863
1864                 /* Drop exact duplicates */
1865                 if (equal(var, varinfo->var))
1866                         return varinfos;
1867
1868                 /*
1869                  * Drop known-equal vars, but only if they belong to different
1870                  * relations (see comments for estimate_num_groups)
1871                  */
1872                 if (vardata->rel != varinfo->rel &&
1873                         exprs_known_equal(root, var, varinfo->var))
1874                 {
1875                         if (varinfo->ndistinct <= ndistinct)
1876                         {
1877                                 /* Keep older item, forget new one */
1878                                 return varinfos;
1879                         }
1880                         else
1881                         {
1882                                 /* Delete the older item */
1883                                 varinfos = list_delete_ptr(varinfos, varinfo);
1884                         }
1885                 }
1886         }
1887
1888         varinfo = (GroupVarInfo *) palloc(sizeof(GroupVarInfo));
1889
1890         varinfo->var = var;
1891         varinfo->rel = vardata->rel;
1892         varinfo->ndistinct = ndistinct;
1893         varinfos = lappend(varinfos, varinfo);
1894         return varinfos;
1895 }
1896
1897 /*
1898  * estimate_num_groups          - Estimate number of groups in a grouped query
1899  *
1900  * Given a query having a GROUP BY clause, estimate how many groups there
1901  * will be --- ie, the number of distinct combinations of the GROUP BY
1902  * expressions.
1903  *
1904  * This routine is also used to estimate the number of rows emitted by
1905  * a DISTINCT filtering step; that is an isomorphic problem.  (Note:
1906  * actually, we only use it for DISTINCT when there's no grouping or
1907  * aggregation ahead of the DISTINCT.)
1908  *
1909  * Inputs:
1910  *      root - the query
1911  *      groupExprs - list of expressions being grouped by
1912  *      input_rows - number of rows estimated to arrive at the group/unique
1913  *              filter step
1914  *
1915  * Given the lack of any cross-correlation statistics in the system, it's
1916  * impossible to do anything really trustworthy with GROUP BY conditions
1917  * involving multiple Vars.  We should however avoid assuming the worst
1918  * case (all possible cross-product terms actually appear as groups) since
1919  * very often the grouped-by Vars are highly correlated.  Our current approach
1920  * is as follows:
1921  *      1.      Reduce the given expressions to a list of unique Vars used.  For
1922  *              example, GROUP BY a, a + b is treated the same as GROUP BY a, b.
1923  *              It is clearly correct not to count the same Var more than once.
1924  *              It is also reasonable to treat f(x) the same as x: f() cannot
1925  *              increase the number of distinct values (unless it is volatile,
1926  *              which we consider unlikely for grouping), but it probably won't
1927  *              reduce the number of distinct values much either.
1928  *              As a special case, if a GROUP BY expression can be matched to an
1929  *              expressional index for which we have statistics, then we treat the
1930  *              whole expression as though it were just a Var.
1931  *      2.      If the list contains Vars of different relations that are known equal
1932  *              due to equijoin clauses, then drop all but one of the Vars from each
1933  *              known-equal set, keeping the one with smallest estimated # of values
1934  *              (since the extra values of the others can't appear in joined rows).
1935  *              Note the reason we only consider Vars of different relations is that
1936  *              if we considered ones of the same rel, we'd be double-counting the
1937  *              restriction selectivity of the equality in the next step.
1938  *      3.      For Vars within a single source rel, we multiply together the numbers
1939  *              of values, clamp to the number of rows in the rel (divided by 10 if
1940  *              more than one Var), and then multiply by the selectivity of the
1941  *              restriction clauses for that rel.  When there's more than one Var,
1942  *              the initial product is probably too high (it's the worst case) but
1943  *              clamping to a fraction of the rel's rows seems to be a helpful
1944  *              heuristic for not letting the estimate get out of hand.  (The factor
1945  *              of 10 is derived from pre-Postgres-7.4 practice.)  Multiplying
1946  *              by the restriction selectivity is effectively assuming that the
1947  *              restriction clauses are independent of the grouping, which is a crummy
1948  *              assumption, but it's hard to do better.
1949  *      4.      If there are Vars from multiple rels, we repeat step 3 for each such
1950  *              rel, and multiply the results together.
1951  * Note that rels not containing grouped Vars are ignored completely, as are
1952  * join clauses other than the equijoin clauses used in step 2.  Such rels
1953  * cannot increase the number of groups, and we assume such clauses do not
1954  * reduce the number either (somewhat bogus, but we don't have the info to
1955  * do better).
1956  */
1957 double
1958 estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows)
1959 {
1960         List       *varinfos = NIL;
1961         double          numdistinct;
1962         ListCell   *l;
1963
1964         /* We should not be called unless query has GROUP BY (or DISTINCT) */
1965         Assert(groupExprs != NIL);
1966
1967         /*
1968          * Steps 1/2: find the unique Vars used, treating an expression as a Var
1969          * if we can find stats for it.  For each one, record the statistical
1970          * estimate of number of distinct values (total in its table, without
1971          * regard for filtering).
1972          */
1973         foreach(l, groupExprs)
1974         {
1975                 Node       *groupexpr = (Node *) lfirst(l);
1976                 VariableStatData vardata;
1977                 List       *varshere;
1978                 ListCell   *l2;
1979
1980                 /*
1981                  * If examine_variable is able to deduce anything about the GROUP BY
1982                  * expression, treat it as a single variable even if it's really more
1983                  * complicated.
1984                  */
1985                 examine_variable(root, groupexpr, 0, &vardata);
1986                 if (vardata.statsTuple != NULL || vardata.isunique)
1987                 {
1988                         varinfos = add_unique_group_var(root, varinfos,
1989                                                                                         groupexpr, &vardata);
1990                         ReleaseVariableStats(vardata);
1991                         continue;
1992                 }
1993                 ReleaseVariableStats(vardata);
1994
1995                 /*
1996                  * Else pull out the component Vars
1997                  */
1998                 varshere = pull_var_clause(groupexpr, false);
1999
2000                 /*
2001                  * If we find any variable-free GROUP BY item, then either it is a
2002                  * constant (and we can ignore it) or it contains a volatile
2003                  * function; in the latter case we punt and assume that each input
2004                  * row will yield a distinct group.
2005                  */
2006                 if (varshere == NIL)
2007                 {
2008                         if (contain_volatile_functions(groupexpr))
2009                                 return input_rows;
2010                         continue;
2011                 }
2012
2013                 /*
2014                  * Else add variables to varinfos list
2015                  */
2016                 foreach(l2, varshere)
2017                 {
2018                         Node       *var = (Node *) lfirst(l2);
2019
2020                         examine_variable(root, var, 0, &vardata);
2021                         varinfos = add_unique_group_var(root, varinfos, var, &vardata);
2022                         ReleaseVariableStats(vardata);
2023                 }
2024         }
2025
2026         /* If now no Vars, we must have an all-constant GROUP BY list. */
2027         if (varinfos == NIL)
2028                 return 1.0;
2029
2030         /*
2031          * Steps 3/4: group Vars by relation and estimate total numdistinct.
2032          *
2033          * For each iteration of the outer loop, we process the frontmost Var in
2034          * varinfos, plus all other Vars in the same relation.  We remove
2035          * these Vars from the newvarinfos list for the next iteration. This
2036          * is the easiest way to group Vars of same rel together.
2037          */
2038         numdistinct = 1.0;
2039
2040         do
2041         {
2042                 GroupVarInfo *varinfo1 = (GroupVarInfo *) linitial(varinfos);
2043                 RelOptInfo *rel = varinfo1->rel;
2044                 double          reldistinct = varinfo1->ndistinct;
2045                 double          relmaxndistinct = reldistinct;
2046                 int                     relvarcount = 1;
2047                 List       *newvarinfos = NIL;
2048
2049                 /*
2050                  * Get the product of numdistinct estimates of the Vars for this rel.
2051                  * Also, construct new varinfos list of remaining Vars.
2052                  */
2053                 for_each_cell(l, lnext(list_head(varinfos)))
2054                 {
2055                         GroupVarInfo *varinfo2 = (GroupVarInfo *) lfirst(l);
2056
2057                         if (varinfo2->rel == varinfo1->rel)
2058                         {
2059                                 reldistinct *= varinfo2->ndistinct;
2060                                 if (relmaxndistinct < varinfo2->ndistinct)
2061                                         relmaxndistinct = varinfo2->ndistinct;
2062                                 relvarcount++;
2063                         }
2064                         else
2065                         {
2066                                 /* not time to process varinfo2 yet */
2067                                 newvarinfos = lcons(varinfo2, newvarinfos);
2068                         }
2069                 }
2070
2071                 /*
2072                  * Sanity check --- don't divide by zero if empty relation.
2073                  */
2074                 Assert(rel->reloptkind == RELOPT_BASEREL);
2075                 if (rel->tuples > 0)
2076                 {
2077                         /*
2078                          * Clamp to size of rel, or size of rel / 10 if multiple Vars.
2079                          * The fudge factor is because the Vars are probably correlated
2080                          * but we don't know by how much.  We should never clamp to less
2081                          * than the largest ndistinct value for any of the Vars, though,
2082                          * since there will surely be at least that many groups.
2083                          */
2084                         double          clamp = rel->tuples;
2085
2086                         if (relvarcount > 1)
2087                         {
2088                                 clamp *= 0.1;
2089                                 if (clamp < relmaxndistinct)
2090                                 {
2091                                         clamp = relmaxndistinct;
2092                                         /* for sanity in case some ndistinct is too large: */
2093                                         if (clamp > rel->tuples)
2094                                                 clamp = rel->tuples;
2095                                 }
2096                         }
2097                         if (reldistinct > clamp)
2098                                 reldistinct = clamp;
2099
2100                         /*
2101                          * Multiply by restriction selectivity.
2102                          */
2103                         reldistinct *= rel->rows / rel->tuples;
2104
2105                         /*
2106                          * Update estimate of total distinct groups.
2107                          */
2108                         numdistinct *= reldistinct;
2109                 }
2110
2111                 varinfos = newvarinfos;
2112         } while (varinfos != NIL);
2113
2114         numdistinct = ceil(numdistinct);
2115
2116         /* Guard against out-of-range answers */
2117         if (numdistinct > input_rows)
2118                 numdistinct = input_rows;
2119         if (numdistinct < 1.0)
2120                 numdistinct = 1.0;
2121
2122         return numdistinct;
2123 }
2124
2125 /*
2126  * Estimate hash bucketsize fraction (ie, number of entries in a bucket
2127  * divided by total tuples in relation) if the specified expression is used
2128  * as a hash key.
2129  *
2130  * XXX This is really pretty bogus since we're effectively assuming that the
2131  * distribution of hash keys will be the same after applying restriction
2132  * clauses as it was in the underlying relation.  However, we are not nearly
2133  * smart enough to figure out how the restrict clauses might change the
2134  * distribution, so this will have to do for now.
2135  *
2136  * We are passed the number of buckets the executor will use for the given
2137  * input relation.      If the data were perfectly distributed, with the same
2138  * number of tuples going into each available bucket, then the bucketsize
2139  * fraction would be 1/nbuckets.  But this happy state of affairs will occur
2140  * only if (a) there are at least nbuckets distinct data values, and (b)
2141  * we have a not-too-skewed data distribution.  Otherwise the buckets will
2142  * be nonuniformly occupied.  If the other relation in the join has a key
2143  * distribution similar to this one's, then the most-loaded buckets are
2144  * exactly those that will be probed most often.  Therefore, the "average"
2145  * bucket size for costing purposes should really be taken as something close
2146  * to the "worst case" bucket size.  We try to estimate this by adjusting the
2147  * fraction if there are too few distinct data values, and then scaling up
2148  * by the ratio of the most common value's frequency to the average frequency.
2149  *
2150  * If no statistics are available, use a default estimate of 0.1.  This will
2151  * discourage use of a hash rather strongly if the inner relation is large,
2152  * which is what we want.  We do not want to hash unless we know that the
2153  * inner rel is well-dispersed (or the alternatives seem much worse).
2154  */
2155 Selectivity
2156 estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey, double nbuckets)
2157 {
2158         VariableStatData vardata;
2159         double          estfract,
2160                                 ndistinct,
2161                                 stanullfrac,
2162                                 mcvfreq,
2163                                 avgfreq;
2164         float4     *numbers;
2165         int                     nnumbers;
2166
2167         examine_variable(root, hashkey, 0, &vardata);
2168
2169         /* Get number of distinct values and fraction that are null */
2170         ndistinct = get_variable_numdistinct(&vardata);
2171
2172         if (HeapTupleIsValid(vardata.statsTuple))
2173         {
2174                 Form_pg_statistic stats;
2175
2176                 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
2177                 stanullfrac = stats->stanullfrac;
2178         }
2179         else
2180         {
2181                 /*
2182                  * Believe a default ndistinct only if it came from stats.
2183                  * Otherwise punt and return 0.1, per comments above.
2184                  */
2185                 if (ndistinct == DEFAULT_NUM_DISTINCT)
2186                 {
2187                         ReleaseVariableStats(vardata);
2188                         return (Selectivity) 0.1;
2189                 }
2190
2191                 stanullfrac = 0.0;
2192         }
2193
2194         /* Compute avg freq of all distinct data values in raw relation */
2195         avgfreq = (1.0 - stanullfrac) / ndistinct;
2196
2197         /*
2198          * Adjust ndistinct to account for restriction clauses.  Observe we
2199          * are assuming that the data distribution is affected uniformly by
2200          * the restriction clauses!
2201          *
2202          * XXX Possibly better way, but much more expensive: multiply by
2203          * selectivity of rel's restriction clauses that mention the target
2204          * Var.
2205          */
2206         if (vardata.rel)
2207                 ndistinct *= vardata.rel->rows / vardata.rel->tuples;
2208
2209         /*
2210          * Initial estimate of bucketsize fraction is 1/nbuckets as long as
2211          * the number of buckets is less than the expected number of distinct
2212          * values; otherwise it is 1/ndistinct.
2213          */
2214         if (ndistinct > nbuckets)
2215                 estfract = 1.0 / nbuckets;
2216         else
2217                 estfract = 1.0 / ndistinct;
2218
2219         /*
2220          * Look up the frequency of the most common value, if available.
2221          */
2222         mcvfreq = 0.0;
2223
2224         if (HeapTupleIsValid(vardata.statsTuple))
2225         {
2226                 if (get_attstatsslot(vardata.statsTuple,
2227                                                          vardata.atttype, vardata.atttypmod,
2228                                                          STATISTIC_KIND_MCV, InvalidOid,
2229                                                          NULL, NULL, &numbers, &nnumbers))
2230                 {
2231                         /*
2232                          * The first MCV stat is for the most common value.
2233                          */
2234                         if (nnumbers > 0)
2235                                 mcvfreq = numbers[0];
2236                         free_attstatsslot(vardata.atttype, NULL, 0,
2237                                                           numbers, nnumbers);
2238                 }
2239         }
2240
2241         /*
2242          * Adjust estimated bucketsize upward to account for skewed
2243          * distribution.
2244          */
2245         if (avgfreq > 0.0 && mcvfreq > avgfreq)
2246                 estfract *= mcvfreq / avgfreq;
2247
2248         /*
2249          * Clamp bucketsize to sane range (the above adjustment could easily
2250          * produce an out-of-range result).  We set the lower bound a little
2251          * above zero, since zero isn't a very sane result.
2252          */
2253         if (estfract < 1.0e-6)
2254                 estfract = 1.0e-6;
2255         else if (estfract > 1.0)
2256                 estfract = 1.0;
2257
2258         ReleaseVariableStats(vardata);
2259
2260         return (Selectivity) estfract;
2261 }
2262
2263
2264 /*-------------------------------------------------------------------------
2265  *
2266  * Support routines
2267  *
2268  *-------------------------------------------------------------------------
2269  */
2270
2271 /*
2272  * convert_to_scalar
2273  *        Convert non-NULL values of the indicated types to the comparison
2274  *        scale needed by scalarltsel()/scalargtsel().
2275  *        Returns "true" if successful.
2276  *
2277  * XXX this routine is a hack: ideally we should look up the conversion
2278  * subroutines in pg_type.
2279  *
2280  * All numeric datatypes are simply converted to their equivalent
2281  * "double" values.  (NUMERIC values that are outside the range of "double"
2282  * are clamped to +/- HUGE_VAL.)
2283  *
2284  * String datatypes are converted by convert_string_to_scalar(),
2285  * which is explained below.  The reason why this routine deals with
2286  * three values at a time, not just one, is that we need it for strings.
2287  *
2288  * The bytea datatype is just enough different from strings that it has
2289  * to be treated separately.
2290  *
2291  * The several datatypes representing absolute times are all converted
2292  * to Timestamp, which is actually a double, and then we just use that
2293  * double value.  Note this will give correct results even for the "special"
2294  * values of Timestamp, since those are chosen to compare correctly;
2295  * see timestamp_cmp.
2296  *
2297  * The several datatypes representing relative times (intervals) are all
2298  * converted to measurements expressed in seconds.
2299  */
2300 static bool
2301 convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
2302                                   Datum lobound, Datum hibound, Oid boundstypid,
2303                                   double *scaledlobound, double *scaledhibound)
2304 {
2305         /*
2306          * Both the valuetypid and the boundstypid should exactly match
2307          * the declared input type(s) of the operator we are invoked for,
2308          * so we just error out if either is not recognized.
2309          *
2310          * XXX The histogram we are interpolating between points of could belong
2311          * to a column that's only binary-compatible with the declared type.
2312          * In essence we are assuming that the semantics of binary-compatible
2313          * types are enough alike that we can use a histogram generated with one
2314          * type's operators to estimate selectivity for the other's.  This is
2315          * outright wrong in some cases --- in particular signed versus unsigned
2316          * interpretation could trip us up.  But it's useful enough in the
2317          * majority of cases that we do it anyway.  Should think about more
2318          * rigorous ways to do it.
2319          */
2320         switch (valuetypid)
2321         {
2322                         /*
2323                          * Built-in numeric types
2324                          */
2325                 case BOOLOID:
2326                 case INT2OID:
2327                 case INT4OID:
2328                 case INT8OID:
2329                 case FLOAT4OID:
2330                 case FLOAT8OID:
2331                 case NUMERICOID:
2332                 case OIDOID:
2333                 case REGPROCOID:
2334                 case REGPROCEDUREOID:
2335                 case REGOPEROID:
2336                 case REGOPERATOROID:
2337                 case REGCLASSOID:
2338                 case REGTYPEOID:
2339                         *scaledvalue = convert_numeric_to_scalar(value, valuetypid);
2340                         *scaledlobound = convert_numeric_to_scalar(lobound, boundstypid);
2341                         *scaledhibound = convert_numeric_to_scalar(hibound, boundstypid);
2342                         return true;
2343
2344                         /*
2345                          * Built-in string types
2346                          */
2347                 case CHAROID:
2348                 case BPCHAROID:
2349                 case VARCHAROID:
2350                 case TEXTOID:
2351                 case NAMEOID:
2352                         {
2353                                 unsigned char *valstr = convert_string_datum(value, valuetypid);
2354                                 unsigned char *lostr = convert_string_datum(lobound, boundstypid);
2355                                 unsigned char *histr = convert_string_datum(hibound, boundstypid);
2356
2357                                 convert_string_to_scalar(valstr, scaledvalue,
2358                                                                                  lostr, scaledlobound,
2359                                                                                  histr, scaledhibound);
2360                                 pfree(valstr);
2361                                 pfree(lostr);
2362                                 pfree(histr);
2363                                 return true;
2364                         }
2365
2366                         /*
2367                          * Built-in bytea type
2368                          */
2369                 case BYTEAOID:
2370                         {
2371                                 convert_bytea_to_scalar(value, scaledvalue,
2372                                                                                 lobound, scaledlobound,
2373                                                                                 hibound, scaledhibound);
2374                                 return true;
2375                         }
2376
2377                         /*
2378                          * Built-in time types
2379                          */
2380                 case TIMESTAMPOID:
2381                 case TIMESTAMPTZOID:
2382                 case ABSTIMEOID:
2383                 case DATEOID:
2384                 case INTERVALOID:
2385                 case RELTIMEOID:
2386                 case TINTERVALOID:
2387                 case TIMEOID:
2388                 case TIMETZOID:
2389                         *scaledvalue = convert_timevalue_to_scalar(value, valuetypid);
2390                         *scaledlobound = convert_timevalue_to_scalar(lobound, boundstypid);
2391                         *scaledhibound = convert_timevalue_to_scalar(hibound, boundstypid);
2392                         return true;
2393
2394                         /*
2395                          * Built-in network types
2396                          */
2397                 case INETOID:
2398                 case CIDROID:
2399                 case MACADDROID:
2400                         *scaledvalue = convert_network_to_scalar(value, valuetypid);
2401                         *scaledlobound = convert_network_to_scalar(lobound, boundstypid);
2402                         *scaledhibound = convert_network_to_scalar(hibound, boundstypid);
2403                         return true;
2404         }
2405         /* Don't know how to convert */
2406         return false;
2407 }
2408
2409 /*
2410  * Do convert_to_scalar()'s work for any numeric data type.
2411  */
2412 static double
2413 convert_numeric_to_scalar(Datum value, Oid typid)
2414 {
2415         switch (typid)
2416         {
2417                 case BOOLOID:
2418                         return (double) DatumGetBool(value);
2419                 case INT2OID:
2420                         return (double) DatumGetInt16(value);
2421                 case INT4OID:
2422                         return (double) DatumGetInt32(value);
2423                 case INT8OID:
2424                         return (double) DatumGetInt64(value);
2425                 case FLOAT4OID:
2426                         return (double) DatumGetFloat4(value);
2427                 case FLOAT8OID:
2428                         return (double) DatumGetFloat8(value);
2429                 case NUMERICOID:
2430                         /* Note: out-of-range values will be clamped to +-HUGE_VAL */
2431                         return (double)
2432                                 DatumGetFloat8(DirectFunctionCall1(numeric_float8_no_overflow,
2433                                                                                                    value));
2434                 case OIDOID:
2435                 case REGPROCOID:
2436                 case REGPROCEDUREOID:
2437                 case REGOPEROID:
2438                 case REGOPERATOROID:
2439                 case REGCLASSOID:
2440                 case REGTYPEOID:
2441                         /* we can treat OIDs as integers... */
2442                         return (double) DatumGetObjectId(value);
2443         }
2444
2445         /*
2446          * Can't get here unless someone tries to use scalarltsel/scalargtsel
2447          * on an operator with one numeric and one non-numeric operand.
2448          */
2449         elog(ERROR, "unsupported type: %u", typid);
2450         return 0;
2451 }
2452
2453 /*
2454  * Do convert_to_scalar()'s work for any character-string data type.
2455  *
2456  * String datatypes are converted to a scale that ranges from 0 to 1,
2457  * where we visualize the bytes of the string as fractional digits.
2458  *
2459  * We do not want the base to be 256, however, since that tends to
2460  * generate inflated selectivity estimates; few databases will have
2461  * occurrences of all 256 possible byte values at each position.
2462  * Instead, use the smallest and largest byte values seen in the bounds
2463  * as the estimated range for each byte, after some fudging to deal with
2464  * the fact that we probably aren't going to see the full range that way.
2465  *
2466  * An additional refinement is that we discard any common prefix of the
2467  * three strings before computing the scaled values.  This allows us to
2468  * "zoom in" when we encounter a narrow data range.  An example is a phone
2469  * number database where all the values begin with the same area code.
2470  * (Actually, the bounds will be adjacent histogram-bin-boundary values,
2471  * so this is more likely to happen than you might think.)
2472  */
2473 static void
2474 convert_string_to_scalar(unsigned char *value,
2475                                                  double *scaledvalue,
2476                                                  unsigned char *lobound,
2477                                                  double *scaledlobound,
2478                                                  unsigned char *hibound,
2479                                                  double *scaledhibound)
2480 {
2481         int                     rangelo,
2482                                 rangehi;
2483         unsigned char *sptr;
2484
2485         rangelo = rangehi = hibound[0];
2486         for (sptr = lobound; *sptr; sptr++)
2487         {
2488                 if (rangelo > *sptr)
2489                         rangelo = *sptr;
2490                 if (rangehi < *sptr)
2491                         rangehi = *sptr;
2492         }
2493         for (sptr = hibound; *sptr; sptr++)
2494         {
2495                 if (rangelo > *sptr)
2496                         rangelo = *sptr;
2497                 if (rangehi < *sptr)
2498                         rangehi = *sptr;
2499         }
2500         /* If range includes any upper-case ASCII chars, make it include all */
2501         if (rangelo <= 'Z' && rangehi >= 'A')
2502         {
2503                 if (rangelo > 'A')
2504                         rangelo = 'A';
2505                 if (rangehi < 'Z')
2506                         rangehi = 'Z';
2507         }
2508         /* Ditto lower-case */
2509         if (rangelo <= 'z' && rangehi >= 'a')
2510         {
2511                 if (rangelo > 'a')
2512                         rangelo = 'a';
2513                 if (rangehi < 'z')
2514                         rangehi = 'z';
2515         }
2516         /* Ditto digits */
2517         if (rangelo <= '9' && rangehi >= '0')
2518         {
2519                 if (rangelo > '0')
2520                         rangelo = '0';
2521                 if (rangehi < '9')
2522                         rangehi = '9';
2523         }
2524
2525         /*
2526          * If range includes less than 10 chars, assume we have not got enough
2527          * data, and make it include regular ASCII set.
2528          */
2529         if (rangehi - rangelo < 9)
2530         {
2531                 rangelo = ' ';
2532                 rangehi = 127;
2533         }
2534
2535         /*
2536          * Now strip any common prefix of the three strings.
2537          */
2538         while (*lobound)
2539         {
2540                 if (*lobound != *hibound || *lobound != *value)
2541                         break;
2542                 lobound++, hibound++, value++;
2543         }
2544
2545         /*
2546          * Now we can do the conversions.
2547          */
2548         *scaledvalue = convert_one_string_to_scalar(value, rangelo, rangehi);
2549         *scaledlobound = convert_one_string_to_scalar(lobound, rangelo, rangehi);
2550         *scaledhibound = convert_one_string_to_scalar(hibound, rangelo, rangehi);
2551 }
2552
2553 static double
2554 convert_one_string_to_scalar(unsigned char *value, int rangelo, int rangehi)
2555 {
2556         int                     slen = strlen((char *) value);
2557         double          num,
2558                                 denom,
2559                                 base;
2560
2561         if (slen <= 0)
2562                 return 0.0;                             /* empty string has scalar value 0 */
2563
2564         /*
2565          * Since base is at least 10, need not consider more than about 20
2566          * chars
2567          */
2568         if (slen > 20)
2569                 slen = 20;
2570
2571         /* Convert initial characters to fraction */
2572         base = rangehi - rangelo + 1;
2573         num = 0.0;
2574         denom = base;
2575         while (slen-- > 0)
2576         {
2577                 int                     ch = *value++;
2578
2579                 if (ch < rangelo)
2580                         ch = rangelo - 1;
2581                 else if (ch > rangehi)
2582                         ch = rangehi + 1;
2583                 num += ((double) (ch - rangelo)) / denom;
2584                 denom *= base;
2585         }
2586
2587         return num;
2588 }
2589
2590 /*
2591  * Convert a string-type Datum into a palloc'd, null-terminated string.
2592  *
2593  * When using a non-C locale, we must pass the string through strxfrm()
2594  * before continuing, so as to generate correct locale-specific results.
2595  */
2596 static unsigned char *
2597 convert_string_datum(Datum value, Oid typid)
2598 {
2599         char       *val;
2600
2601         switch (typid)
2602         {
2603                 case CHAROID:
2604                         val = (char *) palloc(2);
2605                         val[0] = DatumGetChar(value);
2606                         val[1] = '\0';
2607                         break;
2608                 case BPCHAROID:
2609                 case VARCHAROID:
2610                 case TEXTOID:
2611                         {
2612                                 char       *str = (char *) VARDATA(DatumGetPointer(value));
2613                                 int                     strlength = VARSIZE(DatumGetPointer(value)) - VARHDRSZ;
2614
2615                                 val = (char *) palloc(strlength + 1);
2616                                 memcpy(val, str, strlength);
2617                                 val[strlength] = '\0';
2618                                 break;
2619                         }
2620                 case NAMEOID:
2621                         {
2622                                 NameData   *nm = (NameData *) DatumGetPointer(value);
2623
2624                                 val = pstrdup(NameStr(*nm));
2625                                 break;
2626                         }
2627                 default:
2628
2629                         /*
2630                          * Can't get here unless someone tries to use scalarltsel on
2631                          * an operator with one string and one non-string operand.
2632                          */
2633                         elog(ERROR, "unsupported type: %u", typid);
2634                         return NULL;
2635         }
2636
2637         if (!lc_collate_is_c())
2638         {
2639                 char       *xfrmstr;
2640                 size_t          xfrmlen;
2641                 size_t          xfrmlen2;
2642
2643                 /*
2644                  * Note: originally we guessed at a suitable output buffer size,
2645                  * and only needed to call strxfrm twice if our guess was too
2646                  * small. However, it seems that some versions of Solaris have
2647                  * buggy strxfrm that can write past the specified buffer length
2648                  * in that scenario.  So, do it the dumb way for portability.
2649                  *
2650                  * Yet other systems (e.g., glibc) sometimes return a smaller value
2651                  * from the second call than the first; thus the Assert must be <=
2652                  * not == as you'd expect.  Can't any of these people program
2653                  * their way out of a paper bag?
2654                  */
2655                 xfrmlen = strxfrm(NULL, val, 0);
2656                 xfrmstr = (char *) palloc(xfrmlen + 1);
2657                 xfrmlen2 = strxfrm(xfrmstr, val, xfrmlen + 1);
2658                 Assert(xfrmlen2 <= xfrmlen);
2659                 pfree(val);
2660                 val = xfrmstr;
2661         }
2662
2663         return (unsigned char *) val;
2664 }
2665
2666 /*
2667  * Do convert_to_scalar()'s work for any bytea data type.
2668  *
2669  * Very similar to convert_string_to_scalar except we can't assume
2670  * null-termination and therefore pass explicit lengths around.
2671  *
2672  * Also, assumptions about likely "normal" ranges of characters have been
2673  * removed - a data range of 0..255 is always used, for now.  (Perhaps
2674  * someday we will add information about actual byte data range to
2675  * pg_statistic.)
2676  */
2677 static void
2678 convert_bytea_to_scalar(Datum value,
2679                                                 double *scaledvalue,
2680                                                 Datum lobound,
2681                                                 double *scaledlobound,
2682                                                 Datum hibound,
2683                                                 double *scaledhibound)
2684 {
2685         int                     rangelo,
2686                                 rangehi,
2687                                 valuelen = VARSIZE(DatumGetPointer(value)) - VARHDRSZ,
2688                                 loboundlen = VARSIZE(DatumGetPointer(lobound)) - VARHDRSZ,
2689                                 hiboundlen = VARSIZE(DatumGetPointer(hibound)) - VARHDRSZ,
2690                                 i,
2691                                 minlen;
2692         unsigned char *valstr = (unsigned char *) VARDATA(DatumGetPointer(value)),
2693                            *lostr = (unsigned char *) VARDATA(DatumGetPointer(lobound)),
2694                            *histr = (unsigned char *) VARDATA(DatumGetPointer(hibound));
2695
2696         /*
2697          * Assume bytea data is uniformly distributed across all byte values.
2698          */
2699         rangelo = 0;
2700         rangehi = 255;
2701
2702         /*
2703          * Now strip any common prefix of the three strings.
2704          */
2705         minlen = Min(Min(valuelen, loboundlen), hiboundlen);
2706         for (i = 0; i < minlen; i++)
2707         {
2708                 if (*lostr != *histr || *lostr != *valstr)
2709                         break;
2710                 lostr++, histr++, valstr++;
2711                 loboundlen--, hiboundlen--, valuelen--;
2712         }
2713
2714         /*
2715          * Now we can do the conversions.
2716          */
2717         *scaledvalue = convert_one_bytea_to_scalar(valstr, valuelen, rangelo, rangehi);
2718         *scaledlobound = convert_one_bytea_to_scalar(lostr, loboundlen, rangelo, rangehi);
2719         *scaledhibound = convert_one_bytea_to_scalar(histr, hiboundlen, rangelo, rangehi);
2720 }
2721
2722 static double
2723 convert_one_bytea_to_scalar(unsigned char *value, int valuelen,
2724                                                         int rangelo, int rangehi)
2725 {
2726         double          num,
2727                                 denom,
2728                                 base;
2729
2730         if (valuelen <= 0)
2731                 return 0.0;                             /* empty string has scalar value 0 */
2732
2733         /*
2734          * Since base is 256, need not consider more than about 10 chars (even
2735          * this many seems like overkill)
2736          */
2737         if (valuelen > 10)
2738                 valuelen = 10;
2739
2740         /* Convert initial characters to fraction */
2741         base = rangehi - rangelo + 1;
2742         num = 0.0;
2743         denom = base;
2744         while (valuelen-- > 0)
2745         {
2746                 int                     ch = *value++;
2747
2748                 if (ch < rangelo)
2749                         ch = rangelo - 1;
2750                 else if (ch > rangehi)
2751                         ch = rangehi + 1;
2752                 num += ((double) (ch - rangelo)) / denom;
2753                 denom *= base;
2754         }
2755
2756         return num;
2757 }
2758
2759 /*
2760  * Do convert_to_scalar()'s work for any timevalue data type.
2761  */
2762 static double
2763 convert_timevalue_to_scalar(Datum value, Oid typid)
2764 {
2765         switch (typid)
2766         {
2767                 case TIMESTAMPOID:
2768                         return DatumGetTimestamp(value);
2769                 case TIMESTAMPTZOID:
2770                         return DatumGetTimestampTz(value);
2771                 case ABSTIMEOID:
2772                         return DatumGetTimestamp(DirectFunctionCall1(abstime_timestamp,
2773                                                                                                                  value));
2774                 case DATEOID:
2775                         return DatumGetTimestamp(DirectFunctionCall1(date_timestamp,
2776                                                                                                                  value));
2777                 case INTERVALOID:
2778                         {
2779                                 Interval   *interval = DatumGetIntervalP(value);
2780
2781                                 /*
2782                                  * Convert the month part of Interval to days using
2783                                  * assumed average month length of 365.25/12.0 days.  Not
2784                                  * too accurate, but plenty good enough for our purposes.
2785                                  */
2786 #ifdef HAVE_INT64_TIMESTAMP
2787                                 return interval->time + interval->day * (double)USECS_PER_DAY +
2788                                            interval->month * ((DAYS_PER_YEAR / (double)MONTHS_PER_YEAR) * USECS_PER_DAY);
2789 #else
2790                                 return interval->time + interval->day * SECS_PER_DAY +
2791                                                 interval->month * ((DAYS_PER_YEAR / (double)MONTHS_PER_YEAR) * (double)SECS_PER_DAY);
2792 #endif
2793                         }
2794                 case RELTIMEOID:
2795 #ifdef HAVE_INT64_TIMESTAMP
2796                         return (DatumGetRelativeTime(value) * 1000000.0);
2797 #else
2798                         return DatumGetRelativeTime(value);
2799 #endif
2800                 case TINTERVALOID:
2801                         {
2802                                 TimeInterval tinterval = DatumGetTimeInterval(value);
2803
2804 #ifdef HAVE_INT64_TIMESTAMP
2805                                 if (tinterval->status != 0)
2806                                         return ((tinterval->data[1] - tinterval->data[0]) * 1000000.0);
2807 #else
2808                                 if (tinterval->status != 0)
2809                                         return tinterval->data[1] - tinterval->data[0];
2810 #endif
2811                                 return 0;               /* for lack of a better idea */
2812                         }
2813                 case TIMEOID:
2814                         return DatumGetTimeADT(value);
2815                 case TIMETZOID:
2816                         {
2817                                 TimeTzADT  *timetz = DatumGetTimeTzADTP(value);
2818
2819                                 /* use GMT-equivalent time */
2820 #ifdef HAVE_INT64_TIMESTAMP
2821                                 return (double) (timetz->time + (timetz->zone * 1000000.0));
2822 #else
2823                                 return (double) (timetz->time + timetz->zone);
2824 #endif
2825                         }
2826         }
2827
2828         /*
2829          * Can't get here unless someone tries to use scalarltsel/scalargtsel
2830          * on an operator with one timevalue and one non-timevalue operand.
2831          */
2832         elog(ERROR, "unsupported type: %u", typid);
2833         return 0;
2834 }
2835
2836
2837 /*
2838  * get_restriction_variable
2839  *              Examine the args of a restriction clause to see if it's of the
2840  *              form (variable op pseudoconstant) or (pseudoconstant op variable),
2841  *              where "variable" could be either a Var or an expression in vars of a
2842  *              single relation.  If so, extract information about the variable,
2843  *              and also indicate which side it was on and the other argument.
2844  *
2845  * Inputs:
2846  *      root: the planner info
2847  *      args: clause argument list
2848  *      varRelid: see specs for restriction selectivity functions
2849  *
2850  * Outputs: (these are valid only if TRUE is returned)
2851  *      *vardata: gets information about variable (see examine_variable)
2852  *      *other: gets other clause argument, aggressively reduced to a constant
2853  *      *varonleft: set TRUE if variable is on the left, FALSE if on the right
2854  *
2855  * Returns TRUE if a variable is identified, otherwise FALSE.
2856  *
2857  * Note: if there are Vars on both sides of the clause, we must fail, because
2858  * callers are expecting that the other side will act like a pseudoconstant.
2859  */
2860 static bool
2861 get_restriction_variable(PlannerInfo *root, List *args, int varRelid,
2862                                                  VariableStatData *vardata, Node **other,
2863                                                  bool *varonleft)
2864 {
2865         Node       *left,
2866                            *right;
2867         VariableStatData rdata;
2868
2869         /* Fail if not a binary opclause (probably shouldn't happen) */
2870         if (list_length(args) != 2)
2871                 return false;
2872
2873         left = (Node *) linitial(args);
2874         right = (Node *) lsecond(args);
2875
2876         /*
2877          * Examine both sides.  Note that when varRelid is nonzero, Vars of
2878          * other relations will be treated as pseudoconstants.
2879          */
2880         examine_variable(root, left, varRelid, vardata);
2881         examine_variable(root, right, varRelid, &rdata);
2882
2883         /*
2884          * If one side is a variable and the other not, we win.
2885          */
2886         if (vardata->rel && rdata.rel == NULL)
2887         {
2888                 *varonleft = true;
2889                 *other = estimate_expression_value(rdata.var);
2890                 /* Assume we need no ReleaseVariableStats(rdata) here */
2891                 return true;
2892         }
2893
2894         if (vardata->rel == NULL && rdata.rel)
2895         {
2896                 *varonleft = false;
2897                 *other = estimate_expression_value(vardata->var);
2898                 /* Assume we need no ReleaseVariableStats(*vardata) here */
2899                 *vardata = rdata;
2900                 return true;
2901         }
2902
2903         /* Ooops, clause has wrong structure (probably var op var) */
2904         ReleaseVariableStats(*vardata);
2905         ReleaseVariableStats(rdata);
2906
2907         return false;
2908 }
2909
2910 /*
2911  * get_join_variables
2912  *              Apply examine_variable() to each side of a join clause.
2913  */
2914 static void
2915 get_join_variables(PlannerInfo *root, List *args,
2916                                    VariableStatData *vardata1, VariableStatData *vardata2)
2917 {
2918         Node       *left,
2919                            *right;
2920
2921         if (list_length(args) != 2)
2922                 elog(ERROR, "join operator should take two arguments");
2923
2924         left = (Node *) linitial(args);
2925         right = (Node *) lsecond(args);
2926
2927         examine_variable(root, left, 0, vardata1);
2928         examine_variable(root, right, 0, vardata2);
2929 }
2930
2931 /*
2932  * examine_variable
2933  *              Try to look up statistical data about an expression.
2934  *              Fill in a VariableStatData struct to describe the expression.
2935  *
2936  * Inputs:
2937  *      root: the planner info
2938  *      node: the expression tree to examine
2939  *      varRelid: see specs for restriction selectivity functions
2940  *
2941  * Outputs: *vardata is filled as follows:
2942  *      var: the input expression (with any binary relabeling stripped, if
2943  *              it is or contains a variable; but otherwise the type is preserved)
2944  *      rel: RelOptInfo for relation containing variable; NULL if expression
2945  *              contains no Vars (NOTE this could point to a RelOptInfo of a
2946  *              subquery, not one in the current query).
2947  *      statsTuple: the pg_statistic entry for the variable, if one exists;
2948  *              otherwise NULL.
2949  *      vartype: exposed type of the expression; this should always match
2950  *              the declared input type of the operator we are estimating for.
2951  *      atttype, atttypmod: type data to pass to get_attstatsslot().  This is
2952  *              commonly the same as the exposed type of the variable argument,
2953  *              but can be different in binary-compatible-type cases.
2954  *
2955  * Caller is responsible for doing ReleaseVariableStats() before exiting.
2956  */
2957 static void
2958 examine_variable(PlannerInfo *root, Node *node, int varRelid,
2959                                  VariableStatData *vardata)
2960 {
2961         Node       *basenode;
2962         Relids          varnos;
2963         RelOptInfo *onerel;
2964
2965         /* Make sure we don't return dangling pointers in vardata */
2966         MemSet(vardata, 0, sizeof(VariableStatData));
2967
2968         /* Save the exposed type of the expression */
2969         vardata->vartype = exprType(node);
2970
2971         /* Look inside any binary-compatible relabeling */
2972
2973         if (IsA(node, RelabelType))
2974                 basenode = (Node *) ((RelabelType *) node)->arg;
2975         else
2976                 basenode = node;
2977
2978         /* Fast path for a simple Var */
2979
2980         if (IsA(basenode, Var) &&
2981                 (varRelid == 0 || varRelid == ((Var *) basenode)->varno))
2982         {
2983                 Var                *var = (Var *) basenode;
2984                 Oid                     relid;
2985
2986                 vardata->var = basenode;        /* return Var without relabeling */
2987                 vardata->rel = find_base_rel(root, var->varno);
2988                 vardata->atttype = var->vartype;
2989                 vardata->atttypmod = var->vartypmod;
2990
2991                 relid = getrelid(var->varno, root->parse->rtable);
2992
2993                 if (OidIsValid(relid))
2994                 {
2995                         vardata->statsTuple = SearchSysCache(STATRELATT,
2996                                                                                                  ObjectIdGetDatum(relid),
2997                                                                                         Int16GetDatum(var->varattno),
2998                                                                                                  0, 0);
2999                 }
3000                 else
3001                 {
3002                         /*
3003                          * XXX This means the Var comes from a JOIN or sub-SELECT.
3004                          * Later add code to dig down into the join etc and see if we
3005                          * can trace the variable to something with stats.      (But
3006                          * beware of sub-SELECTs with DISTINCT/GROUP BY/etc.  Perhaps
3007                          * there are no cases where this would really be useful,
3008                          * because we'd have flattened the subselect if it is??)
3009                          */
3010                 }
3011
3012                 return;
3013         }
3014
3015         /*
3016          * Okay, it's a more complicated expression.  Determine variable
3017          * membership.  Note that when varRelid isn't zero, only vars of that
3018          * relation are considered "real" vars.
3019          */
3020         varnos = pull_varnos(basenode);
3021
3022         onerel = NULL;
3023
3024         switch (bms_membership(varnos))
3025         {
3026                 case BMS_EMPTY_SET:
3027                         /* No Vars at all ... must be pseudo-constant clause */
3028                         break;
3029                 case BMS_SINGLETON:
3030                         if (varRelid == 0 || bms_is_member(varRelid, varnos))
3031                         {
3032                                 onerel = find_base_rel(root,
3033                                    (varRelid ? varRelid : bms_singleton_member(varnos)));
3034                                 vardata->rel = onerel;
3035                                 node = basenode; /* strip any relabeling */
3036                         }
3037                         /* else treat it as a constant */
3038                         break;
3039                 case BMS_MULTIPLE:
3040                         if (varRelid == 0)
3041                         {
3042                                 /* treat it as a variable of a join relation */
3043                                 vardata->rel = find_join_rel(root, varnos);
3044                                 node = basenode; /* strip any relabeling */
3045                         }
3046                         else if (bms_is_member(varRelid, varnos))
3047                         {
3048                                 /* ignore the vars belonging to other relations */
3049                                 vardata->rel = find_base_rel(root, varRelid);
3050                                 node = basenode; /* strip any relabeling */
3051                                 /* note: no point in expressional-index search here */
3052                         }
3053                         /* else treat it as a constant */
3054                         break;
3055         }
3056
3057         bms_free(varnos);
3058
3059         vardata->var = node;
3060         vardata->atttype = exprType(node);
3061         vardata->atttypmod = exprTypmod(node);
3062
3063         if (onerel)
3064         {
3065                 /*
3066                  * We have an expression in vars of a single relation.  Try to
3067                  * match it to expressional index columns, in hopes of finding
3068                  * some statistics.
3069                  *
3070                  * XXX it's conceivable that there are multiple matches with
3071                  * different index opclasses; if so, we need to pick one that
3072                  * matches the operator we are estimating for.  FIXME later.
3073                  */
3074                 ListCell   *ilist;
3075
3076                 foreach(ilist, onerel->indexlist)
3077                 {
3078                         IndexOptInfo *index = (IndexOptInfo *) lfirst(ilist);
3079                         ListCell   *indexpr_item;
3080                         int                     pos;
3081
3082                         indexpr_item = list_head(index->indexprs);
3083                         if (indexpr_item == NULL)
3084                                 continue;               /* no expressions here... */
3085
3086                         /*
3087                          * Ignore partial indexes since they probably don't reflect
3088                          * whole-relation statistics.  Possibly reconsider this later.
3089                          */
3090                         if (index->indpred)
3091                                 continue;
3092
3093                         for (pos = 0; pos < index->ncolumns; pos++)
3094                         {
3095                                 if (index->indexkeys[pos] == 0)
3096                                 {
3097                                         Node       *indexkey;
3098
3099                                         if (indexpr_item == NULL)
3100                                                 elog(ERROR, "too few entries in indexprs list");
3101                                         indexkey = (Node *) lfirst(indexpr_item);
3102                                         if (indexkey && IsA(indexkey, RelabelType))
3103                                                 indexkey = (Node *) ((RelabelType *) indexkey)->arg;
3104                                         if (equal(node, indexkey))
3105                                         {
3106                                                 /*
3107                                                  * Found a match ... is it a unique index? Tests
3108                                                  * here should match has_unique_index().
3109                                                  */
3110                                                 if (index->unique &&
3111                                                         index->ncolumns == 1 &&
3112                                                         index->indpred == NIL)
3113                                                         vardata->isunique = true;
3114                                                 /* Has it got stats? */
3115                                                 vardata->statsTuple = SearchSysCache(STATRELATT,
3116                                                                            ObjectIdGetDatum(index->indexoid),
3117                                                                                                   Int16GetDatum(pos + 1),
3118                                                                                                                          0, 0);
3119                                                 if (vardata->statsTuple)
3120                                                         break;
3121                                         }
3122                                         indexpr_item = lnext(indexpr_item);
3123                                 }
3124                         }
3125                         if (vardata->statsTuple)
3126                                 break;
3127                 }
3128         }
3129 }
3130
3131 /*
3132  * get_variable_numdistinct
3133  *        Estimate the number of distinct values of a variable.
3134  *
3135  * vardata: results of examine_variable
3136  *
3137  * NB: be careful to produce an integral result, since callers may compare
3138  * the result to exact integer counts.
3139  */
3140 static double
3141 get_variable_numdistinct(VariableStatData *vardata)
3142 {
3143         double          stadistinct;
3144         double          ntuples;
3145
3146         /*
3147          * Determine the stadistinct value to use.      There are cases where we
3148          * can get an estimate even without a pg_statistic entry, or can get a
3149          * better value than is in pg_statistic.
3150          */
3151         if (HeapTupleIsValid(vardata->statsTuple))
3152         {
3153                 /* Use the pg_statistic entry */
3154                 Form_pg_statistic stats;
3155
3156                 stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
3157                 stadistinct = stats->stadistinct;
3158         }
3159         else if (vardata->vartype == BOOLOID)
3160         {
3161                 /*
3162                  * Special-case boolean columns: presumably, two distinct values.
3163                  *
3164                  * Are there any other datatypes we should wire in special estimates
3165                  * for?
3166                  */
3167                 stadistinct = 2.0;
3168         }
3169         else
3170         {
3171                 /*
3172                  * We don't keep statistics for system columns, but in some cases
3173                  * we can infer distinctness anyway.
3174                  */
3175                 if (vardata->var && IsA(vardata->var, Var))
3176                 {
3177                         switch (((Var *) vardata->var)->varattno)
3178                         {
3179                                 case ObjectIdAttributeNumber:
3180                                 case SelfItemPointerAttributeNumber:
3181                                         stadistinct = -1.0; /* unique */
3182                                         break;
3183                                 case TableOidAttributeNumber:
3184                                         stadistinct = 1.0;      /* only 1 value */
3185                                         break;
3186                                 default:
3187                                         stadistinct = 0.0;      /* means "unknown" */
3188                                         break;
3189                         }
3190                 }
3191                 else
3192                         stadistinct = 0.0;      /* means "unknown" */
3193
3194                 /*
3195                  * XXX consider using estimate_num_groups on expressions?
3196                  */
3197         }
3198
3199         /*
3200          * If there is a unique index for the variable, assume it is unique no
3201          * matter what pg_statistic says (the statistics could be out of
3202          * date).  Can skip search if we already think it's unique.
3203          */
3204         if (stadistinct != -1.0)
3205         {
3206                 if (vardata->isunique)
3207                         stadistinct = -1.0;
3208                 else if (vardata->var && IsA(vardata->var, Var) &&
3209                                  vardata->rel &&
3210                                  has_unique_index(vardata->rel,
3211                                                                   ((Var *) vardata->var)->varattno))
3212                         stadistinct = -1.0;
3213         }
3214
3215         /*
3216          * If we had an absolute estimate, use that.
3217          */
3218         if (stadistinct > 0.0)
3219                 return stadistinct;
3220
3221         /*
3222          * Otherwise we need to get the relation size; punt if not available.
3223          */
3224         if (vardata->rel == NULL)
3225                 return DEFAULT_NUM_DISTINCT;
3226         ntuples = vardata->rel->tuples;
3227         if (ntuples <= 0.0)
3228                 return DEFAULT_NUM_DISTINCT;
3229
3230         /*
3231          * If we had a relative estimate, use that.
3232          */
3233         if (stadistinct < 0.0)
3234                 return floor((-stadistinct * ntuples) + 0.5);
3235
3236         /*
3237          * With no data, estimate ndistinct = ntuples if the table is small,
3238          * else use default.
3239          */
3240         if (ntuples < DEFAULT_NUM_DISTINCT)
3241                 return ntuples;
3242
3243         return DEFAULT_NUM_DISTINCT;
3244 }
3245
3246 /*
3247  * get_variable_maximum
3248  *              Estimate the maximum value of the specified variable.
3249  *              If successful, store value in *max and return TRUE.
3250  *              If no data available, return FALSE.
3251  *
3252  * sortop is the "<" comparison operator to use.  (To extract the
3253  * minimum instead of the maximum, just pass the ">" operator instead.)
3254  */
3255 static bool
3256 get_variable_maximum(PlannerInfo *root, VariableStatData *vardata,
3257                                          Oid sortop, Datum *max)
3258 {
3259         Datum           tmax = 0;
3260         bool            have_max = false;
3261         Form_pg_statistic stats;
3262         int16           typLen;
3263         bool            typByVal;
3264         Datum      *values;
3265         int                     nvalues;
3266         int                     i;
3267
3268         if (!HeapTupleIsValid(vardata->statsTuple))
3269         {
3270                 /* no stats available, so default result */
3271                 return false;
3272         }
3273         stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
3274
3275         get_typlenbyval(vardata->atttype, &typLen, &typByVal);
3276
3277         /*
3278          * If there is a histogram, grab the last or first value as
3279          * appropriate.
3280          *
3281          * If there is a histogram that is sorted with some other operator than
3282          * the one we want, fail --- this suggests that there is data we can't
3283          * use.
3284          */
3285         if (get_attstatsslot(vardata->statsTuple,
3286                                                  vardata->atttype, vardata->atttypmod,
3287                                                  STATISTIC_KIND_HISTOGRAM, sortop,
3288                                                  &values, &nvalues,
3289                                                  NULL, NULL))
3290         {
3291                 if (nvalues > 0)
3292                 {
3293                         tmax = datumCopy(values[nvalues - 1], typByVal, typLen);
3294                         have_max = true;
3295                 }
3296                 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
3297         }
3298         else
3299         {
3300                 Oid                     rsortop = get_commutator(sortop);
3301
3302                 if (OidIsValid(rsortop) &&
3303                         get_attstatsslot(vardata->statsTuple,
3304                                                          vardata->atttype, vardata->atttypmod,
3305                                                          STATISTIC_KIND_HISTOGRAM, rsortop,
3306                                                          &values, &nvalues,
3307                                                          NULL, NULL))
3308                 {
3309                         if (nvalues > 0)
3310                         {
3311                                 tmax = datumCopy(values[0], typByVal, typLen);
3312                                 have_max = true;
3313                         }
3314                         free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
3315                 }
3316                 else if (get_attstatsslot(vardata->statsTuple,
3317                                                                   vardata->atttype, vardata->atttypmod,
3318                                                                   STATISTIC_KIND_HISTOGRAM, InvalidOid,
3319                                                                   &values, &nvalues,
3320                                                                   NULL, NULL))
3321                 {
3322                         free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
3323                         return false;
3324                 }
3325         }
3326
3327         /*
3328          * If we have most-common-values info, look for a large MCV.  This is
3329          * needed even if we also have a histogram, since the histogram
3330          * excludes the MCVs.  However, usually the MCVs will not be the
3331          * extreme values, so avoid unnecessary data copying.
3332          */
3333         if (get_attstatsslot(vardata->statsTuple,
3334                                                  vardata->atttype, vardata->atttypmod,
3335                                                  STATISTIC_KIND_MCV, InvalidOid,
3336                                                  &values, &nvalues,
3337                                                  NULL, NULL))
3338         {
3339                 bool            large_mcv = false;
3340                 FmgrInfo        opproc;
3341
3342                 fmgr_info(get_opcode(sortop), &opproc);
3343
3344                 for (i = 0; i < nvalues; i++)
3345                 {
3346                         if (!have_max)
3347                         {
3348                                 tmax = values[i];
3349                                 large_mcv = have_max = true;
3350                         }
3351                         else if (DatumGetBool(FunctionCall2(&opproc, tmax, values[i])))
3352                         {
3353                                 tmax = values[i];
3354                                 large_mcv = true;
3355                         }
3356                 }
3357                 if (large_mcv)
3358                         tmax = datumCopy(tmax, typByVal, typLen);
3359                 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
3360         }
3361
3362         *max = tmax;
3363         return have_max;
3364 }
3365
3366
3367 /*-------------------------------------------------------------------------
3368  *
3369  * Pattern analysis functions
3370  *
3371  * These routines support analysis of LIKE and regular-expression patterns
3372  * by the planner/optimizer.  It's important that they agree with the
3373  * regular-expression code in backend/regex/ and the LIKE code in
3374  * backend/utils/adt/like.c.
3375  *
3376  * Note that the prefix-analysis functions are called from
3377  * backend/optimizer/path/indxpath.c as well as from routines in this file.
3378  *
3379  *-------------------------------------------------------------------------
3380  */
3381
3382 /*
3383  * Extract the fixed prefix, if any, for a pattern.
3384  *
3385  * *prefix is set to a palloc'd prefix string (in the form of a Const node),
3386  *      or to NULL if no fixed prefix exists for the pattern.
3387  * *rest is set to a palloc'd Const representing the remainder of the pattern
3388  *      after the portion describing the fixed prefix.
3389  * Each of these has the same type (TEXT or BYTEA) as the given pattern Const.
3390  *
3391  * The return value distinguishes no fixed prefix, a partial prefix,
3392  * or an exact-match-only pattern.
3393  */
3394
3395 static Pattern_Prefix_Status
3396 like_fixed_prefix(Const *patt_const, bool case_insensitive,
3397                                   Const **prefix_const, Const **rest_const)
3398 {
3399         char       *match;
3400         char       *patt;
3401         int                     pattlen;
3402         char       *rest;
3403         Oid                     typeid = patt_const->consttype;
3404         int                     pos,
3405                                 match_pos;
3406
3407         /* the right-hand const is type text or bytea */
3408         Assert(typeid == BYTEAOID || typeid == TEXTOID);
3409
3410         if (typeid == BYTEAOID && case_insensitive)
3411                 ereport(ERROR,
3412                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
3413                 errmsg("case insensitive matching not supported on type bytea")));
3414
3415         if (typeid != BYTEAOID)
3416         {
3417                 patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
3418                 pattlen = strlen(patt);
3419         }
3420         else
3421         {
3422                 bytea      *bstr = DatumGetByteaP(patt_const->constvalue);
3423
3424                 pattlen = VARSIZE(bstr) - VARHDRSZ;
3425                 if (pattlen > 0)
3426                 {
3427                         patt = (char *) palloc(pattlen);
3428                         memcpy(patt, VARDATA(bstr), pattlen);
3429                 }
3430                 else
3431                         patt = NULL;
3432
3433                 if ((Pointer) bstr != DatumGetPointer(patt_const->constvalue))
3434                         pfree(bstr);
3435         }
3436
3437         match = palloc(pattlen + 1);
3438         match_pos = 0;
3439         for (pos = 0; pos < pattlen; pos++)
3440         {
3441                 /* % and _ are wildcard characters in LIKE */
3442                 if (patt[pos] == '%' ||
3443                         patt[pos] == '_')
3444                         break;
3445
3446                 /* Backslash escapes the next character */
3447                 if (patt[pos] == '\\')
3448                 {
3449                         pos++;
3450                         if (patt[pos] == '\0' && typeid != BYTEAOID)
3451                                 break;
3452                 }
3453
3454                 /*
3455                  * XXX I suspect isalpha() is not an adequately locale-sensitive
3456                  * test for characters that can vary under case folding?
3457                  */
3458                 if (case_insensitive && isalpha((unsigned char) patt[pos]))
3459                         break;
3460
3461                 /*
3462                  * NOTE: this code used to think that %% meant a literal %, but
3463                  * textlike() itself does not think that, and the SQL92 spec
3464                  * doesn't say any such thing either.
3465                  */
3466                 match[match_pos++] = patt[pos];
3467         }
3468
3469         match[match_pos] = '\0';
3470         rest = &patt[pos];
3471
3472         if (typeid != BYTEAOID)
3473         {
3474                 *prefix_const = string_to_const(match, typeid);
3475                 *rest_const = string_to_const(rest, typeid);
3476         }
3477         else
3478         {
3479                 *prefix_const = string_to_bytea_const(match, match_pos);
3480                 *rest_const = string_to_bytea_const(rest, pattlen - pos);
3481         }
3482
3483         if (patt != NULL)
3484                 pfree(patt);
3485         pfree(match);
3486
3487         /* in LIKE, an empty pattern is an exact match! */
3488         if (pos == pattlen)
3489                 return Pattern_Prefix_Exact;    /* reached end of pattern, so
3490                                                                                  * exact */
3491
3492         if (match_pos > 0)
3493                 return Pattern_Prefix_Partial;
3494
3495         return Pattern_Prefix_None;
3496 }
3497
3498 static Pattern_Prefix_Status
3499 regex_fixed_prefix(Const *patt_const, bool case_insensitive,
3500                                    Const **prefix_const, Const **rest_const)
3501 {
3502         char       *match;
3503         int                     pos,
3504                                 match_pos,
3505                                 prev_pos,
3506                                 prev_match_pos,
3507                                 paren_depth;
3508         char       *patt;
3509         char       *rest;
3510         Oid                     typeid = patt_const->consttype;
3511
3512         /*
3513          * Should be unnecessary, there are no bytea regex operators defined.
3514          * As such, it should be noted that the rest of this function has *not*
3515          * been made safe for binary (possibly NULL containing) strings.
3516          */
3517         if (typeid == BYTEAOID)
3518                 ereport(ERROR,
3519                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
3520                                  errmsg("regular-expression matching not supported on type bytea")));
3521
3522         /* the right-hand const is type text for all of these */
3523         patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
3524
3525         /* Pattern must be anchored left */
3526         if (patt[0] != '^')
3527         {
3528                 rest = patt;
3529
3530                 *prefix_const = NULL;
3531                 *rest_const = string_to_const(rest, typeid);
3532
3533                 return Pattern_Prefix_None;
3534         }
3535
3536         /*
3537          * If unquoted | is present at paren level 0 in pattern, then there
3538          * are multiple alternatives for the start of the string.
3539          */
3540         paren_depth = 0;
3541         for (pos = 1; patt[pos]; pos++)
3542         {
3543                 if (patt[pos] == '|' && paren_depth == 0)
3544                 {
3545                         rest = patt;
3546
3547                         *prefix_const = NULL;
3548                         *rest_const = string_to_const(rest, typeid);
3549
3550                         return Pattern_Prefix_None;
3551                 }
3552                 else if (patt[pos] == '(')
3553                         paren_depth++;
3554                 else if (patt[pos] == ')' && paren_depth > 0)
3555                         paren_depth--;
3556                 else if (patt[pos] == '\\')
3557                 {
3558                         /* backslash quotes the next character */
3559                         pos++;
3560                         if (patt[pos] == '\0')
3561                                 break;
3562                 }
3563         }
3564
3565         /* OK, allocate space for pattern */
3566         match = palloc(strlen(patt) + 1);
3567         prev_match_pos = match_pos = 0;
3568
3569         /* note start at pos 1 to skip leading ^ */
3570         for (prev_pos = pos = 1; patt[pos]; )
3571         {
3572                 int             len;
3573
3574                 /*
3575                  * Check for characters that indicate multiple possible matches
3576                  * here. XXX I suspect isalpha() is not an adequately
3577                  * locale-sensitive test for characters that can vary under case
3578                  * folding?
3579                  */
3580                 if (patt[pos] == '.' ||
3581                         patt[pos] == '(' ||
3582                         patt[pos] == '[' ||
3583                         patt[pos] == '$' ||
3584                         (case_insensitive && isalpha((unsigned char) patt[pos])))
3585                         break;
3586
3587                 /*
3588                  * In AREs, backslash followed by alphanumeric is an escape, not
3589                  * a quoted character.  Must treat it as having multiple possible
3590                  * matches.
3591                  */
3592                 if (patt[pos] == '\\' && isalnum((unsigned char) patt[pos + 1]))
3593                         break;
3594
3595                 /*
3596                  * Check for quantifiers.  Except for +, this means the preceding
3597                  * character is optional, so we must remove it from the prefix
3598                  * too!
3599                  */
3600                 if (patt[pos] == '*' ||
3601                         patt[pos] == '?' ||
3602                         patt[pos] == '{')
3603                 {
3604                         match_pos = prev_match_pos;
3605                         pos = prev_pos;
3606                         break;
3607                 }
3608                 if (patt[pos] == '+')
3609                 {
3610                         pos = prev_pos;
3611                         break;
3612                 }
3613                 if (patt[pos] == '\\')
3614                 {
3615                         /* backslash quotes the next character */
3616                         pos++;
3617                         if (patt[pos] == '\0')
3618                                 break;
3619                 }
3620                 /* save position in case we need to back up on next loop cycle */
3621                 prev_match_pos = match_pos;
3622                 prev_pos = pos;
3623                 /* must use encoding-aware processing here */
3624                 len = pg_mblen(&patt[pos]);
3625                 memcpy(&match[match_pos], &patt[pos], len);
3626                 match_pos += len;
3627                 pos += len;
3628         }
3629
3630         match[match_pos] = '\0';
3631         rest = &patt[pos];
3632
3633         if (patt[pos] == '$' && patt[pos + 1] == '\0')
3634         {
3635                 rest = &patt[pos + 1];
3636
3637                 *prefix_const = string_to_const(match, typeid);
3638                 *rest_const = string_to_const(rest, typeid);
3639
3640                 pfree(patt);
3641                 pfree(match);
3642
3643                 return Pattern_Prefix_Exact;    /* pattern specifies exact match */
3644         }
3645
3646         *prefix_const = string_to_const(match, typeid);
3647         *rest_const = string_to_const(rest, typeid);
3648
3649         pfree(patt);
3650         pfree(match);
3651
3652         if (match_pos > 0)
3653                 return Pattern_Prefix_Partial;
3654
3655         return Pattern_Prefix_None;
3656 }
3657
3658 Pattern_Prefix_Status
3659 pattern_fixed_prefix(Const *patt, Pattern_Type ptype,
3660                                          Const **prefix, Const **rest)
3661 {
3662         Pattern_Prefix_Status result;
3663
3664         switch (ptype)
3665         {
3666                 case Pattern_Type_Like:
3667                         result = like_fixed_prefix(patt, false, prefix, rest);
3668                         break;
3669                 case Pattern_Type_Like_IC:
3670                         result = like_fixed_prefix(patt, true, prefix, rest);
3671                         break;
3672                 case Pattern_Type_Regex:
3673                         result = regex_fixed_prefix(patt, false, prefix, rest);
3674                         break;
3675                 case Pattern_Type_Regex_IC:
3676                         result = regex_fixed_prefix(patt, true, prefix, rest);
3677                         break;
3678                 default:
3679                         elog(ERROR, "unrecognized ptype: %d", (int) ptype);
3680                         result = Pattern_Prefix_None;           /* keep compiler quiet */
3681                         break;
3682         }
3683         return result;
3684 }
3685
3686 /*
3687  * Estimate the selectivity of a fixed prefix for a pattern match.
3688  *
3689  * A fixed prefix "foo" is estimated as the selectivity of the expression
3690  * "variable >= 'foo' AND variable < 'fop'" (see also indxpath.c).
3691  *
3692  * We use the >= and < operators from the specified btree opclass to do the
3693  * estimation.  The given variable and Const must be of the associated
3694  * datatype.
3695  *
3696  * XXX Note: we make use of the upper bound to estimate operator selectivity
3697  * even if the locale is such that we cannot rely on the upper-bound string.
3698  * The selectivity only needs to be approximately right anyway, so it seems
3699  * more useful to use the upper-bound code than not.
3700  */
3701 static Selectivity
3702 prefix_selectivity(PlannerInfo *root, Node *variable,
3703                                    Oid opclass, Const *prefixcon)
3704 {
3705         Selectivity prefixsel;
3706         Oid                     cmpopr;
3707         List       *cmpargs;
3708         Const      *greaterstrcon;
3709
3710         cmpopr = get_opclass_member(opclass, InvalidOid,
3711                                                                 BTGreaterEqualStrategyNumber);
3712         if (cmpopr == InvalidOid)
3713                 elog(ERROR, "no >= operator for opclass %u", opclass);
3714         cmpargs = list_make2(variable, prefixcon);
3715         /* Assume scalargtsel is appropriate for all supported types */
3716         prefixsel = DatumGetFloat8(DirectFunctionCall4(scalargtsel,
3717                                                                                                    PointerGetDatum(root),
3718                                                                                                 ObjectIdGetDatum(cmpopr),
3719                                                                                                 PointerGetDatum(cmpargs),
3720                                                                                                    Int32GetDatum(0)));
3721
3722         /*-------
3723          * If we can create a string larger than the prefix, say
3724          *      "x < greaterstr".
3725          *-------
3726          */
3727         greaterstrcon = make_greater_string(prefixcon);
3728         if (greaterstrcon)
3729         {
3730                 Selectivity topsel;
3731
3732                 cmpopr = get_opclass_member(opclass, InvalidOid,
3733                                                                         BTLessStrategyNumber);
3734                 if (cmpopr == InvalidOid)
3735                         elog(ERROR, "no < operator for opclass %u", opclass);
3736                 cmpargs = list_make2(variable, greaterstrcon);
3737                 /* Assume scalarltsel is appropriate for all supported types */
3738                 topsel = DatumGetFloat8(DirectFunctionCall4(scalarltsel,
3739                                                                                                         PointerGetDatum(root),
3740                                                                                                 ObjectIdGetDatum(cmpopr),
3741                                                                                                 PointerGetDatum(cmpargs),
3742                                                                                                         Int32GetDatum(0)));
3743
3744                 /*
3745                  * Merge the two selectivities in the same way as for a range
3746                  * query (see clauselist_selectivity()).
3747                  */
3748                 prefixsel = topsel + prefixsel - 1.0;
3749
3750                 /* Adjust for double-exclusion of NULLs */
3751                 prefixsel += nulltestsel(root, IS_NULL, variable, 0);
3752
3753                 /*
3754                  * A zero or slightly negative prefixsel should be converted into
3755                  * a small positive value; we probably are dealing with a very
3756                  * tight range and got a bogus result due to roundoff errors.
3757                  * However, if prefixsel is very negative, then we probably have
3758                  * default selectivity estimates on one or both sides of the
3759                  * range.  In that case, insert a not-so-wildly-optimistic default
3760                  * estimate.
3761                  */
3762                 if (prefixsel <= 0.0)
3763                 {
3764                         if (prefixsel < -0.01)
3765                         {
3766                                 /*
3767                                  * No data available --- use a default estimate that is
3768                                  * small, but not real small.
3769                                  */
3770                                 prefixsel = 0.005;
3771                         }
3772                         else
3773                         {
3774                                 /*
3775                                  * It's just roundoff error; use a small positive value
3776                                  */
3777                                 prefixsel = 1.0e-10;
3778                         }
3779                 }
3780         }
3781
3782         return prefixsel;
3783 }
3784
3785
3786 /*
3787  * Estimate the selectivity of a pattern of the specified type.
3788  * Note that any fixed prefix of the pattern will have been removed already.
3789  *
3790  * For now, we use a very simplistic approach: fixed characters reduce the
3791  * selectivity a good deal, character ranges reduce it a little,
3792  * wildcards (such as % for LIKE or .* for regex) increase it.
3793  */
3794
3795 #define FIXED_CHAR_SEL  0.20    /* about 1/5 */
3796 #define CHAR_RANGE_SEL  0.25
3797 #define ANY_CHAR_SEL    0.9             /* not 1, since it won't match
3798                                                                  * end-of-string */
3799 #define FULL_WILDCARD_SEL 5.0
3800 #define PARTIAL_WILDCARD_SEL 2.0
3801
3802 static Selectivity
3803 like_selectivity(Const *patt_const, bool case_insensitive)
3804 {
3805         Selectivity sel = 1.0;
3806         int                     pos;
3807         int                     start;
3808         Oid                     typeid = patt_const->consttype;
3809         char       *patt;
3810         int                     pattlen;
3811
3812         /* the right-hand const is type text or bytea */
3813         Assert(typeid == BYTEAOID || typeid == TEXTOID);
3814
3815         if (typeid == BYTEAOID && case_insensitive)
3816                 ereport(ERROR,
3817                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
3818                 errmsg("case insensitive matching not supported on type bytea")));
3819
3820         if (typeid != BYTEAOID)
3821         {
3822                 patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
3823                 pattlen = strlen(patt);
3824         }
3825         else
3826         {
3827                 bytea      *bstr = DatumGetByteaP(patt_const->constvalue);
3828
3829                 pattlen = VARSIZE(bstr) - VARHDRSZ;
3830                 if (pattlen > 0)
3831                 {
3832                         patt = (char *) palloc(pattlen);
3833                         memcpy(patt, VARDATA(bstr), pattlen);
3834                 }
3835                 else
3836                         patt = NULL;
3837
3838                 if ((Pointer) bstr != DatumGetPointer(patt_const->constvalue))
3839                         pfree(bstr);
3840         }
3841         /* patt should never be NULL in practice */
3842         Assert(patt != NULL);
3843
3844         /* Skip any leading %; it's already factored into initial sel */
3845         start = (*patt == '%') ? 1 : 0;
3846         for (pos = start; pos < pattlen; pos++)
3847         {
3848                 /* % and _ are wildcard characters in LIKE */
3849                 if (patt[pos] == '%')
3850                         sel *= FULL_WILDCARD_SEL;
3851                 else if (patt[pos] == '_')
3852                         sel *= ANY_CHAR_SEL;
3853                 else if (patt[pos] == '\\')
3854                 {
3855                         /* Backslash quotes the next character */
3856                         pos++;
3857                         if (patt[pos] == '\0' && typeid != BYTEAOID)
3858                                 break;
3859                         sel *= FIXED_CHAR_SEL;
3860                 }
3861                 else
3862                         sel *= FIXED_CHAR_SEL;
3863         }
3864         /* Could get sel > 1 if multiple wildcards */
3865         if (sel > 1.0)
3866                 sel = 1.0;
3867         return sel;
3868 }
3869
3870 static Selectivity
3871 regex_selectivity_sub(char *patt, int pattlen, bool case_insensitive)
3872 {
3873         Selectivity sel = 1.0;
3874         int                     paren_depth = 0;
3875         int                     paren_pos = 0;  /* dummy init to keep compiler quiet */
3876         int                     pos;
3877
3878         for (pos = 0; pos < pattlen; pos++)
3879         {
3880                 if (patt[pos] == '(')
3881                 {
3882                         if (paren_depth == 0)
3883                                 paren_pos = pos;        /* remember start of parenthesized item */
3884                         paren_depth++;
3885                 }
3886                 else if (patt[pos] == ')' && paren_depth > 0)
3887                 {
3888                         paren_depth--;
3889                         if (paren_depth == 0)
3890                                 sel *= regex_selectivity_sub(patt + (paren_pos + 1),
3891                                                                                          pos - (paren_pos + 1),
3892                                                                                          case_insensitive);
3893                 }
3894                 else if (patt[pos] == '|' && paren_depth == 0)
3895                 {
3896                         /*
3897                          * If unquoted | is present at paren level 0 in pattern, we
3898                          * have multiple alternatives; sum their probabilities.
3899                          */
3900                         sel += regex_selectivity_sub(patt + (pos + 1),
3901                                                                                  pattlen - (pos + 1),
3902                                                                                  case_insensitive);
3903                         break;                          /* rest of pattern is now processed */
3904                 }
3905                 else if (patt[pos] == '[')
3906                 {
3907                         bool            negclass = false;
3908
3909                         if (patt[++pos] == '^')
3910                         {
3911                                 negclass = true;
3912                                 pos++;
3913                         }
3914                         if (patt[pos] == ']')           /* ']' at start of class is not
3915                                                                                  * special */
3916                                 pos++;
3917                         while (pos < pattlen && patt[pos] != ']')
3918                                 pos++;
3919                         if (paren_depth == 0)
3920                                 sel *= (negclass ? (1.0 - CHAR_RANGE_SEL) : CHAR_RANGE_SEL);
3921                 }
3922                 else if (patt[pos] == '.')
3923                 {
3924                         if (paren_depth == 0)
3925                                 sel *= ANY_CHAR_SEL;
3926                 }
3927                 else if (patt[pos] == '*' ||
3928                                  patt[pos] == '?' ||
3929                                  patt[pos] == '+')
3930                 {
3931                         /* Ought to be smarter about quantifiers... */
3932                         if (paren_depth == 0)
3933                                 sel *= PARTIAL_WILDCARD_SEL;
3934                 }
3935                 else if (patt[pos] == '{')
3936                 {
3937                         while (pos < pattlen && patt[pos] != '}')
3938                                 pos++;
3939                         if (paren_depth == 0)
3940                                 sel *= PARTIAL_WILDCARD_SEL;
3941                 }
3942                 else if (patt[pos] == '\\')
3943                 {
3944                         /* backslash quotes the next character */
3945                         pos++;
3946                         if (pos >= pattlen)
3947                                 break;
3948                         if (paren_depth == 0)
3949                                 sel *= FIXED_CHAR_SEL;
3950                 }
3951                 else
3952                 {
3953                         if (paren_depth == 0)
3954                                 sel *= FIXED_CHAR_SEL;
3955                 }
3956         }
3957         /* Could get sel > 1 if multiple wildcards */
3958         if (sel > 1.0)
3959                 sel = 1.0;
3960         return sel;
3961 }
3962
3963 static Selectivity
3964 regex_selectivity(Const *patt_const, bool case_insensitive)
3965 {
3966         Selectivity sel;
3967         char       *patt;
3968         int                     pattlen;
3969         Oid                     typeid = patt_const->consttype;
3970
3971         /*
3972          * Should be unnecessary, there are no bytea regex operators defined.
3973          * As such, it should be noted that the rest of this function has *not*
3974          * been made safe for binary (possibly NULL containing) strings.
3975          */
3976         if (typeid == BYTEAOID)
3977                 ereport(ERROR,
3978                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
3979                                  errmsg("regular-expression matching not supported on type bytea")));
3980
3981         /* the right-hand const is type text for all of these */
3982         patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
3983         pattlen = strlen(patt);
3984
3985         /* If patt doesn't end with $, consider it to have a trailing wildcard */
3986         if (pattlen > 0 && patt[pattlen - 1] == '$' &&
3987                 (pattlen == 1 || patt[pattlen - 2] != '\\'))
3988         {
3989                 /* has trailing $ */
3990                 sel = regex_selectivity_sub(patt, pattlen - 1, case_insensitive);
3991         }
3992         else
3993         {
3994                 /* no trailing $ */
3995                 sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
3996                 sel *= FULL_WILDCARD_SEL;
3997                 if (sel > 1.0)
3998                         sel = 1.0;
3999         }
4000         return sel;
4001 }
4002
4003 static Selectivity
4004 pattern_selectivity(Const *patt, Pattern_Type ptype)
4005 {
4006         Selectivity result;
4007
4008         switch (ptype)
4009         {
4010                 case Pattern_Type_Like:
4011                         result = like_selectivity(patt, false);
4012                         break;
4013                 case Pattern_Type_Like_IC:
4014                         result = like_selectivity(patt, true);
4015                         break;
4016                 case Pattern_Type_Regex:
4017                         result = regex_selectivity(patt, false);
4018                         break;
4019                 case Pattern_Type_Regex_IC:
4020                         result = regex_selectivity(patt, true);
4021                         break;
4022                 default:
4023                         elog(ERROR, "unrecognized ptype: %d", (int) ptype);
4024                         result = 1.0;           /* keep compiler quiet */
4025                         break;
4026         }
4027         return result;
4028 }
4029
4030
4031 /*
4032  * Try to generate a string greater than the given string or any
4033  * string it is a prefix of.  If successful, return a palloc'd string
4034  * in the form of a Const pointer; else return NULL.
4035  *
4036  * The key requirement here is that given a prefix string, say "foo",
4037  * we must be able to generate another string "fop" that is greater
4038  * than all strings "foobar" starting with "foo".
4039  *
4040  * If we max out the righthand byte, truncate off the last character
4041  * and start incrementing the next.  For example, if "z" were the last
4042  * character in the sort order, then we could produce "foo" as a
4043  * string greater than "fonz".
4044  *
4045  * This could be rather slow in the worst case, but in most cases we
4046  * won't have to try more than one or two strings before succeeding.
4047  *
4048  * NOTE: at present this assumes we are in the C locale, so that simple
4049  * bytewise comparison applies.  However, we might be in a multibyte
4050  * encoding such as UTF8, so we do have to watch out for generating
4051  * invalid encoding sequences.
4052  */
4053 Const *
4054 make_greater_string(const Const *str_const)
4055 {
4056         Oid                     datatype = str_const->consttype;
4057         char       *workstr;
4058         int                     len;
4059
4060         /* Get the string and a modifiable copy */
4061         if (datatype == NAMEOID)
4062         {
4063                 workstr = DatumGetCString(DirectFunctionCall1(nameout,
4064                                                                                                  str_const->constvalue));
4065                 len = strlen(workstr);
4066         }
4067         else if (datatype == BYTEAOID)
4068         {
4069                 bytea      *bstr = DatumGetByteaP(str_const->constvalue);
4070
4071                 len = VARSIZE(bstr) - VARHDRSZ;
4072                 if (len > 0)
4073                 {
4074                         workstr = (char *) palloc(len);
4075                         memcpy(workstr, VARDATA(bstr), len);
4076                 }
4077                 else
4078                         workstr = NULL;
4079
4080                 if ((Pointer) bstr != DatumGetPointer(str_const->constvalue))
4081                         pfree(bstr);
4082         }
4083         else
4084         {
4085                 workstr = DatumGetCString(DirectFunctionCall1(textout,
4086                                                                                                  str_const->constvalue));
4087                 len = strlen(workstr);
4088         }
4089
4090         while (len > 0)
4091         {
4092                 unsigned char *lastchar = (unsigned char *) (workstr + len - 1);
4093                 unsigned char savelastchar = *lastchar;
4094
4095                 /*
4096                  * Try to generate a larger string by incrementing the last byte.
4097                  */
4098                 while (*lastchar < (unsigned char) 255)
4099                 {
4100                         Const      *workstr_const;
4101
4102                         (*lastchar)++;
4103
4104                         if (datatype != BYTEAOID)
4105                         {
4106                                 /* do not generate invalid encoding sequences */
4107                                 if (!pg_verifymbstr((const unsigned char *) workstr,
4108                                                                         len, true))
4109                                         continue;
4110                                 workstr_const = string_to_const(workstr, datatype);
4111                         }
4112                         else
4113                                 workstr_const = string_to_bytea_const(workstr, len);
4114
4115                         pfree(workstr);
4116                         return workstr_const;
4117                 }
4118
4119                 /* restore last byte so we don't confuse pg_mbcliplen */
4120                 *lastchar = savelastchar;
4121
4122                 /*
4123                  * Truncate off the last character, which might be more than 1
4124                  * byte, depending on the character encoding.
4125                  */
4126                 if (datatype != BYTEAOID && pg_database_encoding_max_length() > 1)
4127                         len = pg_mbcliplen((const unsigned char *) workstr, len, len - 1);
4128                 else
4129                         len -= 1;
4130
4131                 if (datatype != BYTEAOID)
4132                         workstr[len] = '\0';
4133         }
4134
4135         /* Failed... */
4136         if (workstr != NULL)
4137                 pfree(workstr);
4138
4139         return NULL;
4140 }
4141
4142 /*
4143  * Generate a Datum of the appropriate type from a C string.
4144  * Note that all of the supported types are pass-by-ref, so the
4145  * returned value should be pfree'd if no longer needed.
4146  */
4147 static Datum
4148 string_to_datum(const char *str, Oid datatype)
4149 {
4150         Assert(str != NULL);
4151
4152         /*
4153          * We cheat a little by assuming that textin() will do for bpchar and
4154          * varchar constants too...
4155          */
4156         if (datatype == NAMEOID)
4157                 return DirectFunctionCall1(namein, CStringGetDatum(str));
4158         else if (datatype == BYTEAOID)
4159                 return DirectFunctionCall1(byteain, CStringGetDatum(str));
4160         else
4161                 return DirectFunctionCall1(textin, CStringGetDatum(str));
4162 }
4163
4164 /*
4165  * Generate a Const node of the appropriate type from a C string.
4166  */
4167 static Const *
4168 string_to_const(const char *str, Oid datatype)
4169 {
4170         Datum           conval = string_to_datum(str, datatype);
4171
4172         return makeConst(datatype, ((datatype == NAMEOID) ? NAMEDATALEN : -1),
4173                                          conval, false, false);
4174 }
4175
4176 /*
4177  * Generate a Const node of bytea type from a binary C string and a length.
4178  */
4179 static Const *
4180 string_to_bytea_const(const char *str, size_t str_len)
4181 {
4182         bytea      *bstr = palloc(VARHDRSZ + str_len);
4183         Datum           conval;
4184
4185         memcpy(VARDATA(bstr), str, str_len);
4186         VARATT_SIZEP(bstr) = VARHDRSZ + str_len;
4187         conval = PointerGetDatum(bstr);
4188
4189         return makeConst(BYTEAOID, -1, conval, false, false);
4190 }
4191
4192 /*-------------------------------------------------------------------------
4193  *
4194  * Index cost estimation functions
4195  *
4196  * genericcostestimate is a general-purpose estimator for use when we
4197  * don't have any better idea about how to estimate.  Index-type-specific
4198  * knowledge can be incorporated in the type-specific routines.
4199  *
4200  * One bit of index-type-specific knowledge we can relatively easily use
4201  * in genericcostestimate is the estimate of the number of index tuples
4202  * visited.  If numIndexTuples is not 0 then it is used as the estimate,
4203  * otherwise we compute a generic estimate.
4204  *
4205  *-------------------------------------------------------------------------
4206  */
4207
4208 static void
4209 genericcostestimate(PlannerInfo *root,
4210                                         IndexOptInfo *index, List *indexQuals,
4211                                         double numIndexTuples,
4212                                         Cost *indexStartupCost,
4213                                         Cost *indexTotalCost,
4214                                         Selectivity *indexSelectivity,
4215                                         double *indexCorrelation)
4216 {
4217         double          numIndexPages;
4218         QualCost        index_qual_cost;
4219         double          qual_op_cost;
4220         double          qual_arg_cost;
4221         List       *selectivityQuals;
4222
4223         /*
4224          * If the index is partial, AND the index predicate with the
4225          * explicitly given indexquals to produce a more accurate idea of the
4226          * index selectivity.  This may produce redundant clauses.      We get rid
4227          * of exact duplicates in the code below.  We expect that most cases
4228          * of partial redundancy (such as "x < 4" from the qual and "x < 5"
4229          * from the predicate) will be recognized and handled correctly by
4230          * clauselist_selectivity().  This assumption is somewhat fragile,
4231          * since it depends on predicate_implied_by() and clauselist_selectivity()
4232          * having similar capabilities, and there are certainly many cases where
4233          * we will end up with a too-low selectivity estimate.  This will bias the
4234          * system in favor of using partial indexes where possible, which is not
4235          * necessarily a bad thing. But it'd be nice to do better someday.
4236          *
4237          * Note that index->indpred and indexQuals are both in implicit-AND form,
4238          * so ANDing them together just takes merging the lists.  However,
4239          * eliminating duplicates is a bit trickier because indexQuals
4240          * contains RestrictInfo nodes and the indpred does not.  It is okay
4241          * to pass a mixed list to clauselist_selectivity, but we have to work
4242          * a bit to generate a list without logical duplicates.  (We could
4243          * just list_union indpred and strippedQuals, but then we'd not get
4244          * caching of per-qual selectivity estimates.)
4245          */
4246         if (index->indpred != NIL)
4247         {
4248                 List       *strippedQuals;
4249                 List       *predExtraQuals;
4250
4251                 strippedQuals = get_actual_clauses(indexQuals);
4252                 predExtraQuals = list_difference(index->indpred, strippedQuals);
4253                 selectivityQuals = list_concat(predExtraQuals, indexQuals);
4254         }
4255         else
4256                 selectivityQuals = indexQuals;
4257
4258         /* Estimate the fraction of main-table tuples that will be visited */
4259         *indexSelectivity = clauselist_selectivity(root, selectivityQuals,
4260                                                                                            index->rel->relid,
4261                                                                                            JOIN_INNER);
4262
4263         /*
4264          * If caller didn't give us an estimate, estimate the number of index
4265          * tuples that will be visited.  We do it in this rather peculiar-looking
4266          * way in order to get the right answer for partial indexes.
4267          */
4268         if (numIndexTuples <= 0.0)
4269                 numIndexTuples = *indexSelectivity * index->rel->tuples;
4270
4271         /*
4272          * We can bound the number of tuples by the index size in any case.
4273          * Also, always estimate at least one tuple is touched, even when
4274          * indexSelectivity estimate is tiny.
4275          */
4276         if (numIndexTuples > index->tuples)
4277                 numIndexTuples = index->tuples;
4278         if (numIndexTuples < 1.0)
4279                 numIndexTuples = 1.0;
4280
4281         /*
4282          * Estimate the number of index pages that will be retrieved.
4283          *
4284          * For all currently-supported index types, the first page of the index
4285          * is a metadata page, and we should figure on fetching that plus a
4286          * pro-rated fraction of the remaining pages.
4287          */
4288         if (index->pages > 1 && index->tuples > 0)
4289         {
4290                 numIndexPages = (numIndexTuples / index->tuples) * (index->pages - 1);
4291                 numIndexPages += 1;             /* count the metapage too */
4292                 numIndexPages = ceil(numIndexPages);
4293         }
4294         else
4295                 numIndexPages = 1.0;
4296
4297         /*
4298          * Compute the index access cost.
4299          *
4300          * Disk cost: our generic assumption is that the index pages will be read
4301          * sequentially, so they have cost 1.0 each, not random_page_cost.
4302          */
4303         *indexTotalCost = numIndexPages;
4304
4305         /*
4306          * CPU cost: any complex expressions in the indexquals will need to be
4307          * evaluated once at the start of the scan to reduce them to runtime
4308          * keys to pass to the index AM (see nodeIndexscan.c).  We model the
4309          * per-tuple CPU costs as cpu_index_tuple_cost plus one
4310          * cpu_operator_cost per indexqual operator.
4311          *
4312          * Note: this neglects the possible costs of rechecking lossy operators
4313          * and OR-clause expressions.  Detecting that that might be needed
4314          * seems more expensive than it's worth, though, considering all the
4315          * other inaccuracies here ...
4316          */
4317         cost_qual_eval(&index_qual_cost, indexQuals);
4318         qual_op_cost = cpu_operator_cost * list_length(indexQuals);
4319         qual_arg_cost = index_qual_cost.startup +
4320                 index_qual_cost.per_tuple - qual_op_cost;
4321         if (qual_arg_cost < 0)          /* just in case... */
4322                 qual_arg_cost = 0;
4323         *indexStartupCost = qual_arg_cost;
4324         *indexTotalCost += qual_arg_cost;
4325         *indexTotalCost += numIndexTuples * (cpu_index_tuple_cost + qual_op_cost);
4326
4327         /*
4328          * Generic assumption about index correlation: there isn't any.
4329          */
4330         *indexCorrelation = 0.0;
4331 }
4332
4333
4334 Datum
4335 btcostestimate(PG_FUNCTION_ARGS)
4336 {
4337         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
4338         IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
4339         List       *indexQuals = (List *) PG_GETARG_POINTER(2);
4340         Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
4341         Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
4342         Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
4343         double     *indexCorrelation = (double *) PG_GETARG_POINTER(6);
4344         Oid                     relid;
4345         AttrNumber      colnum;
4346         HeapTuple       tuple;
4347         double          numIndexTuples;
4348         List       *indexBoundQuals;
4349         int                     indexcol;
4350         bool            eqQualHere;
4351         ListCell   *l;
4352
4353         /*
4354          * For a btree scan, only leading '=' quals plus inequality quals
4355          * for the immediately next attribute contribute to index selectivity
4356          * (these are the "boundary quals" that determine the starting and
4357          * stopping points of the index scan).  Additional quals can suppress
4358          * visits to the heap, so it's OK to count them in indexSelectivity,
4359          * but they should not count for estimating numIndexTuples.  So we must
4360          * examine the given indexQuals to find out which ones count as boundary
4361          * quals.  We rely on the knowledge that they are given in index column
4362          * order.
4363          */
4364         indexBoundQuals = NIL;
4365         indexcol = 0;
4366         eqQualHere = false;
4367         foreach(l, indexQuals)
4368         {
4369                 RestrictInfo *rinfo = (RestrictInfo *) lfirst(l);
4370                 Expr   *clause;
4371                 Oid             clause_op;
4372                 int             op_strategy;
4373
4374                 Assert(IsA(rinfo, RestrictInfo));
4375                 clause = rinfo->clause;
4376                 Assert(IsA(clause, OpExpr));
4377                 clause_op = ((OpExpr *) clause)->opno;
4378                 if (match_index_to_operand(get_leftop(clause), indexcol, index))
4379                 {
4380                         /* clause_op is correct */
4381                 }
4382                 else if (match_index_to_operand(get_rightop(clause), indexcol, index))
4383                 {
4384                         /* Must flip operator to get the opclass member */
4385                         clause_op = get_commutator(clause_op);
4386                 }
4387                 else
4388                 {
4389                         /* Must be past the end of quals for indexcol, try next */
4390                         if (!eqQualHere)
4391                                 break;                  /* done if no '=' qual for indexcol */
4392                         indexcol++;
4393                         eqQualHere = false;
4394                         if (match_index_to_operand(get_leftop(clause), indexcol, index))
4395                         {
4396                                 /* clause_op is correct */
4397                         }
4398                         else if (match_index_to_operand(get_rightop(clause),
4399                                                                                         indexcol, index))
4400                         {
4401                                 /* Must flip operator to get the opclass member */
4402                                 clause_op = get_commutator(clause_op);
4403                         }
4404                         else
4405                         {
4406                                 /* No quals for new indexcol, so we are done */
4407                                 break;
4408                         }
4409                 }
4410                 op_strategy = get_op_opclass_strategy(clause_op,
4411                                                                                           index->classlist[indexcol]);
4412                 Assert(op_strategy != 0);                       /* not a member of opclass?? */
4413                 if (op_strategy == BTEqualStrategyNumber)
4414                         eqQualHere = true;
4415                 indexBoundQuals = lappend(indexBoundQuals, rinfo);
4416         }
4417
4418         /*
4419          * If index is unique and we found an '=' clause for each column,
4420          * we can just assume numIndexTuples = 1 and skip the expensive
4421          * clauselist_selectivity calculations.
4422          */
4423         if (index->unique && indexcol == index->ncolumns - 1 && eqQualHere)
4424                 numIndexTuples = 1.0;
4425         else
4426         {
4427                 Selectivity btreeSelectivity;
4428
4429                 btreeSelectivity = clauselist_selectivity(root, indexBoundQuals,
4430                                                                                                   index->rel->relid,
4431                                                                                                   JOIN_INNER);
4432                 numIndexTuples = btreeSelectivity * index->rel->tuples;
4433         }
4434
4435         genericcostestimate(root, index, indexQuals, numIndexTuples,
4436                                                 indexStartupCost, indexTotalCost,
4437                                                 indexSelectivity, indexCorrelation);
4438
4439         /*
4440          * If we can get an estimate of the first column's ordering
4441          * correlation C from pg_statistic, estimate the index correlation as
4442          * C for a single-column index, or C * 0.75 for multiple columns.
4443          * (The idea here is that multiple columns dilute the importance of
4444          * the first column's ordering, but don't negate it entirely.  Before
4445          * 8.0 we divided the correlation by the number of columns, but that
4446          * seems too strong.)
4447          */
4448         if (index->indexkeys[0] != 0)
4449         {
4450                 /* Simple variable --- look to stats for the underlying table */
4451                 relid = getrelid(index->rel->relid, root->parse->rtable);
4452                 Assert(relid != InvalidOid);
4453                 colnum = index->indexkeys[0];
4454         }
4455         else
4456         {
4457                 /* Expression --- maybe there are stats for the index itself */
4458                 relid = index->indexoid;
4459                 colnum = 1;
4460         }
4461
4462         tuple = SearchSysCache(STATRELATT,
4463                                                    ObjectIdGetDatum(relid),
4464                                                    Int16GetDatum(colnum),
4465                                                    0, 0);
4466
4467         if (HeapTupleIsValid(tuple))
4468         {
4469                 Oid                     typid;
4470                 int32           typmod;
4471                 float4     *numbers;
4472                 int                     nnumbers;
4473
4474                 /* XXX this code would break with different storage type */
4475                 get_atttypetypmod(relid, colnum, &typid, &typmod);
4476
4477                 if (get_attstatsslot(tuple, typid, typmod,
4478                                                          STATISTIC_KIND_CORRELATION,
4479                                                          index->ordering[0],
4480                                                          NULL, NULL, &numbers, &nnumbers))
4481                 {
4482                         double          varCorrelation;
4483
4484                         Assert(nnumbers == 1);
4485                         varCorrelation = numbers[0];
4486
4487                         if (index->ncolumns > 1)
4488                                 *indexCorrelation = varCorrelation * 0.75;
4489                         else
4490                                 *indexCorrelation = varCorrelation;
4491
4492                         free_attstatsslot(typid, NULL, 0, numbers, nnumbers);
4493                 }
4494                 ReleaseSysCache(tuple);
4495         }
4496
4497         PG_RETURN_VOID();
4498 }
4499
4500 Datum
4501 rtcostestimate(PG_FUNCTION_ARGS)
4502 {
4503         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
4504         IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
4505         List       *indexQuals = (List *) PG_GETARG_POINTER(2);
4506         Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
4507         Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
4508         Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
4509         double     *indexCorrelation = (double *) PG_GETARG_POINTER(6);
4510
4511         genericcostestimate(root, index, indexQuals, 0.0,
4512                                                 indexStartupCost, indexTotalCost,
4513                                                 indexSelectivity, indexCorrelation);
4514
4515         PG_RETURN_VOID();
4516 }
4517
4518 Datum
4519 hashcostestimate(PG_FUNCTION_ARGS)
4520 {
4521         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
4522         IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
4523         List       *indexQuals = (List *) PG_GETARG_POINTER(2);
4524         Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
4525         Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
4526         Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
4527         double     *indexCorrelation = (double *) PG_GETARG_POINTER(6);
4528
4529         genericcostestimate(root, index, indexQuals, 0.0,
4530                                                 indexStartupCost, indexTotalCost,
4531                                                 indexSelectivity, indexCorrelation);
4532
4533         PG_RETURN_VOID();
4534 }
4535
4536 Datum
4537 gistcostestimate(PG_FUNCTION_ARGS)
4538 {
4539         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
4540         IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
4541         List       *indexQuals = (List *) PG_GETARG_POINTER(2);
4542         Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
4543         Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
4544         Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
4545         double     *indexCorrelation = (double *) PG_GETARG_POINTER(6);
4546
4547         genericcostestimate(root, index, indexQuals, 0.0,
4548                                                 indexStartupCost, indexTotalCost,
4549                                                 indexSelectivity, indexCorrelation);
4550
4551         PG_RETURN_VOID();
4552 }