granicus.if.org Git - postgresql/blob - src/backend/utils/adt/selfuncs.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * selfuncs.c
   4  *        Selectivity functions and index cost estimation functions for
   5  *        standard operators and index access methods.
   6  *
   7  *        Selectivity routines are registered in the pg_operator catalog
   8  *        in the "oprrest" and "oprjoin" attributes.
   9  *
  10  *        Index cost functions are registered in the pg_am catalog
  11  *        in the "amcostestimate" attribute.
  12  *
  13  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
  14  * Portions Copyright (c) 1994, Regents of the University of California
  15  *
  16  *
  17  * IDENTIFICATION
  18  *        $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.180 2005/06/05 22:32:57 tgl Exp $
  19  *
  20  *-------------------------------------------------------------------------
  21  */
  22
  23 /*----------
  24  * Operator selectivity estimation functions are called to estimate the
  25  * selectivity of WHERE clauses whose top-level operator is their operator.
  26  * We divide the problem into two cases:
  27  *              Restriction clause estimation: the clause involves vars of just
  28  *                      one relation.
  29  *              Join clause estimation: the clause involves vars of multiple rels.
  30  * Join selectivity estimation is far more difficult and usually less accurate
  31  * than restriction estimation.
  32  *
  33  * When dealing with the inner scan of a nestloop join, we consider the
  34  * join's joinclauses as restriction clauses for the inner relation, and
  35  * treat vars of the outer relation as parameters (a/k/a constants of unknown
  36  * values).  So, restriction estimators need to be able to accept an argument
  37  * telling which relation is to be treated as the variable.
  38  *
  39  * The call convention for a restriction estimator (oprrest function) is
  40  *
  41  *              Selectivity oprrest (PlannerInfo *root,
  42  *                                                       Oid operator,
  43  *                                                       List *args,
  44  *                                                       int varRelid);
  45  *
  46  * root: general information about the query (rtable and RelOptInfo lists
  47  * are particularly important for the estimator).
  48  * operator: OID of the specific operator in question.
  49  * args: argument list from the operator clause.
  50  * varRelid: if not zero, the relid (rtable index) of the relation to
  51  * be treated as the variable relation.  May be zero if the args list
  52  * is known to contain vars of only one relation.
  53  *
  54  * This is represented at the SQL level (in pg_proc) as
  55  *
  56  *              float8 oprrest (internal, oid, internal, int4);
  57  *
  58  * The call convention for a join estimator (oprjoin function) is similar
  59  * except that varRelid is not needed, and instead the join type is
  60  * supplied:
  61  *
  62  *              Selectivity oprjoin (PlannerInfo *root,
  63  *                                                       Oid operator,
  64  *                                                       List *args,
  65  *                                                       JoinType jointype);
  66  *
  67  *              float8 oprjoin (internal, oid, internal, int2);
  68  *
  69  * (We deliberately make the SQL signature different to facilitate
  70  * catching errors.)
  71  *----------
  72  */
  73
  74 #include "postgres.h"
  75
  76 #include <ctype.h>
  77 #include <math.h>
  78
  79 #include "access/heapam.h"
  80 #include "access/nbtree.h"
  81 #include "access/tuptoaster.h"
  82 #include "catalog/pg_namespace.h"
  83 #include "catalog/pg_opclass.h"
  84 #include "catalog/pg_operator.h"
  85 #include "catalog/pg_proc.h"
  86 #include "catalog/pg_statistic.h"
  87 #include "catalog/pg_type.h"
  88 #include "mb/pg_wchar.h"
  89 #include "nodes/makefuncs.h"
  90 #include "optimizer/clauses.h"
  91 #include "optimizer/cost.h"
  92 #include "optimizer/pathnode.h"
  93 #include "optimizer/paths.h"
  94 #include "optimizer/plancat.h"
  95 #include "optimizer/prep.h"
  96 #include "optimizer/restrictinfo.h"
  97 #include "optimizer/tlist.h"
  98 #include "optimizer/var.h"
  99 #include "parser/parse_expr.h"
 100 #include "parser/parse_func.h"
 101 #include "parser/parse_oper.h"
 102 #include "parser/parsetree.h"
 103 #include "utils/builtins.h"
 104 #include "utils/date.h"
 105 #include "utils/datum.h"
 106 #include "utils/int8.h"
 107 #include "utils/lsyscache.h"
 108 #include "utils/pg_locale.h"
 109 #include "utils/selfuncs.h"
 110 #include "utils/syscache.h"
 111
 112
 113 /* Return data from examine_variable and friends */
 114 typedef struct
 115 {
 116         Node       *var;                        /* the Var or expression tree */
 117         RelOptInfo *rel;                        /* Relation, or NULL if not identifiable */
 118         HeapTuple       statsTuple;             /* pg_statistic tuple, or NULL if none */
 119         /* NB: if statsTuple!=NULL, it must be freed when caller is done */
 120         Oid                     vartype;                /* exposed type of expression */
 121         Oid                     atttype;                /* type to pass to get_attstatsslot */
 122         int32           atttypmod;              /* typmod to pass to get_attstatsslot */
 123         bool            isunique;               /* true if matched to a unique index */
 124 } VariableStatData;
 125
 126 #define ReleaseVariableStats(vardata)  \
 127         do { \
 128                 if (HeapTupleIsValid((vardata).statsTuple)) \
 129                         ReleaseSysCache((vardata).statsTuple); \
 130         } while(0)
 131
 132
 133 static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
 134                                   Datum lobound, Datum hibound, Oid boundstypid,
 135                                   double *scaledlobound, double *scaledhibound);
 136 static double convert_numeric_to_scalar(Datum value, Oid typid);
 137 static void convert_string_to_scalar(unsigned char *value,
 138                                                  double *scaledvalue,
 139                                                  unsigned char *lobound,
 140                                                  double *scaledlobound,
 141                                                  unsigned char *hibound,
 142                                                  double *scaledhibound);
 143 static void convert_bytea_to_scalar(Datum value,
 144                                                 double *scaledvalue,
 145                                                 Datum lobound,
 146                                                 double *scaledlobound,
 147                                                 Datum hibound,
 148                                                 double *scaledhibound);
 149 static double convert_one_string_to_scalar(unsigned char *value,
 150                                                          int rangelo, int rangehi);
 151 static double convert_one_bytea_to_scalar(unsigned char *value, int valuelen,
 152                                                         int rangelo, int rangehi);
 153 static unsigned char *convert_string_datum(Datum value, Oid typid);
 154 static double convert_timevalue_to_scalar(Datum value, Oid typid);
 155 static bool get_restriction_variable(PlannerInfo *root, List *args, int varRelid,
 156                                                  VariableStatData *vardata, Node **other,
 157                                                  bool *varonleft);
 158 static void get_join_variables(PlannerInfo *root, List *args,
 159                                    VariableStatData *vardata1,
 160                                    VariableStatData *vardata2);
 161 static void examine_variable(PlannerInfo *root, Node *node, int varRelid,
 162                                  VariableStatData *vardata);
 163 static double get_variable_numdistinct(VariableStatData *vardata);
 164 static bool get_variable_maximum(PlannerInfo *root, VariableStatData *vardata,
 165                                          Oid sortop, Datum *max);
 166 static Selectivity prefix_selectivity(PlannerInfo *root, Node *variable,
 167                                    Oid opclass, Const *prefix);
 168 static Selectivity pattern_selectivity(Const *patt, Pattern_Type ptype);
 169 static Datum string_to_datum(const char *str, Oid datatype);
 170 static Const *string_to_const(const char *str, Oid datatype);
 171 static Const *string_to_bytea_const(const char *str, size_t str_len);
 172
 173
 174 /*
 175  *              eqsel                   - Selectivity of "=" for any data types.
 176  *
 177  * Note: this routine is also used to estimate selectivity for some
 178  * operators that are not "=" but have comparable selectivity behavior,
 179  * such as "~=" (geometric approximate-match).  Even for "=", we must
 180  * keep in mind that the left and right datatypes may differ.
 181  */
 182 Datum
 183 eqsel(PG_FUNCTION_ARGS)
 184 {
 185         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 186         Oid                     operator = PG_GETARG_OID(1);
 187         List       *args = (List *) PG_GETARG_POINTER(2);
 188         int                     varRelid = PG_GETARG_INT32(3);
 189         VariableStatData vardata;
 190         Node       *other;
 191         bool            varonleft;
 192         Datum      *values;
 193         int                     nvalues;
 194         float4     *numbers;
 195         int                     nnumbers;
 196         double          selec;
 197
 198         /*
 199          * If expression is not variable = something or something = variable,
 200          * then punt and return a default estimate.
 201          */
 202         if (!get_restriction_variable(root, args, varRelid,
 203                                                                   &vardata, &other, &varonleft))
 204                 PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
 205
 206         /*
 207          * If the something is a NULL constant, assume operator is strict and
 208          * return zero, ie, operator will never return TRUE.
 209          */
 210         if (IsA(other, Const) &&
 211                 ((Const *) other)->constisnull)
 212         {
 213                 ReleaseVariableStats(vardata);
 214                 PG_RETURN_FLOAT8(0.0);
 215         }
 216
 217         if (HeapTupleIsValid(vardata.statsTuple))
 218         {
 219                 Form_pg_statistic stats;
 220
 221                 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
 222
 223                 if (IsA(other, Const))
 224                 {
 225                         /* Variable is being compared to a known non-null constant */
 226                         Datum           constval = ((Const *) other)->constvalue;
 227                         bool            match = false;
 228                         int                     i;
 229
 230                         /*
 231                          * Is the constant "=" to any of the column's most common
 232                          * values?      (Although the given operator may not really be
 233                          * "=", we will assume that seeing whether it returns TRUE is
 234                          * an appropriate test.  If you don't like this, maybe you
 235                          * shouldn't be using eqsel for your operator...)
 236                          */
 237                         if (get_attstatsslot(vardata.statsTuple,
 238                                                                  vardata.atttype, vardata.atttypmod,
 239                                                                  STATISTIC_KIND_MCV, InvalidOid,
 240                                                                  &values, &nvalues,
 241                                                                  &numbers, &nnumbers))
 242                         {
 243                                 FmgrInfo        eqproc;
 244
 245                                 fmgr_info(get_opcode(operator), &eqproc);
 246
 247                                 for (i = 0; i < nvalues; i++)
 248                                 {
 249                                         /* be careful to apply operator right way 'round */
 250                                         if (varonleft)
 251                                                 match = DatumGetBool(FunctionCall2(&eqproc,
 252                                                                                                                    values[i],
 253                                                                                                                    constval));
 254                                         else
 255                                                 match = DatumGetBool(FunctionCall2(&eqproc,
 256                                                                                                                    constval,
 257                                                                                                                    values[i]));
 258                                         if (match)
 259                                                 break;
 260                                 }
 261                         }
 262                         else
 263                         {
 264                                 /* no most-common-value info available */
 265                                 values = NULL;
 266                                 numbers = NULL;
 267                                 i = nvalues = nnumbers = 0;
 268                         }
 269
 270                         if (match)
 271                         {
 272                                 /*
 273                                  * Constant is "=" to this common value.  We know
 274                                  * selectivity exactly (or as exactly as VACUUM could
 275                                  * calculate it, anyway).
 276                                  */
 277                                 selec = numbers[i];
 278                         }
 279                         else
 280                         {
 281                                 /*
 282                                  * Comparison is against a constant that is neither NULL
 283                                  * nor any of the common values.  Its selectivity cannot
 284                                  * be more than this:
 285                                  */
 286                                 double          sumcommon = 0.0;
 287                                 double          otherdistinct;
 288
 289                                 for (i = 0; i < nnumbers; i++)
 290                                         sumcommon += numbers[i];
 291                                 selec = 1.0 - sumcommon - stats->stanullfrac;
 292                                 CLAMP_PROBABILITY(selec);
 293
 294                                 /*
 295                                  * and in fact it's probably a good deal less. We
 296                                  * approximate that all the not-common values share this
 297                                  * remaining fraction equally, so we divide by the number
 298                                  * of other distinct values.
 299                                  */
 300                                 otherdistinct = get_variable_numdistinct(&vardata)
 301                                         - nnumbers;
 302                                 if (otherdistinct > 1)
 303                                         selec /= otherdistinct;
 304
 305                                 /*
 306                                  * Another cross-check: selectivity shouldn't be estimated
 307                                  * as more than the least common "most common value".
 308                                  */
 309                                 if (nnumbers > 0 && selec > numbers[nnumbers - 1])
 310                                         selec = numbers[nnumbers - 1];
 311                         }
 312
 313                         free_attstatsslot(vardata.atttype, values, nvalues,
 314                                                           numbers, nnumbers);
 315                 }
 316                 else
 317                 {
 318                         double          ndistinct;
 319
 320                         /*
 321                          * Search is for a value that we do not know a priori, but we
 322                          * will assume it is not NULL.  Estimate the selectivity as
 323                          * non-null fraction divided by number of distinct values, so
 324                          * that we get a result averaged over all possible values
 325                          * whether common or uncommon.  (Essentially, we are assuming
 326                          * that the not-yet-known comparison value is equally likely
 327                          * to be any of the possible values, regardless of their
 328                          * frequency in the table.      Is that a good idea?)
 329                          */
 330                         selec = 1.0 - stats->stanullfrac;
 331                         ndistinct = get_variable_numdistinct(&vardata);
 332                         if (ndistinct > 1)
 333                                 selec /= ndistinct;
 334
 335                         /*
 336                          * Cross-check: selectivity should never be estimated as more
 337                          * than the most common value's.
 338                          */
 339                         if (get_attstatsslot(vardata.statsTuple,
 340                                                                  vardata.atttype, vardata.atttypmod,
 341                                                                  STATISTIC_KIND_MCV, InvalidOid,
 342                                                                  NULL, NULL,
 343                                                                  &numbers, &nnumbers))
 344                         {
 345                                 if (nnumbers > 0 && selec > numbers[0])
 346                                         selec = numbers[0];
 347                                 free_attstatsslot(vardata.atttype, NULL, 0, numbers, nnumbers);
 348                         }
 349                 }
 350         }
 351         else
 352         {
 353                 /*
 354                  * No VACUUM ANALYZE stats available, so make a guess using
 355                  * estimated number of distinct values and assuming they are
 356                  * equally common.      (The guess is unlikely to be very good, but we
 357                  * do know a few special cases.)
 358                  */
 359                 selec = 1.0 / get_variable_numdistinct(&vardata);
 360         }
 361
 362         ReleaseVariableStats(vardata);
 363
 364         /* result should be in range, but make sure... */
 365         CLAMP_PROBABILITY(selec);
 366
 367         PG_RETURN_FLOAT8((float8) selec);
 368 }
 369
 370 /*
 371  *              neqsel                  - Selectivity of "!=" for any data types.
 372  *
 373  * This routine is also used for some operators that are not "!="
 374  * but have comparable selectivity behavior.  See above comments
 375  * for eqsel().
 376  */
 377 Datum
 378 neqsel(PG_FUNCTION_ARGS)
 379 {
 380         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 381         Oid                     operator = PG_GETARG_OID(1);
 382         List       *args = (List *) PG_GETARG_POINTER(2);
 383         int                     varRelid = PG_GETARG_INT32(3);
 384         Oid                     eqop;
 385         float8          result;
 386
 387         /*
 388          * We want 1 - eqsel() where the equality operator is the one
 389          * associated with this != operator, that is, its negator.
 390          */
 391         eqop = get_negator(operator);
 392         if (eqop)
 393         {
 394                 result = DatumGetFloat8(DirectFunctionCall4(eqsel,
 395                                                                                                         PointerGetDatum(root),
 396                                                                                                   ObjectIdGetDatum(eqop),
 397                                                                                                         PointerGetDatum(args),
 398                                                                                            Int32GetDatum(varRelid)));
 399         }
 400         else
 401         {
 402                 /* Use default selectivity (should we raise an error instead?) */
 403                 result = DEFAULT_EQ_SEL;
 404         }
 405         result = 1.0 - result;
 406         PG_RETURN_FLOAT8(result);
 407 }
 408
 409 /*
 410  *      scalarineqsel           - Selectivity of "<", "<=", ">", ">=" for scalars.
 411  *
 412  * This is the guts of both scalarltsel and scalargtsel.  The caller has
 413  * commuted the clause, if necessary, so that we can treat the variable as
 414  * being on the left.  The caller must also make sure that the other side
 415  * of the clause is a non-null Const, and dissect same into a value and
 416  * datatype.
 417  *
 418  * This routine works for any datatype (or pair of datatypes) known to
 419  * convert_to_scalar().  If it is applied to some other datatype,
 420  * it will return a default estimate.
 421  */
 422 static double
 423 scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
 424                           VariableStatData *vardata, Datum constval, Oid consttype)
 425 {
 426         Form_pg_statistic stats;
 427         FmgrInfo        opproc;
 428         Datum      *values;
 429         int                     nvalues;
 430         float4     *numbers;
 431         int                     nnumbers;
 432         double          mcv_selec,
 433                                 hist_selec,
 434                                 sumcommon;
 435         double          selec;
 436         int                     i;
 437
 438         if (!HeapTupleIsValid(vardata->statsTuple))
 439         {
 440                 /* no stats available, so default result */
 441                 return DEFAULT_INEQ_SEL;
 442         }
 443         stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
 444
 445         fmgr_info(get_opcode(operator), &opproc);
 446
 447         /*
 448          * If we have most-common-values info, add up the fractions of the MCV
 449          * entries that satisfy MCV OP CONST.  These fractions contribute
 450          * directly to the result selectivity.  Also add up the total fraction
 451          * represented by MCV entries.
 452          */
 453         mcv_selec = 0.0;
 454         sumcommon = 0.0;
 455
 456         if (get_attstatsslot(vardata->statsTuple,
 457                                                  vardata->atttype, vardata->atttypmod,
 458                                                  STATISTIC_KIND_MCV, InvalidOid,
 459                                                  &values, &nvalues,
 460                                                  &numbers, &nnumbers))
 461         {
 462                 for (i = 0; i < nvalues; i++)
 463                 {
 464                         if (DatumGetBool(FunctionCall2(&opproc,
 465                                                                                    values[i],
 466                                                                                    constval)))
 467                                 mcv_selec += numbers[i];
 468                         sumcommon += numbers[i];
 469                 }
 470                 free_attstatsslot(vardata->atttype, values, nvalues,
 471                                                   numbers, nnumbers);
 472         }
 473
 474         /*
 475          * If there is a histogram, determine which bin the constant falls in,
 476          * and compute the resulting contribution to selectivity.
 477          *
 478          * Someday, VACUUM might store more than one histogram per rel/att,
 479          * corresponding to more than one possible sort ordering defined for
 480          * the column type.  However, to make that work we will need to figure
 481          * out which staop to search for --- it's not necessarily the one we
 482          * have at hand!  (For example, we might have a '<=' operator rather
 483          * than the '<' operator that will appear in staop.)  For now, assume
 484          * that whatever appears in pg_statistic is sorted the same way our
 485          * operator sorts, or the reverse way if isgt is TRUE.
 486          */
 487         hist_selec = 0.0;
 488
 489         if (get_attstatsslot(vardata->statsTuple,
 490                                                  vardata->atttype, vardata->atttypmod,
 491                                                  STATISTIC_KIND_HISTOGRAM, InvalidOid,
 492                                                  &values, &nvalues,
 493                                                  NULL, NULL))
 494         {
 495                 if (nvalues > 1)
 496                 {
 497                         double          histfrac;
 498                         bool            ltcmp;
 499
 500                         ltcmp = DatumGetBool(FunctionCall2(&opproc,
 501                                                                                            values[0],
 502                                                                                            constval));
 503                         if (isgt)
 504                                 ltcmp = !ltcmp;
 505                         if (!ltcmp)
 506                         {
 507                                 /* Constant is below lower histogram boundary. */
 508                                 histfrac = 0.0;
 509                         }
 510                         else
 511                         {
 512                                 /*
 513                                  * Scan to find proper location.  This could be made
 514                                  * faster by using a binary-search method, but it's
 515                                  * probably not worth the trouble for typical histogram
 516                                  * sizes.
 517                                  */
 518                                 for (i = 1; i < nvalues; i++)
 519                                 {
 520                                         ltcmp = DatumGetBool(FunctionCall2(&opproc,
 521                                                                                                            values[i],
 522                                                                                                            constval));
 523                                         if (isgt)
 524                                                 ltcmp = !ltcmp;
 525                                         if (!ltcmp)
 526                                                 break;
 527                                 }
 528                                 if (i >= nvalues)
 529                                 {
 530                                         /* Constant is above upper histogram boundary. */
 531                                         histfrac = 1.0;
 532                                 }
 533                                 else
 534                                 {
 535                                         double          val,
 536                                                                 high,
 537                                                                 low;
 538                                         double          binfrac;
 539
 540                                         /*
 541                                          * We have values[i-1] < constant < values[i].
 542                                          *
 543                                          * Convert the constant and the two nearest bin boundary
 544                                          * values to a uniform comparison scale, and do a
 545                                          * linear interpolation within this bin.
 546                                          */
 547                                         if (convert_to_scalar(constval, consttype, &val,
 548                                                                                   values[i - 1], values[i],
 549                                                                                   vardata->vartype,
 550                                                                                   &low, &high))
 551                                         {
 552                                                 if (high <= low)
 553                                                 {
 554                                                         /* cope if bin boundaries appear identical */
 555                                                         binfrac = 0.5;
 556                                                 }
 557                                                 else if (val <= low)
 558                                                         binfrac = 0.0;
 559                                                 else if (val >= high)
 560                                                         binfrac = 1.0;
 561                                                 else
 562                                                 {
 563                                                         binfrac = (val - low) / (high - low);
 564
 565                                                         /*
 566                                                          * Watch out for the possibility that we got a
 567                                                          * NaN or Infinity from the division.  This
 568                                                          * can happen despite the previous checks, if
 569                                                          * for example "low" is -Infinity.
 570                                                          */
 571                                                         if (isnan(binfrac) ||
 572                                                                 binfrac < 0.0 || binfrac > 1.0)
 573                                                                 binfrac = 0.5;
 574                                                 }
 575                                         }
 576                                         else
 577                                         {
 578                                                 /*
 579                                                  * Ideally we'd produce an error here, on the
 580                                                  * grounds that the given operator shouldn't have
 581                                                  * scalarXXsel registered as its selectivity func
 582                                                  * unless we can deal with its operand types.  But
 583                                                  * currently, all manner of stuff is invoking
 584                                                  * scalarXXsel, so give a default estimate until
 585                                                  * that can be fixed.
 586                                                  */
 587                                                 binfrac = 0.5;
 588                                         }
 589
 590                                         /*
 591                                          * Now, compute the overall selectivity across the
 592                                          * values represented by the histogram.  We have i-1
 593                                          * full bins and binfrac partial bin below the
 594                                          * constant.
 595                                          */
 596                                         histfrac = (double) (i - 1) + binfrac;
 597                                         histfrac /= (double) (nvalues - 1);
 598                                 }
 599                         }
 600
 601                         /*
 602                          * Now histfrac = fraction of histogram entries below the
 603                          * constant.
 604                          *
 605                          * Account for "<" vs ">"
 606                          */
 607                         hist_selec = isgt ? (1.0 - histfrac) : histfrac;
 608
 609                         /*
 610                          * The histogram boundaries are only approximate to begin
 611                          * with, and may well be out of date anyway.  Therefore, don't
 612                          * believe extremely small or large selectivity estimates.
 613                          */
 614                         if (hist_selec < 0.0001)
 615                                 hist_selec = 0.0001;
 616                         else if (hist_selec > 0.9999)
 617                                 hist_selec = 0.9999;
 618                 }
 619
 620                 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
 621         }
 622
 623         /*
 624          * Now merge the results from the MCV and histogram calculations,
 625          * realizing that the histogram covers only the non-null values that
 626          * are not listed in MCV.
 627          */
 628         selec = 1.0 - stats->stanullfrac - sumcommon;
 629
 630         if (hist_selec > 0.0)
 631                 selec *= hist_selec;
 632         else
 633         {
 634                 /*
 635                  * If no histogram but there are values not accounted for by MCV,
 636                  * arbitrarily assume half of them will match.
 637                  */
 638                 selec *= 0.5;
 639         }
 640
 641         selec += mcv_selec;
 642
 643         /* result should be in range, but make sure... */
 644         CLAMP_PROBABILITY(selec);
 645
 646         return selec;
 647 }
 648
 649 /*
 650  *              scalarltsel             - Selectivity of "<" (also "<=") for scalars.
 651  */
 652 Datum
 653 scalarltsel(PG_FUNCTION_ARGS)
 654 {
 655         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 656         Oid                     operator = PG_GETARG_OID(1);
 657         List       *args = (List *) PG_GETARG_POINTER(2);
 658         int                     varRelid = PG_GETARG_INT32(3);
 659         VariableStatData vardata;
 660         Node       *other;
 661         bool            varonleft;
 662         Datum           constval;
 663         Oid                     consttype;
 664         bool            isgt;
 665         double          selec;
 666
 667         /*
 668          * If expression is not variable op something or something op
 669          * variable, then punt and return a default estimate.
 670          */
 671         if (!get_restriction_variable(root, args, varRelid,
 672                                                                   &vardata, &other, &varonleft))
 673                 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 674
 675         /*
 676          * Can't do anything useful if the something is not a constant,
 677          * either.
 678          */
 679         if (!IsA(other, Const))
 680         {
 681                 ReleaseVariableStats(vardata);
 682                 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 683         }
 684
 685         /*
 686          * If the constant is NULL, assume operator is strict and return zero,
 687          * ie, operator will never return TRUE.
 688          */
 689         if (((Const *) other)->constisnull)
 690         {
 691                 ReleaseVariableStats(vardata);
 692                 PG_RETURN_FLOAT8(0.0);
 693         }
 694         constval = ((Const *) other)->constvalue;
 695         consttype = ((Const *) other)->consttype;
 696
 697         /*
 698          * Force the var to be on the left to simplify logic in scalarineqsel.
 699          */
 700         if (varonleft)
 701         {
 702                 /* we have var < other */
 703                 isgt = false;
 704         }
 705         else
 706         {
 707                 /* we have other < var, commute to make var > other */
 708                 operator = get_commutator(operator);
 709                 if (!operator)
 710                 {
 711                         /* Use default selectivity (should we raise an error instead?) */
 712                         ReleaseVariableStats(vardata);
 713                         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 714                 }
 715                 isgt = true;
 716         }
 717
 718         selec = scalarineqsel(root, operator, isgt, &vardata, constval, consttype);
 719
 720         ReleaseVariableStats(vardata);
 721
 722         PG_RETURN_FLOAT8((float8) selec);
 723 }
 724
 725 /*
 726  *              scalargtsel             - Selectivity of ">" (also ">=") for integers.
 727  */
 728 Datum
 729 scalargtsel(PG_FUNCTION_ARGS)
 730 {
 731         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 732         Oid                     operator = PG_GETARG_OID(1);
 733         List       *args = (List *) PG_GETARG_POINTER(2);
 734         int                     varRelid = PG_GETARG_INT32(3);
 735         VariableStatData vardata;
 736         Node       *other;
 737         bool            varonleft;
 738         Datum           constval;
 739         Oid                     consttype;
 740         bool            isgt;
 741         double          selec;
 742
 743         /*
 744          * If expression is not variable op something or something op
 745          * variable, then punt and return a default estimate.
 746          */
 747         if (!get_restriction_variable(root, args, varRelid,
 748                                                                   &vardata, &other, &varonleft))
 749                 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 750
 751         /*
 752          * Can't do anything useful if the something is not a constant,
 753          * either.
 754          */
 755         if (!IsA(other, Const))
 756         {
 757                 ReleaseVariableStats(vardata);
 758                 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 759         }
 760
 761         /*
 762          * If the constant is NULL, assume operator is strict and return zero,
 763          * ie, operator will never return TRUE.
 764          */
 765         if (((Const *) other)->constisnull)
 766         {
 767                 ReleaseVariableStats(vardata);
 768                 PG_RETURN_FLOAT8(0.0);
 769         }
 770         constval = ((Const *) other)->constvalue;
 771         consttype = ((Const *) other)->consttype;
 772
 773         /*
 774          * Force the var to be on the left to simplify logic in scalarineqsel.
 775          */
 776         if (varonleft)
 777         {
 778                 /* we have var > other */
 779                 isgt = true;
 780         }
 781         else
 782         {
 783                 /* we have other > var, commute to make var < other */
 784                 operator = get_commutator(operator);
 785                 if (!operator)
 786                 {
 787                         /* Use default selectivity (should we raise an error instead?) */
 788                         ReleaseVariableStats(vardata);
 789                         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 790                 }
 791                 isgt = false;
 792         }
 793
 794         selec = scalarineqsel(root, operator, isgt, &vardata, constval, consttype);
 795
 796         ReleaseVariableStats(vardata);
 797
 798         PG_RETURN_FLOAT8((float8) selec);
 799 }
 800
 801 /*
 802  * patternsel                   - Generic code for pattern-match selectivity.
 803  */
 804 static double
 805 patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
 806 {
 807         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 808
 809 #ifdef NOT_USED
 810         Oid                     operator = PG_GETARG_OID(1);
 811 #endif
 812         List       *args = (List *) PG_GETARG_POINTER(2);
 813         int                     varRelid = PG_GETARG_INT32(3);
 814         VariableStatData vardata;
 815         Node       *variable;
 816         Node       *other;
 817         bool            varonleft;
 818         Datum           constval;
 819         Oid                     consttype;
 820         Oid                     vartype;
 821         Oid                     opclass;
 822         Pattern_Prefix_Status pstatus;
 823         Const      *patt = NULL;
 824         Const      *prefix = NULL;
 825         Const      *rest = NULL;
 826         double          result;
 827
 828         /*
 829          * If expression is not variable op constant, then punt and return a
 830          * default estimate.
 831          */
 832         if (!get_restriction_variable(root, args, varRelid,
 833                                                                   &vardata, &other, &varonleft))
 834                 return DEFAULT_MATCH_SEL;
 835         if (!varonleft || !IsA(other, Const))
 836         {
 837                 ReleaseVariableStats(vardata);
 838                 return DEFAULT_MATCH_SEL;
 839         }
 840         variable = (Node *) linitial(args);
 841
 842         /*
 843          * If the constant is NULL, assume operator is strict and return zero,
 844          * ie, operator will never return TRUE.
 845          */
 846         if (((Const *) other)->constisnull)
 847         {
 848                 ReleaseVariableStats(vardata);
 849                 return 0.0;
 850         }
 851         constval = ((Const *) other)->constvalue;
 852         consttype = ((Const *) other)->consttype;
 853
 854         /*
 855          * The right-hand const is type text or bytea for all supported
 856          * operators.  We do not expect to see binary-compatible types here,
 857          * since const-folding should have relabeled the const to exactly
 858          * match the operator's declared type.
 859          */
 860         if (consttype != TEXTOID && consttype != BYTEAOID)
 861         {
 862                 ReleaseVariableStats(vardata);
 863                 return DEFAULT_MATCH_SEL;
 864         }
 865
 866         /*
 867          * Similarly, the exposed type of the left-hand side should be one
 868          * of those we know.  (Do not look at vardata.atttype, which might be
 869          * something binary-compatible but different.)  We can use it to choose
 870          * the index opclass from which we must draw the comparison operators.
 871          *
 872          * NOTE: It would be more correct to use the PATTERN opclasses than the
 873          * simple ones, but at the moment ANALYZE will not generate statistics
 874          * for the PATTERN operators.  But our results are so approximate
 875          * anyway that it probably hardly matters.
 876          */
 877         vartype = vardata.vartype;
 878
 879         switch (vartype)
 880         {
 881                 case TEXTOID:
 882                         opclass = TEXT_BTREE_OPS_OID;
 883                         break;
 884                 case VARCHAROID:
 885                         opclass = VARCHAR_BTREE_OPS_OID;
 886                         break;
 887                 case BPCHAROID:
 888                         opclass = BPCHAR_BTREE_OPS_OID;
 889                         break;
 890                 case NAMEOID:
 891                         opclass = NAME_BTREE_OPS_OID;
 892                         break;
 893                 case BYTEAOID:
 894                         opclass = BYTEA_BTREE_OPS_OID;
 895                         break;
 896                 default:
 897                         ReleaseVariableStats(vardata);
 898                         return DEFAULT_MATCH_SEL;
 899         }
 900
 901         /* divide pattern into fixed prefix and remainder */
 902         patt = (Const *) other;
 903         pstatus = pattern_fixed_prefix(patt, ptype, &prefix, &rest);
 904
 905         /*
 906          * If necessary, coerce the prefix constant to the right type. (The
 907          * "rest" constant need not be changed.)
 908          */
 909         if (prefix && prefix->consttype != vartype)
 910         {
 911                 char       *prefixstr;
 912
 913                 switch (prefix->consttype)
 914                 {
 915                         case TEXTOID:
 916                                 prefixstr = DatumGetCString(DirectFunctionCall1(textout,
 917                                                                                                         prefix->constvalue));
 918                                 break;
 919                         case BYTEAOID:
 920                                 prefixstr = DatumGetCString(DirectFunctionCall1(byteaout,
 921                                                                                                         prefix->constvalue));
 922                                 break;
 923                         default:
 924                                 elog(ERROR, "unrecognized consttype: %u",
 925                                          prefix->consttype);
 926                                 ReleaseVariableStats(vardata);
 927                                 return DEFAULT_MATCH_SEL;
 928                 }
 929                 prefix = string_to_const(prefixstr, vartype);
 930                 pfree(prefixstr);
 931         }
 932
 933         if (pstatus == Pattern_Prefix_Exact)
 934         {
 935                 /*
 936                  * Pattern specifies an exact match, so pretend operator is '='
 937                  */
 938                 Oid                     eqopr = get_opclass_member(opclass, InvalidOid,
 939                                                                                            BTEqualStrategyNumber);
 940                 List       *eqargs;
 941
 942                 if (eqopr == InvalidOid)
 943                         elog(ERROR, "no = operator for opclass %u", opclass);
 944                 eqargs = list_make2(variable, prefix);
 945                 result = DatumGetFloat8(DirectFunctionCall4(eqsel,
 946                                                                                                         PointerGetDatum(root),
 947                                                                                                  ObjectIdGetDatum(eqopr),
 948                                                                                                  PointerGetDatum(eqargs),
 949                                                                                            Int32GetDatum(varRelid)));
 950         }
 951         else
 952         {
 953                 /*
 954                  * Not exact-match pattern.  We estimate selectivity of the fixed
 955                  * prefix and remainder of pattern separately, then combine the
 956                  * two.
 957                  */
 958                 Selectivity prefixsel;
 959                 Selectivity restsel;
 960                 Selectivity selec;
 961
 962                 if (pstatus == Pattern_Prefix_Partial)
 963                         prefixsel = prefix_selectivity(root, variable, opclass, prefix);
 964                 else
 965                         prefixsel = 1.0;
 966                 restsel = pattern_selectivity(rest, ptype);
 967                 selec = prefixsel * restsel;
 968                 /* result should be in range, but make sure... */
 969                 CLAMP_PROBABILITY(selec);
 970                 result = selec;
 971         }
 972
 973         if (prefix)
 974         {
 975                 pfree(DatumGetPointer(prefix->constvalue));
 976                 pfree(prefix);
 977         }
 978
 979         ReleaseVariableStats(vardata);
 980
 981         return result;
 982 }
 983
 984 /*
 985  *              regexeqsel              - Selectivity of regular-expression pattern match.
 986  */
 987 Datum
 988 regexeqsel(PG_FUNCTION_ARGS)
 989 {
 990         PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex));
 991 }
 992
 993 /*
 994  *              icregexeqsel    - Selectivity of case-insensitive regex match.
 995  */
 996 Datum
 997 icregexeqsel(PG_FUNCTION_ARGS)
 998 {
 999         PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC));
1000 }
1001
1002 /*
1003  *              likesel                 - Selectivity of LIKE pattern match.
1004  */
1005 Datum
1006 likesel(PG_FUNCTION_ARGS)
1007 {
1008         PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like));
1009 }
1010
1011 /*
1012  *              iclikesel                       - Selectivity of ILIKE pattern match.
1013  */
1014 Datum
1015 iclikesel(PG_FUNCTION_ARGS)
1016 {
1017         PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC));
1018 }
1019
1020 /*
1021  *              regexnesel              - Selectivity of regular-expression pattern non-match.
1022  */
1023 Datum
1024 regexnesel(PG_FUNCTION_ARGS)
1025 {
1026         double          result;
1027
1028         result = patternsel(fcinfo, Pattern_Type_Regex);
1029         result = 1.0 - result;
1030         PG_RETURN_FLOAT8(result);
1031 }
1032
1033 /*
1034  *              icregexnesel    - Selectivity of case-insensitive regex non-match.
1035  */
1036 Datum
1037 icregexnesel(PG_FUNCTION_ARGS)
1038 {
1039         double          result;
1040
1041         result = patternsel(fcinfo, Pattern_Type_Regex_IC);
1042         result = 1.0 - result;
1043         PG_RETURN_FLOAT8(result);
1044 }
1045
1046 /*
1047  *              nlikesel                - Selectivity of LIKE pattern non-match.
1048  */
1049 Datum
1050 nlikesel(PG_FUNCTION_ARGS)
1051 {
1052         double          result;
1053
1054         result = patternsel(fcinfo, Pattern_Type_Like);
1055         result = 1.0 - result;
1056         PG_RETURN_FLOAT8(result);
1057 }
1058
1059 /*
1060  *              icnlikesel              - Selectivity of ILIKE pattern non-match.
1061  */
1062 Datum
1063 icnlikesel(PG_FUNCTION_ARGS)
1064 {
1065         double          result;
1066
1067         result = patternsel(fcinfo, Pattern_Type_Like_IC);
1068         result = 1.0 - result;
1069         PG_RETURN_FLOAT8(result);
1070 }
1071
1072 /*
1073  *              booltestsel             - Selectivity of BooleanTest Node.
1074  */
1075 Selectivity
1076 booltestsel(PlannerInfo *root, BoolTestType booltesttype, Node *arg,
1077                         int varRelid, JoinType jointype)
1078 {
1079         VariableStatData vardata;
1080         double          selec;
1081
1082         examine_variable(root, arg, varRelid, &vardata);
1083
1084         if (HeapTupleIsValid(vardata.statsTuple))
1085         {
1086                 Form_pg_statistic stats;
1087                 double          freq_null;
1088                 Datum      *values;
1089                 int                     nvalues;
1090                 float4     *numbers;
1091                 int                     nnumbers;
1092
1093                 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
1094                 freq_null = stats->stanullfrac;
1095
1096                 if (get_attstatsslot(vardata.statsTuple,
1097                                                          vardata.atttype, vardata.atttypmod,
1098                                                          STATISTIC_KIND_MCV, InvalidOid,
1099                                                          &values, &nvalues,
1100                                                          &numbers, &nnumbers)
1101                         && nnumbers > 0)
1102                 {
1103                         double          freq_true;
1104                         double          freq_false;
1105
1106                         /*
1107                          * Get first MCV frequency and derive frequency for true.
1108                          */
1109                         if (DatumGetBool(values[0]))
1110                                 freq_true = numbers[0];
1111                         else
1112                                 freq_true = 1.0 - numbers[0] - freq_null;
1113
1114                         /*
1115                          * Next derive frequency for false. Then use these as
1116                          * appropriate to derive frequency for each case.
1117                          */
1118                         freq_false = 1.0 - freq_true - freq_null;
1119
1120                         switch (booltesttype)
1121                         {
1122                                 case IS_UNKNOWN:
1123                                         /* select only NULL values */
1124                                         selec = freq_null;
1125                                         break;
1126                                 case IS_NOT_UNKNOWN:
1127                                         /* select non-NULL values */
1128                                         selec = 1.0 - freq_null;
1129                                         break;
1130                                 case IS_TRUE:
1131                                         /* select only TRUE values */
1132                                         selec = freq_true;
1133                                         break;
1134                                 case IS_NOT_TRUE:
1135                                         /* select non-TRUE values */
1136                                         selec = 1.0 - freq_true;
1137                                         break;
1138                                 case IS_FALSE:
1139                                         /* select only FALSE values */
1140                                         selec = freq_false;
1141                                         break;
1142                                 case IS_NOT_FALSE:
1143                                         /* select non-FALSE values */
1144                                         selec = 1.0 - freq_false;
1145                                         break;
1146                                 default:
1147                                         elog(ERROR, "unrecognized booltesttype: %d",
1148                                                  (int) booltesttype);
1149                                         selec = 0.0;    /* Keep compiler quiet */
1150                                         break;
1151                         }
1152
1153                         free_attstatsslot(vardata.atttype, values, nvalues,
1154                                                           numbers, nnumbers);
1155                 }
1156                 else
1157                 {
1158                         /*
1159                          * No most-common-value info available. Still have null
1160                          * fraction information, so use it for IS [NOT] UNKNOWN.
1161                          * Otherwise adjust for null fraction and assume an even split
1162                          * for boolean tests.
1163                          */
1164                         switch (booltesttype)
1165                         {
1166                                 case IS_UNKNOWN:
1167
1168                                         /*
1169                                          * Use freq_null directly.
1170                                          */
1171                                         selec = freq_null;
1172                                         break;
1173                                 case IS_NOT_UNKNOWN:
1174
1175                                         /*
1176                                          * Select not unknown (not null) values. Calculate
1177                                          * from freq_null.
1178                                          */
1179                                         selec = 1.0 - freq_null;
1180                                         break;
1181                                 case IS_TRUE:
1182                                 case IS_NOT_TRUE:
1183                                 case IS_FALSE:
1184                                 case IS_NOT_FALSE:
1185                                         selec = (1.0 - freq_null) / 2.0;
1186                                         break;
1187                                 default:
1188                                         elog(ERROR, "unrecognized booltesttype: %d",
1189                                                  (int) booltesttype);
1190                                         selec = 0.0;    /* Keep compiler quiet */
1191                                         break;
1192                         }
1193                 }
1194         }
1195         else
1196         {
1197                 /*
1198                  * If we can't get variable statistics for the argument, perhaps
1199                  * clause_selectivity can do something with it.  We ignore the
1200                  * possibility of a NULL value when using clause_selectivity, and
1201                  * just assume the value is either TRUE or FALSE.
1202                  */
1203                 switch (booltesttype)
1204                 {
1205                         case IS_UNKNOWN:
1206                                 selec = DEFAULT_UNK_SEL;
1207                                 break;
1208                         case IS_NOT_UNKNOWN:
1209                                 selec = DEFAULT_NOT_UNK_SEL;
1210                                 break;
1211                         case IS_TRUE:
1212                         case IS_NOT_FALSE:
1213                                 selec = (double) clause_selectivity(root, arg,
1214                                                                                                         varRelid, jointype);
1215                                 break;
1216                         case IS_FALSE:
1217                         case IS_NOT_TRUE:
1218                                 selec = 1.0 - (double) clause_selectivity(root, arg,
1219                                                                                                          varRelid, jointype);
1220                                 break;
1221                         default:
1222                                 elog(ERROR, "unrecognized booltesttype: %d",
1223                                          (int) booltesttype);
1224                                 selec = 0.0;    /* Keep compiler quiet */
1225                                 break;
1226                 }
1227         }
1228
1229         ReleaseVariableStats(vardata);
1230
1231         /* result should be in range, but make sure... */
1232         CLAMP_PROBABILITY(selec);
1233
1234         return (Selectivity) selec;
1235 }
1236
1237 /*
1238  *              nulltestsel             - Selectivity of NullTest Node.
1239  */
1240 Selectivity
1241 nulltestsel(PlannerInfo *root, NullTestType nulltesttype,
1242                         Node *arg, int varRelid)
1243 {
1244         VariableStatData vardata;
1245         double          selec;
1246
1247         examine_variable(root, arg, varRelid, &vardata);
1248
1249         if (HeapTupleIsValid(vardata.statsTuple))
1250         {
1251                 Form_pg_statistic stats;
1252                 double          freq_null;
1253
1254                 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
1255                 freq_null = stats->stanullfrac;
1256
1257                 switch (nulltesttype)
1258                 {
1259                         case IS_NULL:
1260
1261                                 /*
1262                                  * Use freq_null directly.
1263                                  */
1264                                 selec = freq_null;
1265                                 break;
1266                         case IS_NOT_NULL:
1267
1268                                 /*
1269                                  * Select not unknown (not null) values. Calculate from
1270                                  * freq_null.
1271                                  */
1272                                 selec = 1.0 - freq_null;
1273                                 break;
1274                         default:
1275                                 elog(ERROR, "unrecognized nulltesttype: %d",
1276                                          (int) nulltesttype);
1277                                 return (Selectivity) 0; /* keep compiler quiet */
1278                 }
1279         }
1280         else
1281         {
1282                 /*
1283                  * No VACUUM ANALYZE stats available, so make a guess
1284                  */
1285                 switch (nulltesttype)
1286                 {
1287                         case IS_NULL:
1288                                 selec = DEFAULT_UNK_SEL;
1289                                 break;
1290                         case IS_NOT_NULL:
1291                                 selec = DEFAULT_NOT_UNK_SEL;
1292                                 break;
1293                         default:
1294                                 elog(ERROR, "unrecognized nulltesttype: %d",
1295                                          (int) nulltesttype);
1296                                 return (Selectivity) 0; /* keep compiler quiet */
1297                 }
1298         }
1299
1300         ReleaseVariableStats(vardata);
1301
1302         /* result should be in range, but make sure... */
1303         CLAMP_PROBABILITY(selec);
1304
1305         return (Selectivity) selec;
1306 }
1307
1308 /*
1309  *              eqjoinsel               - Join selectivity of "="
1310  */
1311 Datum
1312 eqjoinsel(PG_FUNCTION_ARGS)
1313 {
1314         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
1315         Oid                     operator = PG_GETARG_OID(1);
1316         List       *args = (List *) PG_GETARG_POINTER(2);
1317         JoinType        jointype = (JoinType) PG_GETARG_INT16(3);
1318         double          selec;
1319         VariableStatData vardata1;
1320         VariableStatData vardata2;
1321         double          nd1;
1322         double          nd2;
1323         Form_pg_statistic stats1 = NULL;
1324         Form_pg_statistic stats2 = NULL;
1325         bool            have_mcvs1 = false;
1326         Datum      *values1 = NULL;
1327         int                     nvalues1 = 0;
1328         float4     *numbers1 = NULL;
1329         int                     nnumbers1 = 0;
1330         bool            have_mcvs2 = false;
1331         Datum      *values2 = NULL;
1332         int                     nvalues2 = 0;
1333         float4     *numbers2 = NULL;
1334         int                     nnumbers2 = 0;
1335
1336         get_join_variables(root, args, &vardata1, &vardata2);
1337
1338         nd1 = get_variable_numdistinct(&vardata1);
1339         nd2 = get_variable_numdistinct(&vardata2);
1340
1341         if (HeapTupleIsValid(vardata1.statsTuple))
1342         {
1343                 stats1 = (Form_pg_statistic) GETSTRUCT(vardata1.statsTuple);
1344                 have_mcvs1 = get_attstatsslot(vardata1.statsTuple,
1345                                                                           vardata1.atttype,
1346                                                                           vardata1.atttypmod,
1347                                                                           STATISTIC_KIND_MCV,
1348                                                                           InvalidOid,
1349                                                                           &values1, &nvalues1,
1350                                                                           &numbers1, &nnumbers1);
1351         }
1352
1353         if (HeapTupleIsValid(vardata2.statsTuple))
1354         {
1355                 stats2 = (Form_pg_statistic) GETSTRUCT(vardata2.statsTuple);
1356                 have_mcvs2 = get_attstatsslot(vardata2.statsTuple,
1357                                                                           vardata2.atttype,
1358                                                                           vardata2.atttypmod,
1359                                                                           STATISTIC_KIND_MCV,
1360                                                                           InvalidOid,
1361                                                                           &values2, &nvalues2,
1362                                                                           &numbers2, &nnumbers2);
1363         }
1364
1365         if (have_mcvs1 && have_mcvs2)
1366         {
1367                 /*
1368                  * We have most-common-value lists for both relations.  Run
1369                  * through the lists to see which MCVs actually join to each other
1370                  * with the given operator.  This allows us to determine the exact
1371                  * join selectivity for the portion of the relations represented
1372                  * by the MCV lists.  We still have to estimate for the remaining
1373                  * population, but in a skewed distribution this gives us a big
1374                  * leg up in accuracy.  For motivation see the analysis in Y.
1375                  * Ioannidis and S. Christodoulakis, "On the propagation of errors
1376                  * in the size of join results", Technical Report 1018, Computer
1377                  * Science Dept., University of Wisconsin, Madison, March 1991
1378                  * (available from ftp.cs.wisc.edu).
1379                  */
1380                 FmgrInfo        eqproc;
1381                 bool       *hasmatch1;
1382                 bool       *hasmatch2;
1383                 double          nullfrac1 = stats1->stanullfrac;
1384                 double          nullfrac2 = stats2->stanullfrac;
1385                 double          matchprodfreq,
1386                                         matchfreq1,
1387                                         matchfreq2,
1388                                         unmatchfreq1,
1389                                         unmatchfreq2,
1390                                         otherfreq1,
1391                                         otherfreq2,
1392                                         totalsel1,
1393                                         totalsel2;
1394                 int                     i,
1395                                         nmatches;
1396
1397                 fmgr_info(get_opcode(operator), &eqproc);
1398                 hasmatch1 = (bool *) palloc0(nvalues1 * sizeof(bool));
1399                 hasmatch2 = (bool *) palloc0(nvalues2 * sizeof(bool));
1400
1401                 /*
1402                  * If we are doing any variant of JOIN_IN, pretend all the values
1403                  * of the righthand relation are unique (ie, act as if it's been
1404                  * DISTINCT'd).
1405                  *
1406                  * NOTE: it might seem that we should unique-ify the lefthand input
1407                  * when considering JOIN_REVERSE_IN.  But this is not so, because
1408                  * the join clause we've been handed has not been commuted from
1409                  * the way the parser originally wrote it.      We know that the
1410                  * unique side of the IN clause is *always* on the right.
1411                  *
1412                  * NOTE: it would be dangerous to try to be smart about JOIN_LEFT or
1413                  * JOIN_RIGHT here, because we do not have enough information to
1414                  * determine which var is really on which side of the join.
1415                  * Perhaps someday we should pass in more information.
1416                  */
1417                 if (jointype == JOIN_IN ||
1418                         jointype == JOIN_REVERSE_IN ||
1419                         jointype == JOIN_UNIQUE_INNER ||
1420                         jointype == JOIN_UNIQUE_OUTER)
1421                 {
1422                         float4          oneovern = 1.0 / nd2;
1423
1424                         for (i = 0; i < nvalues2; i++)
1425                                 numbers2[i] = oneovern;
1426                         nullfrac2 = oneovern;
1427                 }
1428
1429                 /*
1430                  * Note we assume that each MCV will match at most one member of
1431                  * the other MCV list.  If the operator isn't really equality,
1432                  * there could be multiple matches --- but we don't look for them,
1433                  * both for speed and because the math wouldn't add up...
1434                  */
1435                 matchprodfreq = 0.0;
1436                 nmatches = 0;
1437                 for (i = 0; i < nvalues1; i++)
1438                 {
1439                         int                     j;
1440
1441                         for (j = 0; j < nvalues2; j++)
1442                         {
1443                                 if (hasmatch2[j])
1444                                         continue;
1445                                 if (DatumGetBool(FunctionCall2(&eqproc,
1446                                                                                            values1[i],
1447                                                                                            values2[j])))
1448                                 {
1449                                         hasmatch1[i] = hasmatch2[j] = true;
1450                                         matchprodfreq += numbers1[i] * numbers2[j];
1451                                         nmatches++;
1452                                         break;
1453                                 }
1454                         }
1455                 }
1456                 CLAMP_PROBABILITY(matchprodfreq);
1457                 /* Sum up frequencies of matched and unmatched MCVs */
1458                 matchfreq1 = unmatchfreq1 = 0.0;
1459                 for (i = 0; i < nvalues1; i++)
1460                 {
1461                         if (hasmatch1[i])
1462                                 matchfreq1 += numbers1[i];
1463                         else
1464                                 unmatchfreq1 += numbers1[i];
1465                 }
1466                 CLAMP_PROBABILITY(matchfreq1);
1467                 CLAMP_PROBABILITY(unmatchfreq1);
1468                 matchfreq2 = unmatchfreq2 = 0.0;
1469                 for (i = 0; i < nvalues2; i++)
1470                 {
1471                         if (hasmatch2[i])
1472                                 matchfreq2 += numbers2[i];
1473                         else
1474                                 unmatchfreq2 += numbers2[i];
1475                 }
1476                 CLAMP_PROBABILITY(matchfreq2);
1477                 CLAMP_PROBABILITY(unmatchfreq2);
1478                 pfree(hasmatch1);
1479                 pfree(hasmatch2);
1480
1481                 /*
1482                  * Compute total frequency of non-null values that are not in the
1483                  * MCV lists.
1484                  */
1485                 otherfreq1 = 1.0 - nullfrac1 - matchfreq1 - unmatchfreq1;
1486                 otherfreq2 = 1.0 - nullfrac2 - matchfreq2 - unmatchfreq2;
1487                 CLAMP_PROBABILITY(otherfreq1);
1488                 CLAMP_PROBABILITY(otherfreq2);
1489
1490                 /*
1491                  * We can estimate the total selectivity from the point of view of
1492                  * relation 1 as: the known selectivity for matched MCVs, plus
1493                  * unmatched MCVs that are assumed to match against random members
1494                  * of relation 2's non-MCV population, plus non-MCV values that
1495                  * are assumed to match against random members of relation 2's
1496                  * unmatched MCVs plus non-MCV values.
1497                  */
1498                 totalsel1 = matchprodfreq;
1499                 if (nd2 > nvalues2)
1500                         totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - nvalues2);
1501                 if (nd2 > nmatches)
1502                         totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) /
1503                                 (nd2 - nmatches);
1504                 /* Same estimate from the point of view of relation 2. */
1505                 totalsel2 = matchprodfreq;
1506                 if (nd1 > nvalues1)
1507                         totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - nvalues1);
1508                 if (nd1 > nmatches)
1509                         totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) /
1510                                 (nd1 - nmatches);
1511
1512                 /*
1513                  * Use the smaller of the two estimates.  This can be justified in
1514                  * essentially the same terms as given below for the no-stats
1515                  * case: to a first approximation, we are estimating from the
1516                  * point of view of the relation with smaller nd.
1517                  */
1518                 selec = (totalsel1 < totalsel2) ? totalsel1 : totalsel2;
1519         }
1520         else
1521         {
1522                 /*
1523                  * We do not have MCV lists for both sides.  Estimate the join
1524                  * selectivity as MIN(1/nd1,1/nd2)*(1-nullfrac1)*(1-nullfrac2).
1525                  * This is plausible if we assume that the join operator is strict
1526                  * and the non-null values are about equally distributed: a given
1527                  * non-null tuple of rel1 will join to either zero or
1528                  * N2*(1-nullfrac2)/nd2 rows of rel2, so total join rows are at
1529                  * most N1*(1-nullfrac1)*N2*(1-nullfrac2)/nd2 giving a join
1530                  * selectivity of not more than (1-nullfrac1)*(1-nullfrac2)/nd2.
1531                  * By the same logic it is not more than
1532                  * (1-nullfrac1)*(1-nullfrac2)/nd1, so the expression with MIN()
1533                  * is an upper bound.  Using the MIN() means we estimate from the
1534                  * point of view of the relation with smaller nd (since the larger
1535                  * nd is determining the MIN).  It is reasonable to assume that
1536                  * most tuples in this rel will have join partners, so the bound
1537                  * is probably reasonably tight and should be taken as-is.
1538                  *
1539                  * XXX Can we be smarter if we have an MCV list for just one side? It
1540                  * seems that if we assume equal distribution for the other side,
1541                  * we end up with the same answer anyway.
1542                  */
1543                 double          nullfrac1 = stats1 ? stats1->stanullfrac : 0.0;
1544                 double          nullfrac2 = stats2 ? stats2->stanullfrac : 0.0;
1545
1546                 selec = (1.0 - nullfrac1) * (1.0 - nullfrac2);
1547                 if (nd1 > nd2)
1548                         selec /= nd1;
1549                 else
1550                         selec /= nd2;
1551         }
1552
1553         if (have_mcvs1)
1554                 free_attstatsslot(vardata1.atttype, values1, nvalues1,
1555                                                   numbers1, nnumbers1);
1556         if (have_mcvs2)
1557                 free_attstatsslot(vardata2.atttype, values2, nvalues2,
1558                                                   numbers2, nnumbers2);
1559
1560         ReleaseVariableStats(vardata1);
1561         ReleaseVariableStats(vardata2);
1562
1563         CLAMP_PROBABILITY(selec);
1564
1565         PG_RETURN_FLOAT8((float8) selec);
1566 }
1567
1568 /*
1569  *              neqjoinsel              - Join selectivity of "!="
1570  */
1571 Datum
1572 neqjoinsel(PG_FUNCTION_ARGS)
1573 {
1574         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
1575         Oid                     operator = PG_GETARG_OID(1);
1576         List       *args = (List *) PG_GETARG_POINTER(2);
1577         JoinType        jointype = (JoinType) PG_GETARG_INT16(3);
1578         Oid                     eqop;
1579         float8          result;
1580
1581         /*
1582          * We want 1 - eqjoinsel() where the equality operator is the one
1583          * associated with this != operator, that is, its negator.
1584          */
1585         eqop = get_negator(operator);
1586         if (eqop)
1587         {
1588                 result = DatumGetFloat8(DirectFunctionCall4(eqjoinsel,
1589                                                                                                         PointerGetDatum(root),
1590                                                                                                   ObjectIdGetDatum(eqop),
1591                                                                                                         PointerGetDatum(args),
1592                                                                                            Int16GetDatum(jointype)));
1593         }
1594         else
1595         {
1596                 /* Use default selectivity (should we raise an error instead?) */
1597                 result = DEFAULT_EQ_SEL;
1598         }
1599         result = 1.0 - result;
1600         PG_RETURN_FLOAT8(result);
1601 }
1602
1603 /*
1604  *              scalarltjoinsel - Join selectivity of "<" and "<=" for scalars
1605  */
1606 Datum
1607 scalarltjoinsel(PG_FUNCTION_ARGS)
1608 {
1609         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
1610 }
1611
1612 /*
1613  *              scalargtjoinsel - Join selectivity of ">" and ">=" for scalars
1614  */
1615 Datum
1616 scalargtjoinsel(PG_FUNCTION_ARGS)
1617 {
1618         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
1619 }
1620
1621 /*
1622  *              regexeqjoinsel  - Join selectivity of regular-expression pattern match.
1623  */
1624 Datum
1625 regexeqjoinsel(PG_FUNCTION_ARGS)
1626 {
1627         PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
1628 }
1629
1630 /*
1631  *              icregexeqjoinsel        - Join selectivity of case-insensitive regex match.
1632  */
1633 Datum
1634 icregexeqjoinsel(PG_FUNCTION_ARGS)
1635 {
1636         PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
1637 }
1638
1639 /*
1640  *              likejoinsel                     - Join selectivity of LIKE pattern match.
1641  */
1642 Datum
1643 likejoinsel(PG_FUNCTION_ARGS)
1644 {
1645         PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
1646 }
1647
1648 /*
1649  *              iclikejoinsel                   - Join selectivity of ILIKE pattern match.
1650  */
1651 Datum
1652 iclikejoinsel(PG_FUNCTION_ARGS)
1653 {
1654         PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
1655 }
1656
1657 /*
1658  *              regexnejoinsel  - Join selectivity of regex non-match.
1659  */
1660 Datum
1661 regexnejoinsel(PG_FUNCTION_ARGS)
1662 {
1663         float8          result;
1664
1665         result = DatumGetFloat8(regexeqjoinsel(fcinfo));
1666         result = 1.0 - result;
1667         PG_RETURN_FLOAT8(result);
1668 }
1669
1670 /*
1671  *              icregexnejoinsel        - Join selectivity of case-insensitive regex non-match.
1672  */
1673 Datum
1674 icregexnejoinsel(PG_FUNCTION_ARGS)
1675 {
1676         float8          result;
1677
1678         result = DatumGetFloat8(icregexeqjoinsel(fcinfo));
1679         result = 1.0 - result;
1680         PG_RETURN_FLOAT8(result);
1681 }
1682
1683 /*
1684  *              nlikejoinsel            - Join selectivity of LIKE pattern non-match.
1685  */
1686 Datum
1687 nlikejoinsel(PG_FUNCTION_ARGS)
1688 {
1689         float8          result;
1690
1691         result = DatumGetFloat8(likejoinsel(fcinfo));
1692         result = 1.0 - result;
1693         PG_RETURN_FLOAT8(result);
1694 }
1695
1696 /*
1697  *              icnlikejoinsel          - Join selectivity of ILIKE pattern non-match.
1698  */
1699 Datum
1700 icnlikejoinsel(PG_FUNCTION_ARGS)
1701 {
1702         float8          result;
1703
1704         result = DatumGetFloat8(iclikejoinsel(fcinfo));
1705         result = 1.0 - result;
1706         PG_RETURN_FLOAT8(result);
1707 }
1708
1709 /*
1710  * mergejoinscansel                     - Scan selectivity of merge join.
1711  *
1712  * A merge join will stop as soon as it exhausts either input stream.
1713  * Therefore, if we can estimate the ranges of both input variables,
1714  * we can estimate how much of the input will actually be read.  This
1715  * can have a considerable impact on the cost when using indexscans.
1716  *
1717  * clause should be a clause already known to be mergejoinable.
1718  *
1719  * *leftscan is set to the fraction of the left-hand variable expected
1720  * to be scanned (0 to 1), and similarly *rightscan for the right-hand
1721  * variable.
1722  */
1723 void
1724 mergejoinscansel(PlannerInfo *root, Node *clause,
1725                                  Selectivity *leftscan,
1726                                  Selectivity *rightscan)
1727 {
1728         Node       *left,
1729                            *right;
1730         VariableStatData leftvar,
1731                                 rightvar;
1732         Oid                     lefttype,
1733                                 righttype;
1734         Oid                     opno,
1735                                 lsortop,
1736                                 rsortop,
1737                                 ltop,
1738                                 gtop,
1739                                 leop,
1740                                 revgtop,
1741                                 revleop;
1742         Datum           leftmax,
1743                                 rightmax;
1744         double          selec;
1745
1746         /* Set default results if we can't figure anything out. */
1747         *leftscan = *rightscan = 1.0;
1748
1749         /* Deconstruct the merge clause */
1750         if (!is_opclause(clause))
1751                 return;                                 /* shouldn't happen */
1752         opno = ((OpExpr *) clause)->opno;
1753         left = get_leftop((Expr *) clause);
1754         right = get_rightop((Expr *) clause);
1755         if (!right)
1756                 return;                                 /* shouldn't happen */
1757
1758         /* Look for stats for the inputs */
1759         examine_variable(root, left, 0, &leftvar);
1760         examine_variable(root, right, 0, &rightvar);
1761
1762         /* Get the direct input types of the operator */
1763         lefttype = exprType(left);
1764         righttype = exprType(right);
1765
1766         /* Verify mergejoinability and get left and right "<" operators */
1767         if (!op_mergejoinable(opno,
1768                                                   &lsortop,
1769                                                   &rsortop))
1770                 goto fail;                              /* shouldn't happen */
1771
1772         /* Try to get maximum values of both inputs */
1773         if (!get_variable_maximum(root, &leftvar, lsortop, &leftmax))
1774                 goto fail;                              /* no max available from stats */
1775
1776         if (!get_variable_maximum(root, &rightvar, rsortop, &rightmax))
1777                 goto fail;                              /* no max available from stats */
1778
1779         /* Look up the "left < right" and "left > right" operators */
1780         op_mergejoin_crossops(opno, &ltop, &gtop, NULL, NULL);
1781
1782         /* Look up the "left <= right" operator */
1783         leop = get_negator(gtop);
1784         if (!OidIsValid(leop))
1785                 goto fail;                              /* insufficient info in catalogs */
1786
1787         /* Look up the "right > left" operator */
1788         revgtop = get_commutator(ltop);
1789         if (!OidIsValid(revgtop))
1790                 goto fail;                              /* insufficient info in catalogs */
1791
1792         /* Look up the "right <= left" operator */
1793         revleop = get_negator(revgtop);
1794         if (!OidIsValid(revleop))
1795                 goto fail;                              /* insufficient info in catalogs */
1796
1797         /*
1798          * Now, the fraction of the left variable that will be scanned is the
1799          * fraction that's <= the right-side maximum value.  But only believe
1800          * non-default estimates, else stick with our 1.0.
1801          */
1802         selec = scalarineqsel(root, leop, false, &leftvar,
1803                                                   rightmax, righttype);
1804         if (selec != DEFAULT_INEQ_SEL)
1805                 *leftscan = selec;
1806
1807         /* And similarly for the right variable. */
1808         selec = scalarineqsel(root, revleop, false, &rightvar,
1809                                                   leftmax, lefttype);
1810         if (selec != DEFAULT_INEQ_SEL)
1811                 *rightscan = selec;
1812
1813         /*
1814          * Only one of the two fractions can really be less than 1.0; believe
1815          * the smaller estimate and reset the other one to exactly 1.0.  If we
1816          * get exactly equal estimates (as can easily happen with self-joins),
1817          * believe neither.
1818          */
1819         if (*leftscan > *rightscan)
1820                 *leftscan = 1.0;
1821         else if (*leftscan < *rightscan)
1822                 *rightscan = 1.0;
1823         else
1824                 *leftscan = *rightscan = 1.0;
1825
1826 fail:
1827         ReleaseVariableStats(leftvar);
1828         ReleaseVariableStats(rightvar);
1829 }
1830
1831
1832 /*
1833  * Helper routine for estimate_num_groups: add an item to a list of
1834  * GroupVarInfos, but only if it's not known equal to any of the existing
1835  * entries.
1836  */
1837 typedef struct
1838 {
1839         Node       *var;                /* might be an expression, not just a Var */
1840         RelOptInfo *rel;                /* relation it belongs to */
1841         double          ndistinct;      /* # distinct values */
1842 } GroupVarInfo;
1843
1844 static List *
1845 add_unique_group_var(PlannerInfo *root, List *varinfos,
1846                                          Node *var, VariableStatData *vardata)
1847 {
1848         GroupVarInfo *varinfo;
1849         double          ndistinct;
1850         ListCell   *lc;
1851
1852         ndistinct = get_variable_numdistinct(vardata);
1853
1854         /* cannot use foreach here because of possible list_delete */
1855         lc = list_head(varinfos);
1856         while (lc)
1857         {
1858                 varinfo = (GroupVarInfo *) lfirst(lc);
1859
1860                 /* must advance lc before list_delete possibly pfree's it */
1861                 lc = lnext(lc);
1862
1863                 /* Drop exact duplicates */
1864                 if (equal(var, varinfo->var))
1865                         return varinfos;
1866
1867                 /*
1868                  * Drop known-equal vars, but only if they belong to different
1869                  * relations (see comments for estimate_num_groups)
1870                  */
1871                 if (vardata->rel != varinfo->rel &&
1872                         exprs_known_equal(root, var, varinfo->var))
1873                 {
1874                         if (varinfo->ndistinct <= ndistinct)
1875                         {
1876                                 /* Keep older item, forget new one */
1877                                 return varinfos;
1878                         }
1879                         else
1880                         {
1881                                 /* Delete the older item */
1882                                 varinfos = list_delete_ptr(varinfos, varinfo);
1883                         }
1884                 }
1885         }
1886
1887         varinfo = (GroupVarInfo *) palloc(sizeof(GroupVarInfo));
1888
1889         varinfo->var = var;
1890         varinfo->rel = vardata->rel;
1891         varinfo->ndistinct = ndistinct;
1892         varinfos = lappend(varinfos, varinfo);
1893         return varinfos;
1894 }
1895
1896 /*
1897  * estimate_num_groups          - Estimate number of groups in a grouped query
1898  *
1899  * Given a query having a GROUP BY clause, estimate how many groups there
1900  * will be --- ie, the number of distinct combinations of the GROUP BY
1901  * expressions.
1902  *
1903  * This routine is also used to estimate the number of rows emitted by
1904  * a DISTINCT filtering step; that is an isomorphic problem.  (Note:
1905  * actually, we only use it for DISTINCT when there's no grouping or
1906  * aggregation ahead of the DISTINCT.)
1907  *
1908  * Inputs:
1909  *      root - the query
1910  *      groupExprs - list of expressions being grouped by
1911  *      input_rows - number of rows estimated to arrive at the group/unique
1912  *              filter step
1913  *
1914  * Given the lack of any cross-correlation statistics in the system, it's
1915  * impossible to do anything really trustworthy with GROUP BY conditions
1916  * involving multiple Vars.  We should however avoid assuming the worst
1917  * case (all possible cross-product terms actually appear as groups) since
1918  * very often the grouped-by Vars are highly correlated.  Our current approach
1919  * is as follows:
1920  *      1.      Reduce the given expressions to a list of unique Vars used.  For
1921  *              example, GROUP BY a, a + b is treated the same as GROUP BY a, b.
1922  *              It is clearly correct not to count the same Var more than once.
1923  *              It is also reasonable to treat f(x) the same as x: f() cannot
1924  *              increase the number of distinct values (unless it is volatile,
1925  *              which we consider unlikely for grouping), but it probably won't
1926  *              reduce the number of distinct values much either.
1927  *              As a special case, if a GROUP BY expression can be matched to an
1928  *              expressional index for which we have statistics, then we treat the
1929  *              whole expression as though it were just a Var.
1930  *      2.      If the list contains Vars of different relations that are known equal
1931  *              due to equijoin clauses, then drop all but one of the Vars from each
1932  *              known-equal set, keeping the one with smallest estimated # of values
1933  *              (since the extra values of the others can't appear in joined rows).
1934  *              Note the reason we only consider Vars of different relations is that
1935  *              if we considered ones of the same rel, we'd be double-counting the
1936  *              restriction selectivity of the equality in the next step.
1937  *      3.      For Vars within a single source rel, we multiply together the numbers
1938  *              of values, clamp to the number of rows in the rel (divided by 10 if
1939  *              more than one Var), and then multiply by the selectivity of the
1940  *              restriction clauses for that rel.  When there's more than one Var,
1941  *              the initial product is probably too high (it's the worst case) but
1942  *              clamping to a fraction of the rel's rows seems to be a helpful
1943  *              heuristic for not letting the estimate get out of hand.  (The factor
1944  *              of 10 is derived from pre-Postgres-7.4 practice.)  Multiplying
1945  *              by the restriction selectivity is effectively assuming that the
1946  *              restriction clauses are independent of the grouping, which is a crummy
1947  *              assumption, but it's hard to do better.
1948  *      4.      If there are Vars from multiple rels, we repeat step 3 for each such
1949  *              rel, and multiply the results together.
1950  * Note that rels not containing grouped Vars are ignored completely, as are
1951  * join clauses other than the equijoin clauses used in step 2.  Such rels
1952  * cannot increase the number of groups, and we assume such clauses do not
1953  * reduce the number either (somewhat bogus, but we don't have the info to
1954  * do better).
1955  */
1956 double
1957 estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows)
1958 {
1959         List       *varinfos = NIL;
1960         double          numdistinct;
1961         ListCell   *l;
1962
1963         /* We should not be called unless query has GROUP BY (or DISTINCT) */
1964         Assert(groupExprs != NIL);
1965
1966         /*
1967          * Steps 1/2: find the unique Vars used, treating an expression as a Var
1968          * if we can find stats for it.  For each one, record the statistical
1969          * estimate of number of distinct values (total in its table, without
1970          * regard for filtering).
1971          */
1972         foreach(l, groupExprs)
1973         {
1974                 Node       *groupexpr = (Node *) lfirst(l);
1975                 VariableStatData vardata;
1976                 List       *varshere;
1977                 ListCell   *l2;
1978
1979                 /*
1980                  * If examine_variable is able to deduce anything about the GROUP BY
1981                  * expression, treat it as a single variable even if it's really more
1982                  * complicated.
1983                  */
1984                 examine_variable(root, groupexpr, 0, &vardata);
1985                 if (vardata.statsTuple != NULL || vardata.isunique)
1986                 {
1987                         varinfos = add_unique_group_var(root, varinfos,
1988                                                                                         groupexpr, &vardata);
1989                         ReleaseVariableStats(vardata);
1990                         continue;
1991                 }
1992                 ReleaseVariableStats(vardata);
1993
1994                 /*
1995                  * Else pull out the component Vars
1996                  */
1997                 varshere = pull_var_clause(groupexpr, false);
1998
1999                 /*
2000                  * If we find any variable-free GROUP BY item, then either it is a
2001                  * constant (and we can ignore it) or it contains a volatile
2002                  * function; in the latter case we punt and assume that each input
2003                  * row will yield a distinct group.
2004                  */
2005                 if (varshere == NIL)
2006                 {
2007                         if (contain_volatile_functions(groupexpr))
2008                                 return input_rows;
2009                         continue;
2010                 }
2011
2012                 /*
2013                  * Else add variables to varinfos list
2014                  */
2015                 foreach(l2, varshere)
2016                 {
2017                         Node       *var = (Node *) lfirst(l2);
2018
2019                         examine_variable(root, var, 0, &vardata);
2020                         varinfos = add_unique_group_var(root, varinfos, var, &vardata);
2021                         ReleaseVariableStats(vardata);
2022                 }
2023         }
2024
2025         /* If now no Vars, we must have an all-constant GROUP BY list. */
2026         if (varinfos == NIL)
2027                 return 1.0;
2028
2029         /*
2030          * Steps 3/4: group Vars by relation and estimate total numdistinct.
2031          *
2032          * For each iteration of the outer loop, we process the frontmost Var in
2033          * varinfos, plus all other Vars in the same relation.  We remove
2034          * these Vars from the newvarinfos list for the next iteration. This
2035          * is the easiest way to group Vars of same rel together.
2036          */
2037         numdistinct = 1.0;
2038
2039         do
2040         {
2041                 GroupVarInfo *varinfo1 = (GroupVarInfo *) linitial(varinfos);
2042                 RelOptInfo *rel = varinfo1->rel;
2043                 double          reldistinct = varinfo1->ndistinct;
2044                 double          relmaxndistinct = reldistinct;
2045                 int                     relvarcount = 1;
2046                 List       *newvarinfos = NIL;
2047
2048                 /*
2049                  * Get the product of numdistinct estimates of the Vars for this rel.
2050                  * Also, construct new varinfos list of remaining Vars.
2051                  */
2052                 for_each_cell(l, lnext(list_head(varinfos)))
2053                 {
2054                         GroupVarInfo *varinfo2 = (GroupVarInfo *) lfirst(l);
2055
2056                         if (varinfo2->rel == varinfo1->rel)
2057                         {
2058                                 reldistinct *= varinfo2->ndistinct;
2059                                 if (relmaxndistinct < varinfo2->ndistinct)
2060                                         relmaxndistinct = varinfo2->ndistinct;
2061                                 relvarcount++;
2062                         }
2063                         else
2064                         {
2065                                 /* not time to process varinfo2 yet */
2066                                 newvarinfos = lcons(varinfo2, newvarinfos);
2067                         }
2068                 }
2069
2070                 /*
2071                  * Sanity check --- don't divide by zero if empty relation.
2072                  */
2073                 Assert(rel->reloptkind == RELOPT_BASEREL);
2074                 if (rel->tuples > 0)
2075                 {
2076                         /*
2077                          * Clamp to size of rel, or size of rel / 10 if multiple Vars.
2078                          * The fudge factor is because the Vars are probably correlated
2079                          * but we don't know by how much.  We should never clamp to less
2080                          * than the largest ndistinct value for any of the Vars, though,
2081                          * since there will surely be at least that many groups.
2082                          */
2083                         double          clamp = rel->tuples;
2084
2085                         if (relvarcount > 1)
2086                         {
2087                                 clamp *= 0.1;
2088                                 if (clamp < relmaxndistinct)
2089                                 {
2090                                         clamp = relmaxndistinct;
2091                                         /* for sanity in case some ndistinct is too large: */
2092                                         if (clamp > rel->tuples)
2093                                                 clamp = rel->tuples;
2094                                 }
2095                         }
2096                         if (reldistinct > clamp)
2097                                 reldistinct = clamp;
2098
2099                         /*
2100                          * Multiply by restriction selectivity.
2101                          */
2102                         reldistinct *= rel->rows / rel->tuples;
2103
2104                         /*
2105                          * Update estimate of total distinct groups.
2106                          */
2107                         numdistinct *= reldistinct;
2108                 }
2109
2110                 varinfos = newvarinfos;
2111         } while (varinfos != NIL);
2112
2113         numdistinct = ceil(numdistinct);
2114
2115         /* Guard against out-of-range answers */
2116         if (numdistinct > input_rows)
2117                 numdistinct = input_rows;
2118         if (numdistinct < 1.0)
2119                 numdistinct = 1.0;
2120
2121         return numdistinct;
2122 }
2123
2124 /*
2125  * Estimate hash bucketsize fraction (ie, number of entries in a bucket
2126  * divided by total tuples in relation) if the specified expression is used
2127  * as a hash key.
2128  *
2129  * XXX This is really pretty bogus since we're effectively assuming that the
2130  * distribution of hash keys will be the same after applying restriction
2131  * clauses as it was in the underlying relation.  However, we are not nearly
2132  * smart enough to figure out how the restrict clauses might change the
2133  * distribution, so this will have to do for now.
2134  *
2135  * We are passed the number of buckets the executor will use for the given
2136  * input relation.      If the data were perfectly distributed, with the same
2137  * number of tuples going into each available bucket, then the bucketsize
2138  * fraction would be 1/nbuckets.  But this happy state of affairs will occur
2139  * only if (a) there are at least nbuckets distinct data values, and (b)
2140  * we have a not-too-skewed data distribution.  Otherwise the buckets will
2141  * be nonuniformly occupied.  If the other relation in the join has a key
2142  * distribution similar to this one's, then the most-loaded buckets are
2143  * exactly those that will be probed most often.  Therefore, the "average"
2144  * bucket size for costing purposes should really be taken as something close
2145  * to the "worst case" bucket size.  We try to estimate this by adjusting the
2146  * fraction if there are too few distinct data values, and then scaling up
2147  * by the ratio of the most common value's frequency to the average frequency.
2148  *
2149  * If no statistics are available, use a default estimate of 0.1.  This will
2150  * discourage use of a hash rather strongly if the inner relation is large,
2151  * which is what we want.  We do not want to hash unless we know that the
2152  * inner rel is well-dispersed (or the alternatives seem much worse).
2153  */
2154 Selectivity
2155 estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey, double nbuckets)
2156 {
2157         VariableStatData vardata;
2158         double          estfract,
2159                                 ndistinct,
2160                                 stanullfrac,
2161                                 mcvfreq,
2162                                 avgfreq;
2163         float4     *numbers;
2164         int                     nnumbers;
2165
2166         examine_variable(root, hashkey, 0, &vardata);
2167
2168         /* Get number of distinct values and fraction that are null */
2169         ndistinct = get_variable_numdistinct(&vardata);
2170
2171         if (HeapTupleIsValid(vardata.statsTuple))
2172         {
2173                 Form_pg_statistic stats;
2174
2175                 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
2176                 stanullfrac = stats->stanullfrac;
2177         }
2178         else
2179         {
2180                 /*
2181                  * Believe a default ndistinct only if it came from stats.
2182                  * Otherwise punt and return 0.1, per comments above.
2183                  */
2184                 if (ndistinct == DEFAULT_NUM_DISTINCT)
2185                 {
2186                         ReleaseVariableStats(vardata);
2187                         return (Selectivity) 0.1;
2188                 }
2189
2190                 stanullfrac = 0.0;
2191         }
2192
2193         /* Compute avg freq of all distinct data values in raw relation */
2194         avgfreq = (1.0 - stanullfrac) / ndistinct;
2195
2196         /*
2197          * Adjust ndistinct to account for restriction clauses.  Observe we
2198          * are assuming that the data distribution is affected uniformly by
2199          * the restriction clauses!
2200          *
2201          * XXX Possibly better way, but much more expensive: multiply by
2202          * selectivity of rel's restriction clauses that mention the target
2203          * Var.
2204          */
2205         if (vardata.rel)
2206                 ndistinct *= vardata.rel->rows / vardata.rel->tuples;
2207
2208         /*
2209          * Initial estimate of bucketsize fraction is 1/nbuckets as long as
2210          * the number of buckets is less than the expected number of distinct
2211          * values; otherwise it is 1/ndistinct.
2212          */
2213         if (ndistinct > nbuckets)
2214                 estfract = 1.0 / nbuckets;
2215         else
2216                 estfract = 1.0 / ndistinct;
2217
2218         /*
2219          * Look up the frequency of the most common value, if available.
2220          */
2221         mcvfreq = 0.0;
2222
2223         if (HeapTupleIsValid(vardata.statsTuple))
2224         {
2225                 if (get_attstatsslot(vardata.statsTuple,
2226                                                          vardata.atttype, vardata.atttypmod,
2227                                                          STATISTIC_KIND_MCV, InvalidOid,
2228                                                          NULL, NULL, &numbers, &nnumbers))
2229                 {
2230                         /*
2231                          * The first MCV stat is for the most common value.
2232                          */
2233                         if (nnumbers > 0)
2234                                 mcvfreq = numbers[0];
2235                         free_attstatsslot(vardata.atttype, NULL, 0,
2236                                                           numbers, nnumbers);
2237                 }
2238         }
2239
2240         /*
2241          * Adjust estimated bucketsize upward to account for skewed
2242          * distribution.
2243          */
2244         if (avgfreq > 0.0 && mcvfreq > avgfreq)
2245                 estfract *= mcvfreq / avgfreq;
2246
2247         /*
2248          * Clamp bucketsize to sane range (the above adjustment could easily
2249          * produce an out-of-range result).  We set the lower bound a little
2250          * above zero, since zero isn't a very sane result.
2251          */
2252         if (estfract < 1.0e-6)
2253                 estfract = 1.0e-6;
2254         else if (estfract > 1.0)
2255                 estfract = 1.0;
2256
2257         ReleaseVariableStats(vardata);
2258
2259         return (Selectivity) estfract;
2260 }
2261
2262
2263 /*-------------------------------------------------------------------------
2264  *
2265  * Support routines
2266  *
2267  *-------------------------------------------------------------------------
2268  */
2269
2270 /*
2271  * convert_to_scalar
2272  *        Convert non-NULL values of the indicated types to the comparison
2273  *        scale needed by scalarltsel()/scalargtsel().
2274  *        Returns "true" if successful.
2275  *
2276  * XXX this routine is a hack: ideally we should look up the conversion
2277  * subroutines in pg_type.
2278  *
2279  * All numeric datatypes are simply converted to their equivalent
2280  * "double" values.  (NUMERIC values that are outside the range of "double"
2281  * are clamped to +/- HUGE_VAL.)
2282  *
2283  * String datatypes are converted by convert_string_to_scalar(),
2284  * which is explained below.  The reason why this routine deals with
2285  * three values at a time, not just one, is that we need it for strings.
2286  *
2287  * The bytea datatype is just enough different from strings that it has
2288  * to be treated separately.
2289  *
2290  * The several datatypes representing absolute times are all converted
2291  * to Timestamp, which is actually a double, and then we just use that
2292  * double value.  Note this will give correct results even for the "special"
2293  * values of Timestamp, since those are chosen to compare correctly;
2294  * see timestamp_cmp.
2295  *
2296  * The several datatypes representing relative times (intervals) are all
2297  * converted to measurements expressed in seconds.
2298  */
2299 static bool
2300 convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
2301                                   Datum lobound, Datum hibound, Oid boundstypid,
2302                                   double *scaledlobound, double *scaledhibound)
2303 {
2304         /*
2305          * Both the valuetypid and the boundstypid should exactly match
2306          * the declared input type(s) of the operator we are invoked for,
2307          * so we just error out if either is not recognized.
2308          *
2309          * XXX The histogram we are interpolating between points of could belong
2310          * to a column that's only binary-compatible with the declared type.
2311          * In essence we are assuming that the semantics of binary-compatible
2312          * types are enough alike that we can use a histogram generated with one
2313          * type's operators to estimate selectivity for the other's.  This is
2314          * outright wrong in some cases --- in particular signed versus unsigned
2315          * interpretation could trip us up.  But it's useful enough in the
2316          * majority of cases that we do it anyway.  Should think about more
2317          * rigorous ways to do it.
2318          */
2319         switch (valuetypid)
2320         {
2321                         /*
2322                          * Built-in numeric types
2323                          */
2324                 case BOOLOID:
2325                 case INT2OID:
2326                 case INT4OID:
2327                 case INT8OID:
2328                 case FLOAT4OID:
2329                 case FLOAT8OID:
2330                 case NUMERICOID:
2331                 case OIDOID:
2332                 case REGPROCOID:
2333                 case REGPROCEDUREOID:
2334                 case REGOPEROID:
2335                 case REGOPERATOROID:
2336                 case REGCLASSOID:
2337                 case REGTYPEOID:
2338                         *scaledvalue = convert_numeric_to_scalar(value, valuetypid);
2339                         *scaledlobound = convert_numeric_to_scalar(lobound, boundstypid);
2340                         *scaledhibound = convert_numeric_to_scalar(hibound, boundstypid);
2341                         return true;
2342
2343                         /*
2344                          * Built-in string types
2345                          */
2346                 case CHAROID:
2347                 case BPCHAROID:
2348                 case VARCHAROID:
2349                 case TEXTOID:
2350                 case NAMEOID:
2351                         {
2352                                 unsigned char *valstr = convert_string_datum(value, valuetypid);
2353                                 unsigned char *lostr = convert_string_datum(lobound, boundstypid);
2354                                 unsigned char *histr = convert_string_datum(hibound, boundstypid);
2355
2356                                 convert_string_to_scalar(valstr, scaledvalue,
2357                                                                                  lostr, scaledlobound,
2358                                                                                  histr, scaledhibound);
2359                                 pfree(valstr);
2360                                 pfree(lostr);
2361                                 pfree(histr);
2362                                 return true;
2363                         }
2364
2365                         /*
2366                          * Built-in bytea type
2367                          */
2368                 case BYTEAOID:
2369                         {
2370                                 convert_bytea_to_scalar(value, scaledvalue,
2371                                                                                 lobound, scaledlobound,
2372                                                                                 hibound, scaledhibound);
2373                                 return true;
2374                         }
2375
2376                         /*
2377                          * Built-in time types
2378                          */
2379                 case TIMESTAMPOID:
2380                 case TIMESTAMPTZOID:
2381                 case ABSTIMEOID:
2382                 case DATEOID:
2383                 case INTERVALOID:
2384                 case RELTIMEOID:
2385                 case TINTERVALOID:
2386                 case TIMEOID:
2387                 case TIMETZOID:
2388                         *scaledvalue = convert_timevalue_to_scalar(value, valuetypid);
2389                         *scaledlobound = convert_timevalue_to_scalar(lobound, boundstypid);
2390                         *scaledhibound = convert_timevalue_to_scalar(hibound, boundstypid);
2391                         return true;
2392
2393                         /*
2394                          * Built-in network types
2395                          */
2396                 case INETOID:
2397                 case CIDROID:
2398                 case MACADDROID:
2399                         *scaledvalue = convert_network_to_scalar(value, valuetypid);
2400                         *scaledlobound = convert_network_to_scalar(lobound, boundstypid);
2401                         *scaledhibound = convert_network_to_scalar(hibound, boundstypid);
2402                         return true;
2403         }
2404         /* Don't know how to convert */
2405         return false;
2406 }
2407
2408 /*
2409  * Do convert_to_scalar()'s work for any numeric data type.
2410  */
2411 static double
2412 convert_numeric_to_scalar(Datum value, Oid typid)
2413 {
2414         switch (typid)
2415         {
2416                 case BOOLOID:
2417                         return (double) DatumGetBool(value);
2418                 case INT2OID:
2419                         return (double) DatumGetInt16(value);
2420                 case INT4OID:
2421                         return (double) DatumGetInt32(value);
2422                 case INT8OID:
2423                         return (double) DatumGetInt64(value);
2424                 case FLOAT4OID:
2425                         return (double) DatumGetFloat4(value);
2426                 case FLOAT8OID:
2427                         return (double) DatumGetFloat8(value);
2428                 case NUMERICOID:
2429                         /* Note: out-of-range values will be clamped to +-HUGE_VAL */
2430                         return (double)
2431                                 DatumGetFloat8(DirectFunctionCall1(numeric_float8_no_overflow,
2432                                                                                                    value));
2433                 case OIDOID:
2434                 case REGPROCOID:
2435                 case REGPROCEDUREOID:
2436                 case REGOPEROID:
2437                 case REGOPERATOROID:
2438                 case REGCLASSOID:
2439                 case REGTYPEOID:
2440                         /* we can treat OIDs as integers... */
2441                         return (double) DatumGetObjectId(value);
2442         }
2443
2444         /*
2445          * Can't get here unless someone tries to use scalarltsel/scalargtsel
2446          * on an operator with one numeric and one non-numeric operand.
2447          */
2448         elog(ERROR, "unsupported type: %u", typid);
2449         return 0;
2450 }
2451
2452 /*
2453  * Do convert_to_scalar()'s work for any character-string data type.
2454  *
2455  * String datatypes are converted to a scale that ranges from 0 to 1,
2456  * where we visualize the bytes of the string as fractional digits.
2457  *
2458  * We do not want the base to be 256, however, since that tends to
2459  * generate inflated selectivity estimates; few databases will have
2460  * occurrences of all 256 possible byte values at each position.
2461  * Instead, use the smallest and largest byte values seen in the bounds
2462  * as the estimated range for each byte, after some fudging to deal with
2463  * the fact that we probably aren't going to see the full range that way.
2464  *
2465  * An additional refinement is that we discard any common prefix of the
2466  * three strings before computing the scaled values.  This allows us to
2467  * "zoom in" when we encounter a narrow data range.  An example is a phone
2468  * number database where all the values begin with the same area code.
2469  * (Actually, the bounds will be adjacent histogram-bin-boundary values,
2470  * so this is more likely to happen than you might think.)
2471  */
2472 static void
2473 convert_string_to_scalar(unsigned char *value,
2474                                                  double *scaledvalue,
2475                                                  unsigned char *lobound,
2476                                                  double *scaledlobound,
2477                                                  unsigned char *hibound,
2478                                                  double *scaledhibound)
2479 {
2480         int                     rangelo,
2481                                 rangehi;
2482         unsigned char *sptr;
2483
2484         rangelo = rangehi = hibound[0];
2485         for (sptr = lobound; *sptr; sptr++)
2486         {
2487                 if (rangelo > *sptr)
2488                         rangelo = *sptr;
2489                 if (rangehi < *sptr)
2490                         rangehi = *sptr;
2491         }
2492         for (sptr = hibound; *sptr; sptr++)
2493         {
2494                 if (rangelo > *sptr)
2495                         rangelo = *sptr;
2496                 if (rangehi < *sptr)
2497                         rangehi = *sptr;
2498         }
2499         /* If range includes any upper-case ASCII chars, make it include all */
2500         if (rangelo <= 'Z' && rangehi >= 'A')
2501         {
2502                 if (rangelo > 'A')
2503                         rangelo = 'A';
2504                 if (rangehi < 'Z')
2505                         rangehi = 'Z';
2506         }
2507         /* Ditto lower-case */
2508         if (rangelo <= 'z' && rangehi >= 'a')
2509         {
2510                 if (rangelo > 'a')
2511                         rangelo = 'a';
2512                 if (rangehi < 'z')
2513                         rangehi = 'z';
2514         }
2515         /* Ditto digits */
2516         if (rangelo <= '9' && rangehi >= '0')
2517         {
2518                 if (rangelo > '0')
2519                         rangelo = '0';
2520                 if (rangehi < '9')
2521                         rangehi = '9';
2522         }
2523
2524         /*
2525          * If range includes less than 10 chars, assume we have not got enough
2526          * data, and make it include regular ASCII set.
2527          */
2528         if (rangehi - rangelo < 9)
2529         {
2530                 rangelo = ' ';
2531                 rangehi = 127;
2532         }
2533
2534         /*
2535          * Now strip any common prefix of the three strings.
2536          */
2537         while (*lobound)
2538         {
2539                 if (*lobound != *hibound || *lobound != *value)
2540                         break;
2541                 lobound++, hibound++, value++;
2542         }
2543
2544         /*
2545          * Now we can do the conversions.
2546          */
2547         *scaledvalue = convert_one_string_to_scalar(value, rangelo, rangehi);
2548         *scaledlobound = convert_one_string_to_scalar(lobound, rangelo, rangehi);
2549         *scaledhibound = convert_one_string_to_scalar(hibound, rangelo, rangehi);
2550 }
2551
2552 static double
2553 convert_one_string_to_scalar(unsigned char *value, int rangelo, int rangehi)
2554 {
2555         int                     slen = strlen((char *) value);
2556         double          num,
2557                                 denom,
2558                                 base;
2559
2560         if (slen <= 0)
2561                 return 0.0;                             /* empty string has scalar value 0 */
2562
2563         /*
2564          * Since base is at least 10, need not consider more than about 20
2565          * chars
2566          */
2567         if (slen > 20)
2568                 slen = 20;
2569
2570         /* Convert initial characters to fraction */
2571         base = rangehi - rangelo + 1;
2572         num = 0.0;
2573         denom = base;
2574         while (slen-- > 0)
2575         {
2576                 int                     ch = *value++;
2577
2578                 if (ch < rangelo)
2579                         ch = rangelo - 1;
2580                 else if (ch > rangehi)
2581                         ch = rangehi + 1;
2582                 num += ((double) (ch - rangelo)) / denom;
2583                 denom *= base;
2584         }
2585
2586         return num;
2587 }
2588
2589 /*
2590  * Convert a string-type Datum into a palloc'd, null-terminated string.
2591  *
2592  * When using a non-C locale, we must pass the string through strxfrm()
2593  * before continuing, so as to generate correct locale-specific results.
2594  */
2595 static unsigned char *
2596 convert_string_datum(Datum value, Oid typid)
2597 {
2598         char       *val;
2599
2600         switch (typid)
2601         {
2602                 case CHAROID:
2603                         val = (char *) palloc(2);
2604                         val[0] = DatumGetChar(value);
2605                         val[1] = '\0';
2606                         break;
2607                 case BPCHAROID:
2608                 case VARCHAROID:
2609                 case TEXTOID:
2610                         {
2611                                 char       *str = (char *) VARDATA(DatumGetPointer(value));
2612                                 int                     strlength = VARSIZE(DatumGetPointer(value)) - VARHDRSZ;
2613
2614                                 val = (char *) palloc(strlength + 1);
2615                                 memcpy(val, str, strlength);
2616                                 val[strlength] = '\0';
2617                                 break;
2618                         }
2619                 case NAMEOID:
2620                         {
2621                                 NameData   *nm = (NameData *) DatumGetPointer(value);
2622
2623                                 val = pstrdup(NameStr(*nm));
2624                                 break;
2625                         }
2626                 default:
2627
2628                         /*
2629                          * Can't get here unless someone tries to use scalarltsel on
2630                          * an operator with one string and one non-string operand.
2631                          */
2632                         elog(ERROR, "unsupported type: %u", typid);
2633                         return NULL;
2634         }
2635
2636         if (!lc_collate_is_c())
2637         {
2638                 char       *xfrmstr;
2639                 size_t          xfrmlen;
2640                 size_t          xfrmlen2;
2641
2642                 /*
2643                  * Note: originally we guessed at a suitable output buffer size,
2644                  * and only needed to call strxfrm twice if our guess was too
2645                  * small. However, it seems that some versions of Solaris have
2646                  * buggy strxfrm that can write past the specified buffer length
2647                  * in that scenario.  So, do it the dumb way for portability.
2648                  *
2649                  * Yet other systems (e.g., glibc) sometimes return a smaller value
2650                  * from the second call than the first; thus the Assert must be <=
2651                  * not == as you'd expect.  Can't any of these people program
2652                  * their way out of a paper bag?
2653                  */
2654                 xfrmlen = strxfrm(NULL, val, 0);
2655                 xfrmstr = (char *) palloc(xfrmlen + 1);
2656                 xfrmlen2 = strxfrm(xfrmstr, val, xfrmlen + 1);
2657                 Assert(xfrmlen2 <= xfrmlen);
2658                 pfree(val);
2659                 val = xfrmstr;
2660         }
2661
2662         return (unsigned char *) val;
2663 }
2664
2665 /*
2666  * Do convert_to_scalar()'s work for any bytea data type.
2667  *
2668  * Very similar to convert_string_to_scalar except we can't assume
2669  * null-termination and therefore pass explicit lengths around.
2670  *
2671  * Also, assumptions about likely "normal" ranges of characters have been
2672  * removed - a data range of 0..255 is always used, for now.  (Perhaps
2673  * someday we will add information about actual byte data range to
2674  * pg_statistic.)
2675  */
2676 static void
2677 convert_bytea_to_scalar(Datum value,
2678                                                 double *scaledvalue,
2679                                                 Datum lobound,
2680                                                 double *scaledlobound,
2681                                                 Datum hibound,
2682                                                 double *scaledhibound)
2683 {
2684         int                     rangelo,
2685                                 rangehi,
2686                                 valuelen = VARSIZE(DatumGetPointer(value)) - VARHDRSZ,
2687                                 loboundlen = VARSIZE(DatumGetPointer(lobound)) - VARHDRSZ,
2688                                 hiboundlen = VARSIZE(DatumGetPointer(hibound)) - VARHDRSZ,
2689                                 i,
2690                                 minlen;
2691         unsigned char *valstr = (unsigned char *) VARDATA(DatumGetPointer(value)),
2692                            *lostr = (unsigned char *) VARDATA(DatumGetPointer(lobound)),
2693                            *histr = (unsigned char *) VARDATA(DatumGetPointer(hibound));
2694
2695         /*
2696          * Assume bytea data is uniformly distributed across all byte values.
2697          */
2698         rangelo = 0;
2699         rangehi = 255;
2700
2701         /*
2702          * Now strip any common prefix of the three strings.
2703          */
2704         minlen = Min(Min(valuelen, loboundlen), hiboundlen);
2705         for (i = 0; i < minlen; i++)
2706         {
2707                 if (*lostr != *histr || *lostr != *valstr)
2708                         break;
2709                 lostr++, histr++, valstr++;
2710                 loboundlen--, hiboundlen--, valuelen--;
2711         }
2712
2713         /*
2714          * Now we can do the conversions.
2715          */
2716         *scaledvalue = convert_one_bytea_to_scalar(valstr, valuelen, rangelo, rangehi);
2717         *scaledlobound = convert_one_bytea_to_scalar(lostr, loboundlen, rangelo, rangehi);
2718         *scaledhibound = convert_one_bytea_to_scalar(histr, hiboundlen, rangelo, rangehi);
2719 }
2720
2721 static double
2722 convert_one_bytea_to_scalar(unsigned char *value, int valuelen,
2723                                                         int rangelo, int rangehi)
2724 {
2725         double          num,
2726                                 denom,
2727                                 base;
2728
2729         if (valuelen <= 0)
2730                 return 0.0;                             /* empty string has scalar value 0 */
2731
2732         /*
2733          * Since base is 256, need not consider more than about 10 chars (even
2734          * this many seems like overkill)
2735          */
2736         if (valuelen > 10)
2737                 valuelen = 10;
2738
2739         /* Convert initial characters to fraction */
2740         base = rangehi - rangelo + 1;
2741         num = 0.0;
2742         denom = base;
2743         while (valuelen-- > 0)
2744         {
2745                 int                     ch = *value++;
2746
2747                 if (ch < rangelo)
2748                         ch = rangelo - 1;
2749                 else if (ch > rangehi)
2750                         ch = rangehi + 1;
2751                 num += ((double) (ch - rangelo)) / denom;
2752                 denom *= base;
2753         }
2754
2755         return num;
2756 }
2757
2758 /*
2759  * Do convert_to_scalar()'s work for any timevalue data type.
2760  */
2761 static double
2762 convert_timevalue_to_scalar(Datum value, Oid typid)
2763 {
2764         switch (typid)
2765         {
2766                 case TIMESTAMPOID:
2767                         return DatumGetTimestamp(value);
2768                 case TIMESTAMPTZOID:
2769                         return DatumGetTimestampTz(value);
2770                 case ABSTIMEOID:
2771                         return DatumGetTimestamp(DirectFunctionCall1(abstime_timestamp,
2772                                                                                                                  value));
2773                 case DATEOID:
2774                         return DatumGetTimestamp(DirectFunctionCall1(date_timestamp,
2775                                                                                                                  value));
2776                 case INTERVALOID:
2777                         {
2778                                 Interval   *interval = DatumGetIntervalP(value);
2779
2780                                 /*
2781                                  * Convert the month part of Interval to days using
2782                                  * assumed average month length of 365.25/12.0 days.  Not
2783                                  * too accurate, but plenty good enough for our purposes.
2784                                  */
2785 #ifdef HAVE_INT64_TIMESTAMP
2786                                 return (interval->time + (interval->month * ((365.25 / 12.0) * 86400000000.0)));
2787 #else
2788                                 return interval->time +
2789                                 interval  ->month * (365.25 / 12.0 * 24.0 * 60.0 * 60.0);
2790 #endif
2791                         }
2792                 case RELTIMEOID:
2793 #ifdef HAVE_INT64_TIMESTAMP
2794                         return (DatumGetRelativeTime(value) * 1000000.0);
2795 #else
2796                         return DatumGetRelativeTime(value);
2797 #endif
2798                 case TINTERVALOID:
2799                         {
2800                                 TimeInterval interval = DatumGetTimeInterval(value);
2801
2802 #ifdef HAVE_INT64_TIMESTAMP
2803                                 if (interval->status != 0)
2804                                         return ((interval->data[1] - interval->data[0]) *1000000.0);
2805 #else
2806                                 if (interval->status != 0)
2807                                         return interval->data[1] - interval->data[0];
2808 #endif
2809                                 return 0;               /* for lack of a better idea */
2810                         }
2811                 case TIMEOID:
2812                         return DatumGetTimeADT(value);
2813                 case TIMETZOID:
2814                         {
2815                                 TimeTzADT  *timetz = DatumGetTimeTzADTP(value);
2816
2817                                 /* use GMT-equivalent time */
2818 #ifdef HAVE_INT64_TIMESTAMP
2819                                 return (double) (timetz->time + (timetz->zone * 1000000.0));
2820 #else
2821                                 return (double) (timetz->time + timetz->zone);
2822 #endif
2823                         }
2824         }
2825
2826         /*
2827          * Can't get here unless someone tries to use scalarltsel/scalargtsel
2828          * on an operator with one timevalue and one non-timevalue operand.
2829          */
2830         elog(ERROR, "unsupported type: %u", typid);
2831         return 0;
2832 }
2833
2834
2835 /*
2836  * get_restriction_variable
2837  *              Examine the args of a restriction clause to see if it's of the
2838  *              form (variable op pseudoconstant) or (pseudoconstant op variable),
2839  *              where "variable" could be either a Var or an expression in vars of a
2840  *              single relation.  If so, extract information about the variable,
2841  *              and also indicate which side it was on and the other argument.
2842  *
2843  * Inputs:
2844  *      root: the planner info
2845  *      args: clause argument list
2846  *      varRelid: see specs for restriction selectivity functions
2847  *
2848  * Outputs: (these are valid only if TRUE is returned)
2849  *      *vardata: gets information about variable (see examine_variable)
2850  *      *other: gets other clause argument, aggressively reduced to a constant
2851  *      *varonleft: set TRUE if variable is on the left, FALSE if on the right
2852  *
2853  * Returns TRUE if a variable is identified, otherwise FALSE.
2854  *
2855  * Note: if there are Vars on both sides of the clause, we must fail, because
2856  * callers are expecting that the other side will act like a pseudoconstant.
2857  */
2858 static bool
2859 get_restriction_variable(PlannerInfo *root, List *args, int varRelid,
2860                                                  VariableStatData *vardata, Node **other,
2861                                                  bool *varonleft)
2862 {
2863         Node       *left,
2864                            *right;
2865         VariableStatData rdata;
2866
2867         /* Fail if not a binary opclause (probably shouldn't happen) */
2868         if (list_length(args) != 2)
2869                 return false;
2870
2871         left = (Node *) linitial(args);
2872         right = (Node *) lsecond(args);
2873
2874         /*
2875          * Examine both sides.  Note that when varRelid is nonzero, Vars of
2876          * other relations will be treated as pseudoconstants.
2877          */
2878         examine_variable(root, left, varRelid, vardata);
2879         examine_variable(root, right, varRelid, &rdata);
2880
2881         /*
2882          * If one side is a variable and the other not, we win.
2883          */
2884         if (vardata->rel && rdata.rel == NULL)
2885         {
2886                 *varonleft = true;
2887                 *other = estimate_expression_value(rdata.var);
2888                 /* Assume we need no ReleaseVariableStats(rdata) here */
2889                 return true;
2890         }
2891
2892         if (vardata->rel == NULL && rdata.rel)
2893         {
2894                 *varonleft = false;
2895                 *other = estimate_expression_value(vardata->var);
2896                 /* Assume we need no ReleaseVariableStats(*vardata) here */
2897                 *vardata = rdata;
2898                 return true;
2899         }
2900
2901         /* Ooops, clause has wrong structure (probably var op var) */
2902         ReleaseVariableStats(*vardata);
2903         ReleaseVariableStats(rdata);
2904
2905         return false;
2906 }
2907
2908 /*
2909  * get_join_variables
2910  *              Apply examine_variable() to each side of a join clause.
2911  */
2912 static void
2913 get_join_variables(PlannerInfo *root, List *args,
2914                                    VariableStatData *vardata1, VariableStatData *vardata2)
2915 {
2916         Node       *left,
2917                            *right;
2918
2919         if (list_length(args) != 2)
2920                 elog(ERROR, "join operator should take two arguments");
2921
2922         left = (Node *) linitial(args);
2923         right = (Node *) lsecond(args);
2924
2925         examine_variable(root, left, 0, vardata1);
2926         examine_variable(root, right, 0, vardata2);
2927 }
2928
2929 /*
2930  * examine_variable
2931  *              Try to look up statistical data about an expression.
2932  *              Fill in a VariableStatData struct to describe the expression.
2933  *
2934  * Inputs:
2935  *      root: the planner info
2936  *      node: the expression tree to examine
2937  *      varRelid: see specs for restriction selectivity functions
2938  *
2939  * Outputs: *vardata is filled as follows:
2940  *      var: the input expression (with any binary relabeling stripped, if
2941  *              it is or contains a variable; but otherwise the type is preserved)
2942  *      rel: RelOptInfo for relation containing variable; NULL if expression
2943  *              contains no Vars (NOTE this could point to a RelOptInfo of a
2944  *              subquery, not one in the current query).
2945  *      statsTuple: the pg_statistic entry for the variable, if one exists;
2946  *              otherwise NULL.
2947  *      vartype: exposed type of the expression; this should always match
2948  *              the declared input type of the operator we are estimating for.
2949  *      atttype, atttypmod: type data to pass to get_attstatsslot().  This is
2950  *              commonly the same as the exposed type of the variable argument,
2951  *              but can be different in binary-compatible-type cases.
2952  *
2953  * Caller is responsible for doing ReleaseVariableStats() before exiting.
2954  */
2955 static void
2956 examine_variable(PlannerInfo *root, Node *node, int varRelid,
2957                                  VariableStatData *vardata)
2958 {
2959         Node       *basenode;
2960         Relids          varnos;
2961         RelOptInfo *onerel;
2962
2963         /* Make sure we don't return dangling pointers in vardata */
2964         MemSet(vardata, 0, sizeof(VariableStatData));
2965
2966         /* Save the exposed type of the expression */
2967         vardata->vartype = exprType(node);
2968
2969         /* Look inside any binary-compatible relabeling */
2970
2971         if (IsA(node, RelabelType))
2972                 basenode = (Node *) ((RelabelType *) node)->arg;
2973         else
2974                 basenode = node;
2975
2976         /* Fast path for a simple Var */
2977
2978         if (IsA(basenode, Var) &&
2979                 (varRelid == 0 || varRelid == ((Var *) basenode)->varno))
2980         {
2981                 Var                *var = (Var *) basenode;
2982                 Oid                     relid;
2983
2984                 vardata->var = basenode;        /* return Var without relabeling */
2985                 vardata->rel = find_base_rel(root, var->varno);
2986                 vardata->atttype = var->vartype;
2987                 vardata->atttypmod = var->vartypmod;
2988
2989                 relid = getrelid(var->varno, root->parse->rtable);
2990
2991                 if (OidIsValid(relid))
2992                 {
2993                         vardata->statsTuple = SearchSysCache(STATRELATT,
2994                                                                                                  ObjectIdGetDatum(relid),
2995                                                                                         Int16GetDatum(var->varattno),
2996                                                                                                  0, 0);
2997                 }
2998                 else
2999                 {
3000                         /*
3001                          * XXX This means the Var comes from a JOIN or sub-SELECT.
3002                          * Later add code to dig down into the join etc and see if we
3003                          * can trace the variable to something with stats.      (But
3004                          * beware of sub-SELECTs with DISTINCT/GROUP BY/etc.  Perhaps
3005                          * there are no cases where this would really be useful,
3006                          * because we'd have flattened the subselect if it is??)
3007                          */
3008                 }
3009
3010                 return;
3011         }
3012
3013         /*
3014          * Okay, it's a more complicated expression.  Determine variable
3015          * membership.  Note that when varRelid isn't zero, only vars of that
3016          * relation are considered "real" vars.
3017          */
3018         varnos = pull_varnos(basenode);
3019
3020         onerel = NULL;
3021
3022         switch (bms_membership(varnos))
3023         {
3024                 case BMS_EMPTY_SET:
3025                         /* No Vars at all ... must be pseudo-constant clause */
3026                         break;
3027                 case BMS_SINGLETON:
3028                         if (varRelid == 0 || bms_is_member(varRelid, varnos))
3029                         {
3030                                 onerel = find_base_rel(root,
3031                                    (varRelid ? varRelid : bms_singleton_member(varnos)));
3032                                 vardata->rel = onerel;
3033                                 node = basenode; /* strip any relabeling */
3034                         }
3035                         /* else treat it as a constant */
3036                         break;
3037                 case BMS_MULTIPLE:
3038                         if (varRelid == 0)
3039                         {
3040                                 /* treat it as a variable of a join relation */
3041                                 vardata->rel = find_join_rel(root, varnos);
3042                                 node = basenode; /* strip any relabeling */
3043                         }
3044                         else if (bms_is_member(varRelid, varnos))
3045                         {
3046                                 /* ignore the vars belonging to other relations */
3047                                 vardata->rel = find_base_rel(root, varRelid);
3048                                 node = basenode; /* strip any relabeling */
3049                                 /* note: no point in expressional-index search here */
3050                         }
3051                         /* else treat it as a constant */
3052                         break;
3053         }
3054
3055         bms_free(varnos);
3056
3057         vardata->var = node;
3058         vardata->atttype = exprType(node);
3059         vardata->atttypmod = exprTypmod(node);
3060
3061         if (onerel)
3062         {
3063                 /*
3064                  * We have an expression in vars of a single relation.  Try to
3065                  * match it to expressional index columns, in hopes of finding
3066                  * some statistics.
3067                  *
3068                  * XXX it's conceivable that there are multiple matches with
3069                  * different index opclasses; if so, we need to pick one that
3070                  * matches the operator we are estimating for.  FIXME later.
3071                  */
3072                 ListCell   *ilist;
3073
3074                 foreach(ilist, onerel->indexlist)
3075                 {
3076                         IndexOptInfo *index = (IndexOptInfo *) lfirst(ilist);
3077                         ListCell   *indexpr_item;
3078                         int                     pos;
3079
3080                         indexpr_item = list_head(index->indexprs);
3081                         if (indexpr_item == NULL)
3082                                 continue;               /* no expressions here... */
3083
3084                         /*
3085                          * Ignore partial indexes since they probably don't reflect
3086                          * whole-relation statistics.  Possibly reconsider this later.
3087                          */
3088                         if (index->indpred)
3089                                 continue;
3090
3091                         for (pos = 0; pos < index->ncolumns; pos++)
3092                         {
3093                                 if (index->indexkeys[pos] == 0)
3094                                 {
3095                                         Node       *indexkey;
3096
3097                                         if (indexpr_item == NULL)
3098                                                 elog(ERROR, "too few entries in indexprs list");
3099                                         indexkey = (Node *) lfirst(indexpr_item);
3100                                         if (indexkey && IsA(indexkey, RelabelType))
3101                                                 indexkey = (Node *) ((RelabelType *) indexkey)->arg;
3102                                         if (equal(node, indexkey))
3103                                         {
3104                                                 /*
3105                                                  * Found a match ... is it a unique index? Tests
3106                                                  * here should match has_unique_index().
3107                                                  */
3108                                                 if (index->unique &&
3109                                                         index->ncolumns == 1 &&
3110                                                         index->indpred == NIL)
3111                                                         vardata->isunique = true;
3112                                                 /* Has it got stats? */
3113                                                 vardata->statsTuple = SearchSysCache(STATRELATT,
3114                                                                            ObjectIdGetDatum(index->indexoid),
3115                                                                                                   Int16GetDatum(pos + 1),
3116                                                                                                                          0, 0);
3117                                                 if (vardata->statsTuple)
3118                                                         break;
3119                                         }
3120                                         indexpr_item = lnext(indexpr_item);
3121                                 }
3122                         }
3123                         if (vardata->statsTuple)
3124                                 break;
3125                 }
3126         }
3127 }
3128
3129 /*
3130  * get_variable_numdistinct
3131  *        Estimate the number of distinct values of a variable.
3132  *
3133  * vardata: results of examine_variable
3134  *
3135  * NB: be careful to produce an integral result, since callers may compare
3136  * the result to exact integer counts.
3137  */
3138 static double
3139 get_variable_numdistinct(VariableStatData *vardata)
3140 {
3141         double          stadistinct;
3142         double          ntuples;
3143
3144         /*
3145          * Determine the stadistinct value to use.      There are cases where we
3146          * can get an estimate even without a pg_statistic entry, or can get a
3147          * better value than is in pg_statistic.
3148          */
3149         if (HeapTupleIsValid(vardata->statsTuple))
3150         {
3151                 /* Use the pg_statistic entry */
3152                 Form_pg_statistic stats;
3153
3154                 stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
3155                 stadistinct = stats->stadistinct;
3156         }
3157         else if (vardata->vartype == BOOLOID)
3158         {
3159                 /*
3160                  * Special-case boolean columns: presumably, two distinct values.
3161                  *
3162                  * Are there any other datatypes we should wire in special estimates
3163                  * for?
3164                  */
3165                 stadistinct = 2.0;
3166         }
3167         else
3168         {
3169                 /*
3170                  * We don't keep statistics for system columns, but in some cases
3171                  * we can infer distinctness anyway.
3172                  */
3173                 if (vardata->var && IsA(vardata->var, Var))
3174                 {
3175                         switch (((Var *) vardata->var)->varattno)
3176                         {
3177                                 case ObjectIdAttributeNumber:
3178                                 case SelfItemPointerAttributeNumber:
3179                                         stadistinct = -1.0; /* unique */
3180                                         break;
3181                                 case TableOidAttributeNumber:
3182                                         stadistinct = 1.0;      /* only 1 value */
3183                                         break;
3184                                 default:
3185                                         stadistinct = 0.0;      /* means "unknown" */
3186                                         break;
3187                         }
3188                 }
3189                 else
3190                         stadistinct = 0.0;      /* means "unknown" */
3191
3192                 /*
3193                  * XXX consider using estimate_num_groups on expressions?
3194                  */
3195         }
3196
3197         /*
3198          * If there is a unique index for the variable, assume it is unique no
3199          * matter what pg_statistic says (the statistics could be out of
3200          * date).  Can skip search if we already think it's unique.
3201          */
3202         if (stadistinct != -1.0)
3203         {
3204                 if (vardata->isunique)
3205                         stadistinct = -1.0;
3206                 else if (vardata->var && IsA(vardata->var, Var) &&
3207                                  vardata->rel &&
3208                                  has_unique_index(vardata->rel,
3209                                                                   ((Var *) vardata->var)->varattno))
3210                         stadistinct = -1.0;
3211         }
3212
3213         /*
3214          * If we had an absolute estimate, use that.
3215          */
3216         if (stadistinct > 0.0)
3217                 return stadistinct;
3218
3219         /*
3220          * Otherwise we need to get the relation size; punt if not available.
3221          */
3222         if (vardata->rel == NULL)
3223                 return DEFAULT_NUM_DISTINCT;
3224         ntuples = vardata->rel->tuples;
3225         if (ntuples <= 0.0)
3226                 return DEFAULT_NUM_DISTINCT;
3227
3228         /*
3229          * If we had a relative estimate, use that.
3230          */
3231         if (stadistinct < 0.0)
3232                 return floor((-stadistinct * ntuples) + 0.5);
3233
3234         /*
3235          * With no data, estimate ndistinct = ntuples if the table is small,
3236          * else use default.
3237          */
3238         if (ntuples < DEFAULT_NUM_DISTINCT)
3239                 return ntuples;
3240
3241         return DEFAULT_NUM_DISTINCT;
3242 }
3243
3244 /*
3245  * get_variable_maximum
3246  *              Estimate the maximum value of the specified variable.
3247  *              If successful, store value in *max and return TRUE.
3248  *              If no data available, return FALSE.
3249  *
3250  * sortop is the "<" comparison operator to use.  (To extract the
3251  * minimum instead of the maximum, just pass the ">" operator instead.)
3252  */
3253 static bool
3254 get_variable_maximum(PlannerInfo *root, VariableStatData *vardata,
3255                                          Oid sortop, Datum *max)
3256 {
3257         Datum           tmax = 0;
3258         bool            have_max = false;
3259         Form_pg_statistic stats;
3260         int16           typLen;
3261         bool            typByVal;
3262         Datum      *values;
3263         int                     nvalues;
3264         int                     i;
3265
3266         if (!HeapTupleIsValid(vardata->statsTuple))
3267         {
3268                 /* no stats available, so default result */
3269                 return false;
3270         }
3271         stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
3272
3273         get_typlenbyval(vardata->atttype, &typLen, &typByVal);
3274
3275         /*
3276          * If there is a histogram, grab the last or first value as
3277          * appropriate.
3278          *
3279          * If there is a histogram that is sorted with some other operator than
3280          * the one we want, fail --- this suggests that there is data we can't
3281          * use.
3282          */
3283         if (get_attstatsslot(vardata->statsTuple,
3284                                                  vardata->atttype, vardata->atttypmod,
3285                                                  STATISTIC_KIND_HISTOGRAM, sortop,
3286                                                  &values, &nvalues,
3287                                                  NULL, NULL))
3288         {
3289                 if (nvalues > 0)
3290                 {
3291                         tmax = datumCopy(values[nvalues - 1], typByVal, typLen);
3292                         have_max = true;
3293                 }
3294                 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
3295         }
3296         else
3297         {
3298                 Oid                     rsortop = get_commutator(sortop);
3299
3300                 if (OidIsValid(rsortop) &&
3301                         get_attstatsslot(vardata->statsTuple,
3302                                                          vardata->atttype, vardata->atttypmod,
3303                                                          STATISTIC_KIND_HISTOGRAM, rsortop,
3304                                                          &values, &nvalues,
3305                                                          NULL, NULL))
3306                 {
3307                         if (nvalues > 0)
3308                         {
3309                                 tmax = datumCopy(values[0], typByVal, typLen);
3310                                 have_max = true;
3311                         }
3312                         free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
3313                 }
3314                 else if (get_attstatsslot(vardata->statsTuple,
3315                                                                   vardata->atttype, vardata->atttypmod,
3316                                                                   STATISTIC_KIND_HISTOGRAM, InvalidOid,
3317                                                                   &values, &nvalues,
3318                                                                   NULL, NULL))
3319                 {
3320                         free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
3321                         return false;
3322                 }
3323         }
3324
3325         /*
3326          * If we have most-common-values info, look for a large MCV.  This is
3327          * needed even if we also have a histogram, since the histogram
3328          * excludes the MCVs.  However, usually the MCVs will not be the
3329          * extreme values, so avoid unnecessary data copying.
3330          */
3331         if (get_attstatsslot(vardata->statsTuple,
3332                                                  vardata->atttype, vardata->atttypmod,
3333                                                  STATISTIC_KIND_MCV, InvalidOid,
3334                                                  &values, &nvalues,
3335                                                  NULL, NULL))
3336         {
3337                 bool            large_mcv = false;
3338                 FmgrInfo        opproc;
3339
3340                 fmgr_info(get_opcode(sortop), &opproc);
3341
3342                 for (i = 0; i < nvalues; i++)
3343                 {
3344                         if (!have_max)
3345                         {
3346                                 tmax = values[i];
3347                                 large_mcv = have_max = true;
3348                         }
3349                         else if (DatumGetBool(FunctionCall2(&opproc, tmax, values[i])))
3350                         {
3351                                 tmax = values[i];
3352                                 large_mcv = true;
3353                         }
3354                 }
3355                 if (large_mcv)
3356                         tmax = datumCopy(tmax, typByVal, typLen);
3357                 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
3358         }
3359
3360         *max = tmax;
3361         return have_max;
3362 }
3363
3364
3365 /*-------------------------------------------------------------------------
3366  *
3367  * Pattern analysis functions
3368  *
3369  * These routines support analysis of LIKE and regular-expression patterns
3370  * by the planner/optimizer.  It's important that they agree with the
3371  * regular-expression code in backend/regex/ and the LIKE code in
3372  * backend/utils/adt/like.c.
3373  *
3374  * Note that the prefix-analysis functions are called from
3375  * backend/optimizer/path/indxpath.c as well as from routines in this file.
3376  *
3377  *-------------------------------------------------------------------------
3378  */
3379
3380 /*
3381  * Extract the fixed prefix, if any, for a pattern.
3382  *
3383  * *prefix is set to a palloc'd prefix string (in the form of a Const node),
3384  *      or to NULL if no fixed prefix exists for the pattern.
3385  * *rest is set to a palloc'd Const representing the remainder of the pattern
3386  *      after the portion describing the fixed prefix.
3387  * Each of these has the same type (TEXT or BYTEA) as the given pattern Const.
3388  *
3389  * The return value distinguishes no fixed prefix, a partial prefix,
3390  * or an exact-match-only pattern.
3391  */
3392
3393 static Pattern_Prefix_Status
3394 like_fixed_prefix(Const *patt_const, bool case_insensitive,
3395                                   Const **prefix_const, Const **rest_const)
3396 {
3397         char       *match;
3398         char       *patt;
3399         int                     pattlen;
3400         char       *rest;
3401         Oid                     typeid = patt_const->consttype;
3402         int                     pos,
3403                                 match_pos;
3404
3405         /* the right-hand const is type text or bytea */
3406         Assert(typeid == BYTEAOID || typeid == TEXTOID);
3407
3408         if (typeid == BYTEAOID && case_insensitive)
3409                 ereport(ERROR,
3410                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
3411                 errmsg("case insensitive matching not supported on type bytea")));
3412
3413         if (typeid != BYTEAOID)
3414         {
3415                 patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
3416                 pattlen = strlen(patt);
3417         }
3418         else
3419         {
3420                 bytea      *bstr = DatumGetByteaP(patt_const->constvalue);
3421
3422                 pattlen = VARSIZE(bstr) - VARHDRSZ;
3423                 if (pattlen > 0)
3424                 {
3425                         patt = (char *) palloc(pattlen);
3426                         memcpy(patt, VARDATA(bstr), pattlen);
3427                 }
3428                 else
3429                         patt = NULL;
3430
3431                 if ((Pointer) bstr != DatumGetPointer(patt_const->constvalue))
3432                         pfree(bstr);
3433         }
3434
3435         match = palloc(pattlen + 1);
3436         match_pos = 0;
3437         for (pos = 0; pos < pattlen; pos++)
3438         {
3439                 /* % and _ are wildcard characters in LIKE */
3440                 if (patt[pos] == '%' ||
3441                         patt[pos] == '_')
3442                         break;
3443
3444                 /* Backslash escapes the next character */
3445                 if (patt[pos] == '\\')
3446                 {
3447                         pos++;
3448                         if (patt[pos] == '\0' && typeid != BYTEAOID)
3449                                 break;
3450                 }
3451
3452                 /*
3453                  * XXX I suspect isalpha() is not an adequately locale-sensitive
3454                  * test for characters that can vary under case folding?
3455                  */
3456                 if (case_insensitive && isalpha((unsigned char) patt[pos]))
3457                         break;
3458
3459                 /*
3460                  * NOTE: this code used to think that %% meant a literal %, but
3461                  * textlike() itself does not think that, and the SQL92 spec
3462                  * doesn't say any such thing either.
3463                  */
3464                 match[match_pos++] = patt[pos];
3465         }
3466
3467         match[match_pos] = '\0';
3468         rest = &patt[pos];
3469
3470         if (typeid != BYTEAOID)
3471         {
3472                 *prefix_const = string_to_const(match, typeid);
3473                 *rest_const = string_to_const(rest, typeid);
3474         }
3475         else
3476         {
3477                 *prefix_const = string_to_bytea_const(match, match_pos);
3478                 *rest_const = string_to_bytea_const(rest, pattlen - pos);
3479         }
3480
3481         if (patt != NULL)
3482                 pfree(patt);
3483         pfree(match);
3484
3485         /* in LIKE, an empty pattern is an exact match! */
3486         if (pos == pattlen)
3487                 return Pattern_Prefix_Exact;    /* reached end of pattern, so
3488                                                                                  * exact */
3489
3490         if (match_pos > 0)
3491                 return Pattern_Prefix_Partial;
3492
3493         return Pattern_Prefix_None;
3494 }
3495
3496 static Pattern_Prefix_Status
3497 regex_fixed_prefix(Const *patt_const, bool case_insensitive,
3498                                    Const **prefix_const, Const **rest_const)
3499 {
3500         char       *match;
3501         int                     pos,
3502                                 match_pos,
3503                                 prev_pos,
3504                                 prev_match_pos,
3505                                 paren_depth;
3506         char       *patt;
3507         char       *rest;
3508         Oid                     typeid = patt_const->consttype;
3509
3510         /*
3511          * Should be unnecessary, there are no bytea regex operators defined.
3512          * As such, it should be noted that the rest of this function has *not*
3513          * been made safe for binary (possibly NULL containing) strings.
3514          */
3515         if (typeid == BYTEAOID)
3516                 ereport(ERROR,
3517                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
3518                                  errmsg("regular-expression matching not supported on type bytea")));
3519
3520         /* the right-hand const is type text for all of these */
3521         patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
3522
3523         /* Pattern must be anchored left */
3524         if (patt[0] != '^')
3525         {
3526                 rest = patt;
3527
3528                 *prefix_const = NULL;
3529                 *rest_const = string_to_const(rest, typeid);
3530
3531                 return Pattern_Prefix_None;
3532         }
3533
3534         /*
3535          * If unquoted | is present at paren level 0 in pattern, then there
3536          * are multiple alternatives for the start of the string.
3537          */
3538         paren_depth = 0;
3539         for (pos = 1; patt[pos]; pos++)
3540         {
3541                 if (patt[pos] == '|' && paren_depth == 0)
3542                 {
3543                         rest = patt;
3544
3545                         *prefix_const = NULL;
3546                         *rest_const = string_to_const(rest, typeid);
3547
3548                         return Pattern_Prefix_None;
3549                 }
3550                 else if (patt[pos] == '(')
3551                         paren_depth++;
3552                 else if (patt[pos] == ')' && paren_depth > 0)
3553                         paren_depth--;
3554                 else if (patt[pos] == '\\')
3555                 {
3556                         /* backslash quotes the next character */
3557                         pos++;
3558                         if (patt[pos] == '\0')
3559                                 break;
3560                 }
3561         }
3562
3563         /* OK, allocate space for pattern */
3564         match = palloc(strlen(patt) + 1);
3565         prev_match_pos = match_pos = 0;
3566
3567         /* note start at pos 1 to skip leading ^ */
3568         for (prev_pos = pos = 1; patt[pos]; )
3569         {
3570                 int             len;
3571
3572                 /*
3573                  * Check for characters that indicate multiple possible matches
3574                  * here. XXX I suspect isalpha() is not an adequately
3575                  * locale-sensitive test for characters that can vary under case
3576                  * folding?
3577                  */
3578                 if (patt[pos] == '.' ||
3579                         patt[pos] == '(' ||
3580                         patt[pos] == '[' ||
3581                         patt[pos] == '$' ||
3582                         (case_insensitive && isalpha((unsigned char) patt[pos])))
3583                         break;
3584
3585                 /*
3586                  * In AREs, backslash followed by alphanumeric is an escape, not
3587                  * a quoted character.  Must treat it as having multiple possible
3588                  * matches.
3589                  */
3590                 if (patt[pos] == '\\' && isalnum((unsigned char) patt[pos + 1]))
3591                         break;
3592
3593                 /*
3594                  * Check for quantifiers.  Except for +, this means the preceding
3595                  * character is optional, so we must remove it from the prefix
3596                  * too!
3597                  */
3598                 if (patt[pos] == '*' ||
3599                         patt[pos] == '?' ||
3600                         patt[pos] == '{')
3601                 {
3602                         match_pos = prev_match_pos;
3603                         pos = prev_pos;
3604                         break;
3605                 }
3606                 if (patt[pos] == '+')
3607                 {
3608                         pos = prev_pos;
3609                         break;
3610                 }
3611                 if (patt[pos] == '\\')
3612                 {
3613                         /* backslash quotes the next character */
3614                         pos++;
3615                         if (patt[pos] == '\0')
3616                                 break;
3617                 }
3618                 /* save position in case we need to back up on next loop cycle */
3619                 prev_match_pos = match_pos;
3620                 prev_pos = pos;
3621                 /* must use encoding-aware processing here */
3622                 len = pg_mblen(&patt[pos]);
3623                 memcpy(&match[match_pos], &patt[pos], len);
3624                 match_pos += len;
3625                 pos += len;
3626         }
3627
3628         match[match_pos] = '\0';
3629         rest = &patt[pos];
3630
3631         if (patt[pos] == '$' && patt[pos + 1] == '\0')
3632         {
3633                 rest = &patt[pos + 1];
3634
3635                 *prefix_const = string_to_const(match, typeid);
3636                 *rest_const = string_to_const(rest, typeid);
3637
3638                 pfree(patt);
3639                 pfree(match);
3640
3641                 return Pattern_Prefix_Exact;    /* pattern specifies exact match */
3642         }
3643
3644         *prefix_const = string_to_const(match, typeid);
3645         *rest_const = string_to_const(rest, typeid);
3646
3647         pfree(patt);
3648         pfree(match);
3649
3650         if (match_pos > 0)
3651                 return Pattern_Prefix_Partial;
3652
3653         return Pattern_Prefix_None;
3654 }
3655
3656 Pattern_Prefix_Status
3657 pattern_fixed_prefix(Const *patt, Pattern_Type ptype,
3658                                          Const **prefix, Const **rest)
3659 {
3660         Pattern_Prefix_Status result;
3661
3662         switch (ptype)
3663         {
3664                 case Pattern_Type_Like:
3665                         result = like_fixed_prefix(patt, false, prefix, rest);
3666                         break;
3667                 case Pattern_Type_Like_IC:
3668                         result = like_fixed_prefix(patt, true, prefix, rest);
3669                         break;
3670                 case Pattern_Type_Regex:
3671                         result = regex_fixed_prefix(patt, false, prefix, rest);
3672                         break;
3673                 case Pattern_Type_Regex_IC:
3674                         result = regex_fixed_prefix(patt, true, prefix, rest);
3675                         break;
3676                 default:
3677                         elog(ERROR, "unrecognized ptype: %d", (int) ptype);
3678                         result = Pattern_Prefix_None;           /* keep compiler quiet */
3679                         break;
3680         }
3681         return result;
3682 }
3683
3684 /*
3685  * Estimate the selectivity of a fixed prefix for a pattern match.
3686  *
3687  * A fixed prefix "foo" is estimated as the selectivity of the expression
3688  * "variable >= 'foo' AND variable < 'fop'" (see also indxpath.c).
3689  *
3690  * We use the >= and < operators from the specified btree opclass to do the
3691  * estimation.  The given variable and Const must be of the associated
3692  * datatype.
3693  *
3694  * XXX Note: we make use of the upper bound to estimate operator selectivity
3695  * even if the locale is such that we cannot rely on the upper-bound string.
3696  * The selectivity only needs to be approximately right anyway, so it seems
3697  * more useful to use the upper-bound code than not.
3698  */
3699 static Selectivity
3700 prefix_selectivity(PlannerInfo *root, Node *variable,
3701                                    Oid opclass, Const *prefixcon)
3702 {
3703         Selectivity prefixsel;
3704         Oid                     cmpopr;
3705         List       *cmpargs;
3706         Const      *greaterstrcon;
3707
3708         cmpopr = get_opclass_member(opclass, InvalidOid,
3709                                                                 BTGreaterEqualStrategyNumber);
3710         if (cmpopr == InvalidOid)
3711                 elog(ERROR, "no >= operator for opclass %u", opclass);
3712         cmpargs = list_make2(variable, prefixcon);
3713         /* Assume scalargtsel is appropriate for all supported types */
3714         prefixsel = DatumGetFloat8(DirectFunctionCall4(scalargtsel,
3715                                                                                                    PointerGetDatum(root),
3716                                                                                                 ObjectIdGetDatum(cmpopr),
3717                                                                                                 PointerGetDatum(cmpargs),
3718                                                                                                    Int32GetDatum(0)));
3719
3720         /*-------
3721          * If we can create a string larger than the prefix, say
3722          *      "x < greaterstr".
3723          *-------
3724          */
3725         greaterstrcon = make_greater_string(prefixcon);
3726         if (greaterstrcon)
3727         {
3728                 Selectivity topsel;
3729
3730                 cmpopr = get_opclass_member(opclass, InvalidOid,
3731                                                                         BTLessStrategyNumber);
3732                 if (cmpopr == InvalidOid)
3733                         elog(ERROR, "no < operator for opclass %u", opclass);
3734                 cmpargs = list_make2(variable, greaterstrcon);
3735                 /* Assume scalarltsel is appropriate for all supported types */
3736                 topsel = DatumGetFloat8(DirectFunctionCall4(scalarltsel,
3737                                                                                                         PointerGetDatum(root),
3738                                                                                                 ObjectIdGetDatum(cmpopr),
3739                                                                                                 PointerGetDatum(cmpargs),
3740                                                                                                         Int32GetDatum(0)));
3741
3742                 /*
3743                  * Merge the two selectivities in the same way as for a range
3744                  * query (see clauselist_selectivity()).
3745                  */
3746                 prefixsel = topsel + prefixsel - 1.0;
3747
3748                 /* Adjust for double-exclusion of NULLs */
3749                 prefixsel += nulltestsel(root, IS_NULL, variable, 0);
3750
3751                 /*
3752                  * A zero or slightly negative prefixsel should be converted into
3753                  * a small positive value; we probably are dealing with a very
3754                  * tight range and got a bogus result due to roundoff errors.
3755                  * However, if prefixsel is very negative, then we probably have
3756                  * default selectivity estimates on one or both sides of the
3757                  * range.  In that case, insert a not-so-wildly-optimistic default
3758                  * estimate.
3759                  */
3760                 if (prefixsel <= 0.0)
3761                 {
3762                         if (prefixsel < -0.01)
3763                         {
3764                                 /*
3765                                  * No data available --- use a default estimate that is
3766                                  * small, but not real small.
3767                                  */
3768                                 prefixsel = 0.005;
3769                         }
3770                         else
3771                         {
3772                                 /*
3773                                  * It's just roundoff error; use a small positive value
3774                                  */
3775                                 prefixsel = 1.0e-10;
3776                         }
3777                 }
3778         }
3779
3780         return prefixsel;
3781 }
3782
3783
3784 /*
3785  * Estimate the selectivity of a pattern of the specified type.
3786  * Note that any fixed prefix of the pattern will have been removed already.
3787  *
3788  * For now, we use a very simplistic approach: fixed characters reduce the
3789  * selectivity a good deal, character ranges reduce it a little,
3790  * wildcards (such as % for LIKE or .* for regex) increase it.
3791  */
3792
3793 #define FIXED_CHAR_SEL  0.20    /* about 1/5 */
3794 #define CHAR_RANGE_SEL  0.25
3795 #define ANY_CHAR_SEL    0.9             /* not 1, since it won't match
3796                                                                  * end-of-string */
3797 #define FULL_WILDCARD_SEL 5.0
3798 #define PARTIAL_WILDCARD_SEL 2.0
3799
3800 static Selectivity
3801 like_selectivity(Const *patt_const, bool case_insensitive)
3802 {
3803         Selectivity sel = 1.0;
3804         int                     pos;
3805         int                     start;
3806         Oid                     typeid = patt_const->consttype;
3807         char       *patt;
3808         int                     pattlen;
3809
3810         /* the right-hand const is type text or bytea */
3811         Assert(typeid == BYTEAOID || typeid == TEXTOID);
3812
3813         if (typeid == BYTEAOID && case_insensitive)
3814                 ereport(ERROR,
3815                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
3816                 errmsg("case insensitive matching not supported on type bytea")));
3817
3818         if (typeid != BYTEAOID)
3819         {
3820                 patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
3821                 pattlen = strlen(patt);
3822         }
3823         else
3824         {
3825                 bytea      *bstr = DatumGetByteaP(patt_const->constvalue);
3826
3827                 pattlen = VARSIZE(bstr) - VARHDRSZ;
3828                 if (pattlen > 0)
3829                 {
3830                         patt = (char *) palloc(pattlen);
3831                         memcpy(patt, VARDATA(bstr), pattlen);
3832                 }
3833                 else
3834                         patt = NULL;
3835
3836                 if ((Pointer) bstr != DatumGetPointer(patt_const->constvalue))
3837                         pfree(bstr);
3838         }
3839         /* patt should never be NULL in practice */
3840         Assert(patt != NULL);
3841
3842         /* Skip any leading %; it's already factored into initial sel */
3843         start = (*patt == '%') ? 1 : 0;
3844         for (pos = start; pos < pattlen; pos++)
3845         {
3846                 /* % and _ are wildcard characters in LIKE */
3847                 if (patt[pos] == '%')
3848                         sel *= FULL_WILDCARD_SEL;
3849                 else if (patt[pos] == '_')
3850                         sel *= ANY_CHAR_SEL;
3851                 else if (patt[pos] == '\\')
3852                 {
3853                         /* Backslash quotes the next character */
3854                         pos++;
3855                         if (patt[pos] == '\0' && typeid != BYTEAOID)
3856                                 break;
3857                         sel *= FIXED_CHAR_SEL;
3858                 }
3859                 else
3860                         sel *= FIXED_CHAR_SEL;
3861         }
3862         /* Could get sel > 1 if multiple wildcards */
3863         if (sel > 1.0)
3864                 sel = 1.0;
3865         return sel;
3866 }
3867
3868 static Selectivity
3869 regex_selectivity_sub(char *patt, int pattlen, bool case_insensitive)
3870 {
3871         Selectivity sel = 1.0;
3872         int                     paren_depth = 0;
3873         int                     paren_pos = 0;  /* dummy init to keep compiler quiet */
3874         int                     pos;
3875
3876         for (pos = 0; pos < pattlen; pos++)
3877         {
3878                 if (patt[pos] == '(')
3879                 {
3880                         if (paren_depth == 0)
3881                                 paren_pos = pos;        /* remember start of parenthesized item */
3882                         paren_depth++;
3883                 }
3884                 else if (patt[pos] == ')' && paren_depth > 0)
3885                 {
3886                         paren_depth--;
3887                         if (paren_depth == 0)
3888                                 sel *= regex_selectivity_sub(patt + (paren_pos + 1),
3889                                                                                          pos - (paren_pos + 1),
3890                                                                                          case_insensitive);
3891                 }
3892                 else if (patt[pos] == '|' && paren_depth == 0)
3893                 {
3894                         /*
3895                          * If unquoted | is present at paren level 0 in pattern, we
3896                          * have multiple alternatives; sum their probabilities.
3897                          */
3898                         sel += regex_selectivity_sub(patt + (pos + 1),
3899                                                                                  pattlen - (pos + 1),
3900                                                                                  case_insensitive);
3901                         break;                          /* rest of pattern is now processed */
3902                 }
3903                 else if (patt[pos] == '[')
3904                 {
3905                         bool            negclass = false;
3906
3907                         if (patt[++pos] == '^')
3908                         {
3909                                 negclass = true;
3910                                 pos++;
3911                         }
3912                         if (patt[pos] == ']')           /* ']' at start of class is not
3913                                                                                  * special */
3914                                 pos++;
3915                         while (pos < pattlen && patt[pos] != ']')
3916                                 pos++;
3917                         if (paren_depth == 0)
3918                                 sel *= (negclass ? (1.0 - CHAR_RANGE_SEL) : CHAR_RANGE_SEL);
3919                 }
3920                 else if (patt[pos] == '.')
3921                 {
3922                         if (paren_depth == 0)
3923                                 sel *= ANY_CHAR_SEL;
3924                 }
3925                 else if (patt[pos] == '*' ||
3926                                  patt[pos] == '?' ||
3927                                  patt[pos] == '+')
3928                 {
3929                         /* Ought to be smarter about quantifiers... */
3930                         if (paren_depth == 0)
3931                                 sel *= PARTIAL_WILDCARD_SEL;
3932                 }
3933                 else if (patt[pos] == '{')
3934                 {
3935                         while (pos < pattlen && patt[pos] != '}')
3936                                 pos++;
3937                         if (paren_depth == 0)
3938                                 sel *= PARTIAL_WILDCARD_SEL;
3939                 }
3940                 else if (patt[pos] == '\\')
3941                 {
3942                         /* backslash quotes the next character */
3943                         pos++;
3944                         if (pos >= pattlen)
3945                                 break;
3946                         if (paren_depth == 0)
3947                                 sel *= FIXED_CHAR_SEL;
3948                 }
3949                 else
3950                 {
3951                         if (paren_depth == 0)
3952                                 sel *= FIXED_CHAR_SEL;
3953                 }
3954         }
3955         /* Could get sel > 1 if multiple wildcards */
3956         if (sel > 1.0)
3957                 sel = 1.0;
3958         return sel;
3959 }
3960
3961 static Selectivity
3962 regex_selectivity(Const *patt_const, bool case_insensitive)
3963 {
3964         Selectivity sel;
3965         char       *patt;
3966         int                     pattlen;
3967         Oid                     typeid = patt_const->consttype;
3968
3969         /*
3970          * Should be unnecessary, there are no bytea regex operators defined.
3971          * As such, it should be noted that the rest of this function has *not*
3972          * been made safe for binary (possibly NULL containing) strings.
3973          */
3974         if (typeid == BYTEAOID)
3975                 ereport(ERROR,
3976                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
3977                                  errmsg("regular-expression matching not supported on type bytea")));
3978
3979         /* the right-hand const is type text for all of these */
3980         patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
3981         pattlen = strlen(patt);
3982
3983         /* If patt doesn't end with $, consider it to have a trailing wildcard */
3984         if (pattlen > 0 && patt[pattlen - 1] == '$' &&
3985                 (pattlen == 1 || patt[pattlen - 2] != '\\'))
3986         {
3987                 /* has trailing $ */
3988                 sel = regex_selectivity_sub(patt, pattlen - 1, case_insensitive);
3989         }
3990         else
3991         {
3992                 /* no trailing $ */
3993                 sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
3994                 sel *= FULL_WILDCARD_SEL;
3995                 if (sel > 1.0)
3996                         sel = 1.0;
3997         }
3998         return sel;
3999 }
4000
4001 static Selectivity
4002 pattern_selectivity(Const *patt, Pattern_Type ptype)
4003 {
4004         Selectivity result;
4005
4006         switch (ptype)
4007         {
4008                 case Pattern_Type_Like:
4009                         result = like_selectivity(patt, false);
4010                         break;
4011                 case Pattern_Type_Like_IC:
4012                         result = like_selectivity(patt, true);
4013                         break;
4014                 case Pattern_Type_Regex:
4015                         result = regex_selectivity(patt, false);
4016                         break;
4017                 case Pattern_Type_Regex_IC:
4018                         result = regex_selectivity(patt, true);
4019                         break;
4020                 default:
4021                         elog(ERROR, "unrecognized ptype: %d", (int) ptype);
4022                         result = 1.0;           /* keep compiler quiet */
4023                         break;
4024         }
4025         return result;
4026 }
4027
4028
4029 /*
4030  * Try to generate a string greater than the given string or any
4031  * string it is a prefix of.  If successful, return a palloc'd string
4032  * in the form of a Const pointer; else return NULL.
4033  *
4034  * The key requirement here is that given a prefix string, say "foo",
4035  * we must be able to generate another string "fop" that is greater
4036  * than all strings "foobar" starting with "foo".
4037  *
4038  * If we max out the righthand byte, truncate off the last character
4039  * and start incrementing the next.  For example, if "z" were the last
4040  * character in the sort order, then we could produce "foo" as a
4041  * string greater than "fonz".
4042  *
4043  * This could be rather slow in the worst case, but in most cases we
4044  * won't have to try more than one or two strings before succeeding.
4045  *
4046  * NOTE: at present this assumes we are in the C locale, so that simple
4047  * bytewise comparison applies.  However, we might be in a multibyte
4048  * encoding such as UTF8, so we do have to watch out for generating
4049  * invalid encoding sequences.
4050  */
4051 Const *
4052 make_greater_string(const Const *str_const)
4053 {
4054         Oid                     datatype = str_const->consttype;
4055         char       *workstr;
4056         int                     len;
4057
4058         /* Get the string and a modifiable copy */
4059         if (datatype == NAMEOID)
4060         {
4061                 workstr = DatumGetCString(DirectFunctionCall1(nameout,
4062                                                                                                  str_const->constvalue));
4063                 len = strlen(workstr);
4064         }
4065         else if (datatype == BYTEAOID)
4066         {
4067                 bytea      *bstr = DatumGetByteaP(str_const->constvalue);
4068
4069                 len = VARSIZE(bstr) - VARHDRSZ;
4070                 if (len > 0)
4071                 {
4072                         workstr = (char *) palloc(len);
4073                         memcpy(workstr, VARDATA(bstr), len);
4074                 }
4075                 else
4076                         workstr = NULL;
4077
4078                 if ((Pointer) bstr != DatumGetPointer(str_const->constvalue))
4079                         pfree(bstr);
4080         }
4081         else
4082         {
4083                 workstr = DatumGetCString(DirectFunctionCall1(textout,
4084                                                                                                  str_const->constvalue));
4085                 len = strlen(workstr);
4086         }
4087
4088         while (len > 0)
4089         {
4090                 unsigned char *lastchar = (unsigned char *) (workstr + len - 1);
4091                 unsigned char savelastchar = *lastchar;
4092
4093                 /*
4094                  * Try to generate a larger string by incrementing the last byte.
4095                  */
4096                 while (*lastchar < (unsigned char) 255)
4097                 {
4098                         Const      *workstr_const;
4099
4100                         (*lastchar)++;
4101
4102                         if (datatype != BYTEAOID)
4103                         {
4104                                 /* do not generate invalid encoding sequences */
4105                                 if (!pg_verifymbstr((const unsigned char *) workstr,
4106                                                                         len, true))
4107                                         continue;
4108                                 workstr_const = string_to_const(workstr, datatype);
4109                         }
4110                         else
4111                                 workstr_const = string_to_bytea_const(workstr, len);
4112
4113                         pfree(workstr);
4114                         return workstr_const;
4115                 }
4116
4117                 /* restore last byte so we don't confuse pg_mbcliplen */
4118                 *lastchar = savelastchar;
4119
4120                 /*
4121                  * Truncate off the last character, which might be more than 1
4122                  * byte, depending on the character encoding.
4123                  */
4124                 if (datatype != BYTEAOID && pg_database_encoding_max_length() > 1)
4125                         len = pg_mbcliplen((const unsigned char *) workstr, len, len - 1);
4126                 else
4127                         len -= 1;
4128
4129                 if (datatype != BYTEAOID)
4130                         workstr[len] = '\0';
4131         }
4132
4133         /* Failed... */
4134         if (workstr != NULL)
4135                 pfree(workstr);
4136
4137         return NULL;
4138 }
4139
4140 /*
4141  * Generate a Datum of the appropriate type from a C string.
4142  * Note that all of the supported types are pass-by-ref, so the
4143  * returned value should be pfree'd if no longer needed.
4144  */
4145 static Datum
4146 string_to_datum(const char *str, Oid datatype)
4147 {
4148         Assert(str != NULL);
4149
4150         /*
4151          * We cheat a little by assuming that textin() will do for bpchar and
4152          * varchar constants too...
4153          */
4154         if (datatype == NAMEOID)
4155                 return DirectFunctionCall1(namein, CStringGetDatum(str));
4156         else if (datatype == BYTEAOID)
4157                 return DirectFunctionCall1(byteain, CStringGetDatum(str));
4158         else
4159                 return DirectFunctionCall1(textin, CStringGetDatum(str));
4160 }
4161
4162 /*
4163  * Generate a Const node of the appropriate type from a C string.
4164  */
4165 static Const *
4166 string_to_const(const char *str, Oid datatype)
4167 {
4168         Datum           conval = string_to_datum(str, datatype);
4169
4170         return makeConst(datatype, ((datatype == NAMEOID) ? NAMEDATALEN : -1),
4171                                          conval, false, false);
4172 }
4173
4174 /*
4175  * Generate a Const node of bytea type from a binary C string and a length.
4176  */
4177 static Const *
4178 string_to_bytea_const(const char *str, size_t str_len)
4179 {
4180         bytea      *bstr = palloc(VARHDRSZ + str_len);
4181         Datum           conval;
4182
4183         memcpy(VARDATA(bstr), str, str_len);
4184         VARATT_SIZEP(bstr) = VARHDRSZ + str_len;
4185         conval = PointerGetDatum(bstr);
4186
4187         return makeConst(BYTEAOID, -1, conval, false, false);
4188 }
4189
4190 /*-------------------------------------------------------------------------
4191  *
4192  * Index cost estimation functions
4193  *
4194  * genericcostestimate is a general-purpose estimator for use when we
4195  * don't have any better idea about how to estimate.  Index-type-specific
4196  * knowledge can be incorporated in the type-specific routines.
4197  *
4198  *-------------------------------------------------------------------------
4199  */
4200
4201 static void
4202 genericcostestimate(PlannerInfo *root,
4203                                         IndexOptInfo *index, List *indexQuals,
4204                                         Cost *indexStartupCost,
4205                                         Cost *indexTotalCost,
4206                                         Selectivity *indexSelectivity,
4207                                         double *indexCorrelation)
4208 {
4209         double          numIndexTuples;
4210         double          numIndexPages;
4211         QualCost        index_qual_cost;
4212         double          qual_op_cost;
4213         double          qual_arg_cost;
4214         List       *selectivityQuals;
4215
4216         /*
4217          * If the index is partial, AND the index predicate with the
4218          * explicitly given indexquals to produce a more accurate idea of the
4219          * index selectivity.  This may produce redundant clauses.      We get rid
4220          * of exact duplicates in the code below.  We expect that most cases
4221          * of partial redundancy (such as "x < 4" from the qual and "x < 5"
4222          * from the predicate) will be recognized and handled correctly by
4223          * clauselist_selectivity().  This assumption is somewhat fragile,
4224          * since it depends on pred_test() and clauselist_selectivity() having
4225          * similar capabilities, and there are certainly many cases where we
4226          * will end up with a too-low selectivity estimate.  This will bias
4227          * the system in favor of using partial indexes where possible, which
4228          * is not necessarily a bad thing.      But it'd be nice to do better
4229          * someday.
4230          *
4231          * Note that index->indpred and indexQuals are both in implicit-AND form,
4232          * so ANDing them together just takes merging the lists.  However,
4233          * eliminating duplicates is a bit trickier because indexQuals
4234          * contains RestrictInfo nodes and the indpred does not.  It is okay
4235          * to pass a mixed list to clauselist_selectivity, but we have to work
4236          * a bit to generate a list without logical duplicates.  (We could
4237          * just list_union indpred and strippedQuals, but then we'd not get
4238          * caching of per-qual selectivity estimates.)
4239          */
4240         if (index->indpred != NIL)
4241         {
4242                 List       *strippedQuals;
4243                 List       *predExtraQuals;
4244
4245                 strippedQuals = get_actual_clauses(indexQuals);
4246                 predExtraQuals = list_difference(index->indpred, strippedQuals);
4247                 selectivityQuals = list_concat(predExtraQuals, indexQuals);
4248         }
4249         else
4250                 selectivityQuals = indexQuals;
4251
4252         /* Estimate the fraction of main-table tuples that will be visited */
4253         *indexSelectivity = clauselist_selectivity(root, selectivityQuals,
4254                                                                                            index->rel->relid,
4255                                                                                            JOIN_INNER);
4256
4257         /*
4258          * Estimate the number of tuples that will be visited.  We do it in
4259          * this rather peculiar-looking way in order to get the right answer
4260          * for partial indexes.  We can bound the number of tuples by the
4261          * index size, in any case.
4262          */
4263         numIndexTuples = *indexSelectivity * index->rel->tuples;
4264
4265         if (numIndexTuples > index->tuples)
4266                 numIndexTuples = index->tuples;
4267
4268         /*
4269          * Always estimate at least one tuple is touched, even when
4270          * indexSelectivity estimate is tiny.
4271          */
4272         if (numIndexTuples < 1.0)
4273                 numIndexTuples = 1.0;
4274
4275         /*
4276          * Estimate the number of index pages that will be retrieved.
4277          *
4278          * For all currently-supported index types, the first page of the index
4279          * is a metadata page, and we should figure on fetching that plus a
4280          * pro-rated fraction of the remaining pages.
4281          */
4282         if (index->pages > 1 && index->tuples > 0)
4283         {
4284                 numIndexPages = (numIndexTuples / index->tuples) * (index->pages - 1);
4285                 numIndexPages += 1;             /* count the metapage too */
4286                 numIndexPages = ceil(numIndexPages);
4287         }
4288         else
4289                 numIndexPages = 1.0;
4290
4291         /*
4292          * Compute the index access cost.
4293          *
4294          * Disk cost: our generic assumption is that the index pages will be read
4295          * sequentially, so they have cost 1.0 each, not random_page_cost.
4296          */
4297         *indexTotalCost = numIndexPages;
4298
4299         /*
4300          * CPU cost: any complex expressions in the indexquals will need to be
4301          * evaluated once at the start of the scan to reduce them to runtime
4302          * keys to pass to the index AM (see nodeIndexscan.c).  We model the
4303          * per-tuple CPU costs as cpu_index_tuple_cost plus one
4304          * cpu_operator_cost per indexqual operator.
4305          *
4306          * Note: this neglects the possible costs of rechecking lossy operators
4307          * and OR-clause expressions.  Detecting that that might be needed
4308          * seems more expensive than it's worth, though, considering all the
4309          * other inaccuracies here ...
4310          */
4311         cost_qual_eval(&index_qual_cost, indexQuals);
4312         qual_op_cost = cpu_operator_cost * list_length(indexQuals);
4313         qual_arg_cost = index_qual_cost.startup +
4314                 index_qual_cost.per_tuple - qual_op_cost;
4315         if (qual_arg_cost < 0)          /* just in case... */
4316                 qual_arg_cost = 0;
4317         *indexStartupCost = qual_arg_cost;
4318         *indexTotalCost += qual_arg_cost;
4319         *indexTotalCost += numIndexTuples * (cpu_index_tuple_cost + qual_op_cost);
4320
4321         /*
4322          * Generic assumption about index correlation: there isn't any.
4323          */
4324         *indexCorrelation = 0.0;
4325 }
4326
4327
4328 Datum
4329 btcostestimate(PG_FUNCTION_ARGS)
4330 {
4331         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
4332         IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
4333         List       *indexQuals = (List *) PG_GETARG_POINTER(2);
4334         Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
4335         Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
4336         Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
4337         double     *indexCorrelation = (double *) PG_GETARG_POINTER(6);
4338         Oid                     relid;
4339         AttrNumber      colnum;
4340         HeapTuple       tuple;
4341
4342         genericcostestimate(root, index, indexQuals,
4343                                                 indexStartupCost, indexTotalCost,
4344                                                 indexSelectivity, indexCorrelation);
4345
4346         /*
4347          * If we can get an estimate of the first column's ordering
4348          * correlation C from pg_statistic, estimate the index correlation as
4349          * C for a single-column index, or C * 0.75 for multiple columns.
4350          * (The idea here is that multiple columns dilute the importance of
4351          * the first column's ordering, but don't negate it entirely.  Before
4352          * 8.0 we divided the correlation by the number of columns, but that
4353          * seems too strong.)
4354          */
4355         if (index->indexkeys[0] != 0)
4356         {
4357                 /* Simple variable --- look to stats for the underlying table */
4358                 relid = getrelid(index->rel->relid, root->parse->rtable);
4359                 Assert(relid != InvalidOid);
4360                 colnum = index->indexkeys[0];
4361         }
4362         else
4363         {
4364                 /* Expression --- maybe there are stats for the index itself */
4365                 relid = index->indexoid;
4366                 colnum = 1;
4367         }
4368
4369         tuple = SearchSysCache(STATRELATT,
4370                                                    ObjectIdGetDatum(relid),
4371                                                    Int16GetDatum(colnum),
4372                                                    0, 0);
4373
4374         if (HeapTupleIsValid(tuple))
4375         {
4376                 Oid                     typid;
4377                 int32           typmod;
4378                 float4     *numbers;
4379                 int                     nnumbers;
4380
4381                 /* XXX this code would break with different storage type */
4382                 get_atttypetypmod(relid, colnum, &typid, &typmod);
4383
4384                 if (get_attstatsslot(tuple, typid, typmod,
4385                                                          STATISTIC_KIND_CORRELATION,
4386                                                          index->ordering[0],
4387                                                          NULL, NULL, &numbers, &nnumbers))
4388                 {
4389                         double          varCorrelation;
4390
4391                         Assert(nnumbers == 1);
4392                         varCorrelation = numbers[0];
4393
4394                         if (index->ncolumns > 1)
4395                                 *indexCorrelation = varCorrelation * 0.75;
4396                         else
4397                                 *indexCorrelation = varCorrelation;
4398
4399                         free_attstatsslot(typid, NULL, 0, numbers, nnumbers);
4400                 }
4401                 ReleaseSysCache(tuple);
4402         }
4403
4404         PG_RETURN_VOID();
4405 }
4406
4407 Datum
4408 rtcostestimate(PG_FUNCTION_ARGS)
4409 {
4410         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
4411         IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
4412         List       *indexQuals = (List *) PG_GETARG_POINTER(2);
4413         Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
4414         Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
4415         Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
4416         double     *indexCorrelation = (double *) PG_GETARG_POINTER(6);
4417
4418         genericcostestimate(root, index, indexQuals,
4419                                                 indexStartupCost, indexTotalCost,
4420                                                 indexSelectivity, indexCorrelation);
4421
4422         PG_RETURN_VOID();
4423 }
4424
4425 Datum
4426 hashcostestimate(PG_FUNCTION_ARGS)
4427 {
4428         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
4429         IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
4430         List       *indexQuals = (List *) PG_GETARG_POINTER(2);
4431         Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
4432         Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
4433         Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
4434         double     *indexCorrelation = (double *) PG_GETARG_POINTER(6);
4435
4436         genericcostestimate(root, index, indexQuals,
4437                                                 indexStartupCost, indexTotalCost,
4438                                                 indexSelectivity, indexCorrelation);
4439
4440         PG_RETURN_VOID();
4441 }
4442
4443 Datum
4444 gistcostestimate(PG_FUNCTION_ARGS)
4445 {
4446         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
4447         IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
4448         List       *indexQuals = (List *) PG_GETARG_POINTER(2);
4449         Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
4450         Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
4451         Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
4452         double     *indexCorrelation = (double *) PG_GETARG_POINTER(6);
4453
4454         genericcostestimate(root, index, indexQuals,
4455                                                 indexStartupCost, indexTotalCost,
4456                                                 indexSelectivity, indexCorrelation);
4457
4458         PG_RETURN_VOID();
4459 }