granicus.if.org Git - postgresql/blob - src/backend/utils/adt/selfuncs.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * selfuncs.c
   4  *        Selectivity functions and index cost estimation functions for
   5  *        standard operators and index access methods.
   6  *
   7  *        Selectivity routines are registered in the pg_operator catalog
   8  *        in the "oprrest" and "oprjoin" attributes.
   9  *
  10  *        Index cost functions are registered in the pg_am catalog
  11  *        in the "amcostestimate" attribute.
  12  *
  13  * Portions Copyright (c) 1996-2000, PostgreSQL, Inc
  14  * Portions Copyright (c) 1994, Regents of the University of California
  15  *
  16  *
  17  * IDENTIFICATION
  18  *        $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.66 2000/05/26 17:19:15 tgl Exp $
  19  *
  20  *-------------------------------------------------------------------------
  21  */
  22
  23 #include "postgres.h"
  24
  25 #include <ctype.h>
  26 #include <math.h>
  27
  28 #include "access/heapam.h"
  29 #include "catalog/catname.h"
  30 #include "catalog/pg_operator.h"
  31 #include "catalog/pg_proc.h"
  32 #include "catalog/pg_statistic.h"
  33 #include "catalog/pg_type.h"
  34 #include "mb/pg_wchar.h"
  35 #include "optimizer/cost.h"
  36 #include "parser/parse_func.h"
  37 #include "parser/parse_oper.h"
  38 #include "utils/builtins.h"
  39 #include "utils/int8.h"
  40 #include "utils/lsyscache.h"
  41 #include "utils/syscache.h"
  42
  43 /* N is not a valid var/constant or relation id */
  44 #define NONVALUE(N)             ((N) == 0)
  45
  46 /* are we looking at a functional index selectivity request? */
  47 #define FunctionalSelectivity(nIndKeys,attNum) ((attNum)==InvalidAttrNumber)
  48
  49 /* default selectivity estimate for equalities such as "A = b" */
  50 #define DEFAULT_EQ_SEL  0.01
  51
  52 /* default selectivity estimate for inequalities such as "A < b" */
  53 #define DEFAULT_INEQ_SEL  (1.0 / 3.0)
  54
  55 /* default selectivity estimate for pattern-match operators such as LIKE */
  56 #define DEFAULT_MATCH_SEL       0.01
  57
  58 /* "fudge factor" for estimating frequency of not-most-common values */
  59 #define NOT_MOST_COMMON_RATIO  0.1
  60
  61 static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
  62                                                           Datum lobound, Datum hibound, Oid boundstypid,
  63                                                           double *scaledlobound, double *scaledhibound);
  64 static double convert_numeric_to_scalar(Datum value, Oid typid);
  65 static void convert_string_to_scalar(unsigned char *value,
  66                                                                          double *scaledvalue,
  67                                                                          unsigned char *lobound,
  68                                                                          double *scaledlobound,
  69                                                                          unsigned char *hibound,
  70                                                                          double *scaledhibound);
  71 static double convert_one_string_to_scalar(unsigned char *value,
  72                                                                                    int rangelo, int rangehi);
  73 static unsigned char * convert_string_datum(Datum value, Oid typid);
  74 static double convert_timevalue_to_scalar(Datum value, Oid typid);
  75 static void getattproperties(Oid relid, AttrNumber attnum,
  76                                  Oid *typid,
  77                                  int *typlen,
  78                                  bool *typbyval,
  79                                  int32 *typmod);
  80 static bool getattstatistics(Oid relid, AttrNumber attnum,
  81                                  Oid typid, int32 typmod,
  82                                  double *nullfrac,
  83                                  double *commonfrac,
  84                                  Datum *commonval,
  85                                  Datum *loval,
  86                                  Datum *hival);
  87 static Selectivity prefix_selectivity(char *prefix,
  88                                                                           Oid relid,
  89                                                                           AttrNumber attno,
  90                                                                           Oid datatype);
  91 static Selectivity pattern_selectivity(char *patt, Pattern_Type ptype);
  92 static bool string_lessthan(const char *str1, const char *str2,
  93                                 Oid datatype);
  94 static Oid      find_operator(const char *opname, Oid datatype);
  95 static Datum string_to_datum(const char *str, Oid datatype);
  96
  97
  98 /*
  99  *              eqsel                   - Selectivity of "=" for any data types.
 100  *
 101  * Note: this routine is also used to estimate selectivity for some
 102  * operators that are not "=" but have comparable selectivity behavior,
 103  * such as "~=" (geometric approximate-match).  Even for "=", we must
 104  * keep in mind that the left and right datatypes may differ, so the type
 105  * of the given constant "value" may be different from the type of the
 106  * attribute.
 107  */
 108 float64
 109 eqsel(Oid opid,
 110           Oid relid,
 111           AttrNumber attno,
 112           Datum value,
 113           int32 flag)
 114 {
 115         float64         result;
 116
 117         result = (float64) palloc(sizeof(float64data));
 118         if (NONVALUE(attno) || NONVALUE(relid))
 119                 *result = DEFAULT_EQ_SEL;
 120         else
 121         {
 122                 Oid                     typid;
 123                 int                     typlen;
 124                 bool            typbyval;
 125                 int32           typmod;
 126                 double          nullfrac;
 127                 double          commonfrac;
 128                 Datum           commonval;
 129                 double          selec;
 130
 131                 /* get info about the attribute */
 132                 getattproperties(relid, attno,
 133                                                  &typid, &typlen, &typbyval, &typmod);
 134
 135                 /* get stats for the attribute, if available */
 136                 if (getattstatistics(relid, attno, typid, typmod,
 137                                                          &nullfrac, &commonfrac, &commonval,
 138                                                          NULL, NULL))
 139                 {
 140                         if (flag & SEL_CONSTANT)
 141                         {
 142
 143                                 /*
 144                                  * Is the constant "=" to the column's most common value?
 145                                  * (Although the operator may not really be "=", we will
 146                                  * assume that seeing whether it returns TRUE for the most
 147                                  * common value is useful information. If you don't like
 148                                  * it, maybe you shouldn't be using eqsel for your
 149                                  * operator...)
 150                                  */
 151                                 RegProcedure eqproc = get_opcode(opid);
 152                                 bool            mostcommon;
 153
 154                                 if (eqproc == (RegProcedure) NULL)
 155                                         elog(ERROR, "eqsel: no procedure for operator %u",
 156                                                  opid);
 157
 158                                 /* be careful to apply operator right way 'round */
 159                                 if (flag & SEL_RIGHT)
 160                                         mostcommon = (bool)
 161                                                 DatumGetUInt8(fmgr(eqproc, commonval, value));
 162                                 else
 163                                         mostcommon = (bool)
 164                                                 DatumGetUInt8(fmgr(eqproc, value, commonval));
 165
 166                                 if (mostcommon)
 167                                 {
 168
 169                                         /*
 170                                          * Constant is "=" to the most common value.  We know
 171                                          * selectivity exactly (or as exactly as VACUUM could
 172                                          * calculate it, anyway).
 173                                          */
 174                                         selec = commonfrac;
 175                                 }
 176                                 else
 177                                 {
 178
 179                                         /*
 180                                          * Comparison is against a constant that is neither
 181                                          * the most common value nor null.      Its selectivity
 182                                          * cannot be more than this:
 183                                          */
 184                                         selec = 1.0 - commonfrac - nullfrac;
 185                                         if (selec > commonfrac)
 186                                                 selec = commonfrac;
 187
 188                                         /*
 189                                          * and in fact it's probably less, so we should apply
 190                                          * a fudge factor.      The only case where we don't is
 191                                          * for a boolean column, where indeed we have
 192                                          * estimated the less-common value's frequency
 193                                          * exactly!
 194                                          */
 195                                         if (typid != BOOLOID)
 196                                                 selec *= NOT_MOST_COMMON_RATIO;
 197                                 }
 198                         }
 199                         else
 200                         {
 201
 202                                 /*
 203                                  * Search is for a value that we do not know a priori, but
 204                                  * we will assume it is not NULL.  Selectivity cannot be
 205                                  * more than this:
 206                                  */
 207                                 selec = 1.0 - nullfrac;
 208                                 if (selec > commonfrac)
 209                                         selec = commonfrac;
 210
 211                                 /*
 212                                  * and in fact it's probably less, so apply a fudge
 213                                  * factor.
 214                                  */
 215                                 selec *= NOT_MOST_COMMON_RATIO;
 216                         }
 217
 218                         /* result should be in range, but make sure... */
 219                         if (selec < 0.0)
 220                                 selec = 0.0;
 221                         else if (selec > 1.0)
 222                                 selec = 1.0;
 223
 224                         if (!typbyval)
 225                                 pfree(DatumGetPointer(commonval));
 226                 }
 227                 else
 228                 {
 229
 230                         /*
 231                          * No VACUUM ANALYZE stats available, so make a guess using
 232                          * the disbursion stat (if we have that, which is unlikely for
 233                          * a normal attribute; but for a system attribute we may be
 234                          * able to estimate it).
 235                          */
 236                         selec = get_attdisbursion(relid, attno, 0.01);
 237                 }
 238
 239                 *result = (float64data) selec;
 240         }
 241         return result;
 242 }
 243
 244 /*
 245  *              neqsel                  - Selectivity of "!=" for any data types.
 246  *
 247  * This routine is also used for some operators that are not "!="
 248  * but have comparable selectivity behavior.  See above comments
 249  * for eqsel().
 250  */
 251 float64
 252 neqsel(Oid opid,
 253            Oid relid,
 254            AttrNumber attno,
 255            Datum value,
 256            int32 flag)
 257 {
 258         float64         result;
 259
 260         result = eqsel(opid, relid, attno, value, flag);
 261         *result = 1.0 - *result;
 262         return result;
 263 }
 264
 265 /*
 266  *              scalarltsel             - Selectivity of "<" (also "<=") for scalars.
 267  *
 268  * This routine works for any datatype (or pair of datatypes) known to
 269  * convert_to_scalar().  If it is applied to some other datatype,
 270  * it will return a default estimate.
 271  */
 272 float64
 273 scalarltsel(Oid opid,
 274                         Oid relid,
 275                         AttrNumber attno,
 276                         Datum value,
 277                         int32 flag)
 278 {
 279         float64         result;
 280
 281         result = (float64) palloc(sizeof(float64data));
 282         if (!(flag & SEL_CONSTANT) || NONVALUE(attno) || NONVALUE(relid))
 283                 *result = DEFAULT_INEQ_SEL;
 284         else
 285         {
 286                 HeapTuple       oprtuple;
 287                 Oid                     ltype,
 288                                         rtype,
 289                                         contype;
 290                 Oid                     typid;
 291                 int                     typlen;
 292                 bool            typbyval;
 293                 int32           typmod;
 294                 Datum           hival,
 295                                         loval;
 296                 double          val,
 297                                         high,
 298                                         low,
 299                                         numerator,
 300                                         denominator;
 301
 302                 /*
 303                  * Get left and right datatypes of the operator so we know what
 304                  * type the constant is.
 305                  */
 306                 oprtuple = get_operator_tuple(opid);
 307                 if (!HeapTupleIsValid(oprtuple))
 308                         elog(ERROR, "scalarltsel: no tuple for operator %u", opid);
 309                 ltype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprleft;
 310                 rtype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprright;
 311                 contype = (flag & SEL_RIGHT) ? rtype : ltype;
 312
 313                 /* Now get info and stats about the attribute */
 314                 getattproperties(relid, attno,
 315                                                  &typid, &typlen, &typbyval, &typmod);
 316
 317                 if (!getattstatistics(relid, attno, typid, typmod,
 318                                                           NULL, NULL, NULL,
 319                                                           &loval, &hival))
 320                 {
 321                         /* no stats available, so default result */
 322                         *result = DEFAULT_INEQ_SEL;
 323                         return result;
 324                 }
 325
 326                 /* Convert the values to a uniform comparison scale. */
 327                 if (!convert_to_scalar(value, contype, &val,
 328                                                            loval, hival, typid,
 329                                                            &low, &high))
 330                 {
 331
 332                         /*
 333                          * Ideally we'd produce an error here, on the grounds that the
 334                          * given operator shouldn't have scalarltsel registered as its
 335                          * selectivity func unless we can deal with its operand types.
 336                          * But currently, all manner of stuff is invoking scalarltsel,
 337                          * so give a default estimate until that can be fixed.
 338                          */
 339                         if (!typbyval)
 340                         {
 341                                 pfree(DatumGetPointer(hival));
 342                                 pfree(DatumGetPointer(loval));
 343                         }
 344                         *result = DEFAULT_INEQ_SEL;
 345                         return result;
 346                 }
 347
 348                 /* release temp storage if needed */
 349                 if (!typbyval)
 350                 {
 351                         pfree(DatumGetPointer(hival));
 352                         pfree(DatumGetPointer(loval));
 353                 }
 354
 355                 if (high <= low)
 356                 {
 357
 358                         /*
 359                          * If we trusted the stats fully, we could return a small or
 360                          * large selec depending on which side of the single data
 361                          * point the constant is on.  But it seems better to assume
 362                          * that the stats are wrong and return a default...
 363                          */
 364                         *result = DEFAULT_INEQ_SEL;
 365                 }
 366                 else if (val < low || val > high)
 367                 {
 368
 369                         /*
 370                          * If given value is outside the statistical range, return a
 371                          * small or large value; but not 0.0/1.0 since there is a
 372                          * chance the stats are out of date.
 373                          */
 374                         if (flag & SEL_RIGHT)
 375                                 *result = (val < low) ? 0.001 : 0.999;
 376                         else
 377                                 *result = (val < low) ? 0.999 : 0.001;
 378                 }
 379                 else
 380                 {
 381                         denominator = high - low;
 382                         if (flag & SEL_RIGHT)
 383                                 numerator = val - low;
 384                         else
 385                                 numerator = high - val;
 386                         *result = numerator / denominator;
 387                 }
 388         }
 389         return result;
 390 }
 391
 392 /*
 393  *              scalargtsel             - Selectivity of ">" (also ">=") for integers.
 394  *
 395  * See above comments for scalarltsel.
 396  */
 397 float64
 398 scalargtsel(Oid opid,
 399                         Oid relid,
 400                         AttrNumber attno,
 401                         Datum value,
 402                         int32 flag)
 403 {
 404         float64         result;
 405
 406         /*
 407          * Compute selectivity of "<", then invert --- but only if we were
 408          * able to produce a non-default estimate.
 409          */
 410         result = scalarltsel(opid, relid, attno, value, flag);
 411         if (*result != DEFAULT_INEQ_SEL)
 412                 *result = 1.0 - *result;
 413         return result;
 414 }
 415
 416 /*
 417  * patternsel                   - Generic code for pattern-match selectivity.
 418  */
 419 static float64
 420 patternsel(Oid opid,
 421                    Pattern_Type ptype,
 422                    Oid relid,
 423                    AttrNumber attno,
 424                    Datum value,
 425                    int32 flag)
 426 {
 427         float64         result;
 428
 429         result = (float64) palloc(sizeof(float64data));
 430         /* Must have a constant for the pattern, or cannot learn anything */
 431         if ((flag & (SEL_CONSTANT | SEL_RIGHT)) != (SEL_CONSTANT | SEL_RIGHT))
 432                 *result = DEFAULT_MATCH_SEL;
 433         else
 434         {
 435                 HeapTuple       oprtuple;
 436                 Oid                     ltype,
 437                                         rtype;
 438                 char       *patt;
 439                 Pattern_Prefix_Status pstatus;
 440                 char       *prefix;
 441                 char       *rest;
 442
 443                 /*
 444                  * Get left and right datatypes of the operator so we know what
 445                  * type the attribute is.
 446                  */
 447                 oprtuple = get_operator_tuple(opid);
 448                 if (!HeapTupleIsValid(oprtuple))
 449                         elog(ERROR, "patternsel: no tuple for operator %u", opid);
 450                 ltype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprleft;
 451                 rtype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprright;
 452
 453                 /* the right-hand const is type text for all supported operators */
 454                 Assert(rtype == TEXTOID);
 455                 patt = textout((text *) DatumGetPointer(value));
 456
 457                 /* divide pattern into fixed prefix and remainder */
 458                 pstatus = pattern_fixed_prefix(patt, ptype, &prefix, &rest);
 459
 460                 if (pstatus == Pattern_Prefix_Exact)
 461                 {
 462                         /* Pattern specifies an exact match, so pretend operator is '=' */
 463                         Oid             eqopr = find_operator("=", ltype);
 464                         Datum   eqcon;
 465
 466                         if (eqopr == InvalidOid)
 467                                 elog(ERROR, "patternsel: no = operator for type %u", ltype);
 468                         eqcon = string_to_datum(prefix, ltype);
 469                         result = eqsel(eqopr, relid, attno, eqcon, SEL_CONSTANT|SEL_RIGHT);
 470                         pfree(DatumGetPointer(eqcon));
 471                 }
 472                 else
 473                 {
 474                         /*
 475                          * Not exact-match pattern.  We estimate selectivity of the
 476                          * fixed prefix and remainder of pattern separately, then
 477                          * combine the two.
 478                          */
 479                         Selectivity prefixsel;
 480                         Selectivity restsel;
 481                         Selectivity selec;
 482
 483                         if (pstatus == Pattern_Prefix_Partial)
 484                                 prefixsel = prefix_selectivity(prefix, relid, attno, ltype);
 485                         else
 486                                 prefixsel = 1.0;
 487                         restsel = pattern_selectivity(rest, ptype);
 488                         selec = prefixsel * restsel;
 489                         /* result should be in range, but make sure... */
 490                         if (selec < 0.0)
 491                                 selec = 0.0;
 492                         else if (selec > 1.0)
 493                                 selec = 1.0;
 494                         *result = (float64data) selec;
 495                 }
 496                 if (prefix)
 497                         pfree(prefix);
 498                 pfree(patt);
 499         }
 500         return result;
 501 }
 502
 503 /*
 504  *              regexeqsel              - Selectivity of regular-expression pattern match.
 505  */
 506 float64
 507 regexeqsel(Oid opid,
 508                    Oid relid,
 509                    AttrNumber attno,
 510                    Datum value,
 511                    int32 flag)
 512 {
 513         return patternsel(opid, Pattern_Type_Regex, relid, attno, value, flag);
 514 }
 515
 516 /*
 517  *              icregexeqsel    - Selectivity of case-insensitive regex match.
 518  */
 519 float64
 520 icregexeqsel(Oid opid,
 521                          Oid relid,
 522                          AttrNumber attno,
 523                          Datum value,
 524                          int32 flag)
 525 {
 526         return patternsel(opid, Pattern_Type_Regex_IC, relid, attno, value, flag);
 527 }
 528
 529 /*
 530  *              likesel                 - Selectivity of LIKE pattern match.
 531  */
 532 float64
 533 likesel(Oid opid,
 534                 Oid relid,
 535                 AttrNumber attno,
 536                 Datum value,
 537                 int32 flag)
 538 {
 539         return patternsel(opid, Pattern_Type_Like, relid, attno, value, flag);
 540 }
 541
 542 /*
 543  *              regexnesel              - Selectivity of regular-expression pattern non-match.
 544  */
 545 float64
 546 regexnesel(Oid opid,
 547                    Oid relid,
 548                    AttrNumber attno,
 549                    Datum value,
 550                    int32 flag)
 551 {
 552         float64         result;
 553
 554         result = patternsel(opid, Pattern_Type_Regex, relid, attno, value, flag);
 555         *result = 1.0 - *result;
 556         return result;
 557 }
 558
 559 /*
 560  *              icregexnesel    - Selectivity of case-insensitive regex non-match.
 561  */
 562 float64
 563 icregexnesel(Oid opid,
 564                          Oid relid,
 565                          AttrNumber attno,
 566                          Datum value,
 567                          int32 flag)
 568 {
 569         float64         result;
 570
 571         result = patternsel(opid, Pattern_Type_Regex_IC, relid, attno, value, flag);
 572         *result = 1.0 - *result;
 573         return result;
 574 }
 575
 576 /*
 577  *              nlikesel                - Selectivity of LIKE pattern non-match.
 578  */
 579 float64
 580 nlikesel(Oid opid,
 581                  Oid relid,
 582                  AttrNumber attno,
 583                  Datum value,
 584                  int32 flag)
 585 {
 586         float64         result;
 587
 588         result = patternsel(opid, Pattern_Type_Like, relid, attno, value, flag);
 589         *result = 1.0 - *result;
 590         return result;
 591 }
 592
 593 /*
 594  *              eqjoinsel               - Join selectivity of "="
 595  */
 596 float64
 597 eqjoinsel(Oid opid,
 598                   Oid relid1,
 599                   AttrNumber attno1,
 600                   Oid relid2,
 601                   AttrNumber attno2)
 602 {
 603         float64         result;
 604         float64data num1,
 605                                 num2,
 606                                 min;
 607         bool            unknown1 = NONVALUE(relid1) || NONVALUE(attno1);
 608         bool            unknown2 = NONVALUE(relid2) || NONVALUE(attno2);
 609
 610         result = (float64) palloc(sizeof(float64data));
 611         if (unknown1 && unknown2)
 612                 *result = DEFAULT_EQ_SEL;
 613         else
 614         {
 615                 num1 = unknown1 ? 1.0 : get_attdisbursion(relid1, attno1, 0.01);
 616                 num2 = unknown2 ? 1.0 : get_attdisbursion(relid2, attno2, 0.01);
 617
 618                 /*
 619                  * The join selectivity cannot be more than num2, since each tuple
 620                  * in table 1 could match no more than num2 fraction of tuples in
 621                  * table 2 (and that's only if the table-1 tuple matches the most
 622                  * common value in table 2, so probably it's less).  By the same
 623                  * reasoning it is not more than num1. The min is therefore an
 624                  * upper bound.
 625                  *
 626                  * If we know the disbursion of only one side, use it; the reasoning
 627                  * above still works.
 628                  *
 629                  * XXX can we make a better estimate here?      Using the nullfrac
 630                  * statistic might be helpful, for example.  Assuming the operator
 631                  * is strict (does not succeed for null inputs) then the
 632                  * selectivity couldn't be more than (1-nullfrac1)*(1-nullfrac2),
 633                  * which might be usefully small if there are many nulls.  How
 634                  * about applying the operator to the most common values?
 635                  */
 636                 min = (num1 < num2) ? num1 : num2;
 637                 *result = min;
 638         }
 639         return result;
 640 }
 641
 642 /*
 643  *              neqjoinsel              - Join selectivity of "!="
 644  */
 645 float64
 646 neqjoinsel(Oid opid,
 647                    Oid relid1,
 648                    AttrNumber attno1,
 649                    Oid relid2,
 650                    AttrNumber attno2)
 651 {
 652         float64         result;
 653
 654         result = eqjoinsel(opid, relid1, attno1, relid2, attno2);
 655         *result = 1.0 - *result;
 656         return result;
 657 }
 658
 659 /*
 660  *              scalarltjoinsel - Join selectivity of "<" and "<=" for scalars
 661  */
 662 float64
 663 scalarltjoinsel(Oid opid,
 664                                 Oid relid1,
 665                                 AttrNumber attno1,
 666                                 Oid relid2,
 667                                 AttrNumber attno2)
 668 {
 669         float64         result;
 670
 671         result = (float64) palloc(sizeof(float64data));
 672         *result = DEFAULT_INEQ_SEL;
 673         return result;
 674 }
 675
 676 /*
 677  *              scalargtjoinsel - Join selectivity of ">" and ">=" for scalars
 678  */
 679 float64
 680 scalargtjoinsel(Oid opid,
 681                                 Oid relid1,
 682                                 AttrNumber attno1,
 683                                 Oid relid2,
 684                                 AttrNumber attno2)
 685 {
 686         float64         result;
 687
 688         result = (float64) palloc(sizeof(float64data));
 689         *result = DEFAULT_INEQ_SEL;
 690         return result;
 691 }
 692
 693 /*
 694  *              regexeqjoinsel  - Join selectivity of regular-expression pattern match.
 695  */
 696 float64
 697 regexeqjoinsel(Oid opid,
 698                            Oid relid1,
 699                            AttrNumber attno1,
 700                            Oid relid2,
 701                            AttrNumber attno2)
 702 {
 703         float64         result;
 704
 705         result = (float64) palloc(sizeof(float64data));
 706         *result = DEFAULT_MATCH_SEL;
 707         return result;
 708 }
 709
 710 /*
 711  *              icregexeqjoinsel        - Join selectivity of case-insensitive regex match.
 712  */
 713 float64
 714 icregexeqjoinsel(Oid opid,
 715                                  Oid relid1,
 716                                  AttrNumber attno1,
 717                                  Oid relid2,
 718                                  AttrNumber attno2)
 719 {
 720         float64         result;
 721
 722         result = (float64) palloc(sizeof(float64data));
 723         *result = DEFAULT_MATCH_SEL;
 724         return result;
 725 }
 726
 727 /*
 728  *              likejoinsel                     - Join selectivity of LIKE pattern match.
 729  */
 730 float64
 731 likejoinsel(Oid opid,
 732                         Oid relid1,
 733                         AttrNumber attno1,
 734                         Oid relid2,
 735                         AttrNumber attno2)
 736 {
 737         float64         result;
 738
 739         result = (float64) palloc(sizeof(float64data));
 740         *result = DEFAULT_MATCH_SEL;
 741         return result;
 742 }
 743
 744 /*
 745  *              regexnejoinsel  - Join selectivity of regex non-match.
 746  */
 747 float64
 748 regexnejoinsel(Oid opid,
 749                            Oid relid1,
 750                            AttrNumber attno1,
 751                            Oid relid2,
 752                            AttrNumber attno2)
 753 {
 754         float64         result;
 755
 756         result = regexeqjoinsel(opid, relid1, attno1, relid2, attno2);
 757         *result = 1.0 - *result;
 758         return result;
 759 }
 760
 761 /*
 762  *              icregexnejoinsel        - Join selectivity of case-insensitive regex non-match.
 763  */
 764 float64
 765 icregexnejoinsel(Oid opid,
 766                                  Oid relid1,
 767                                  AttrNumber attno1,
 768                                  Oid relid2,
 769                                  AttrNumber attno2)
 770 {
 771         float64         result;
 772
 773         result = icregexeqjoinsel(opid, relid1, attno1, relid2, attno2);
 774         *result = 1.0 - *result;
 775         return result;
 776 }
 777
 778 /*
 779  *              nlikejoinsel            - Join selectivity of LIKE pattern non-match.
 780  */
 781 float64
 782 nlikejoinsel(Oid opid,
 783                          Oid relid1,
 784                          AttrNumber attno1,
 785                          Oid relid2,
 786                          AttrNumber attno2)
 787 {
 788         float64         result;
 789
 790         result = likejoinsel(opid, relid1, attno1, relid2, attno2);
 791         *result = 1.0 - *result;
 792         return result;
 793 }
 794
 795
 796 /*
 797  * convert_to_scalar
 798  *        Convert non-NULL values of the indicated types to the comparison
 799  *        scale needed by scalarltsel()/scalargtsel().
 800  *        Returns "true" if successful.
 801  *
 802  * All numeric datatypes are simply converted to their equivalent
 803  * "double" values.
 804  *
 805  * String datatypes are converted by convert_string_to_scalar(),
 806  * which is explained below.  The reason why this routine deals with
 807  * three values at a time, not just one, is that we need it for strings.
 808  *
 809  * The several datatypes representing absolute times are all converted
 810  * to Timestamp, which is actually a double, and then we just use that
 811  * double value.  Note this will give bad results for the various "special"
 812  * values of Timestamp --- what can we do with those?
 813  *
 814  * The several datatypes representing relative times (intervals) are all
 815  * converted to measurements expressed in seconds.
 816  */
 817 static bool
 818 convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
 819                                   Datum lobound, Datum hibound, Oid boundstypid,
 820                                   double *scaledlobound, double *scaledhibound)
 821 {
 822         switch (valuetypid)
 823         {
 824
 825                 /*
 826                  * Built-in numeric types
 827                  */
 828                 case BOOLOID:
 829                 case INT2OID:
 830                 case INT4OID:
 831                 case INT8OID:
 832                 case FLOAT4OID:
 833                 case FLOAT8OID:
 834                 case NUMERICOID:
 835                 case OIDOID:
 836                 case REGPROCOID:
 837                         *scaledvalue = convert_numeric_to_scalar(value, valuetypid);
 838                         *scaledlobound = convert_numeric_to_scalar(lobound, boundstypid);
 839                         *scaledhibound = convert_numeric_to_scalar(hibound, boundstypid);
 840                         return true;
 841
 842                 /*
 843                  * Built-in string types
 844                  */
 845                 case CHAROID:
 846                 case BPCHAROID:
 847                 case VARCHAROID:
 848                 case TEXTOID:
 849                 case NAMEOID:
 850                 {
 851                         unsigned char *valstr = convert_string_datum(value, valuetypid);
 852                         unsigned char *lostr = convert_string_datum(lobound, boundstypid);
 853                         unsigned char *histr = convert_string_datum(hibound, boundstypid);
 854
 855                         convert_string_to_scalar(valstr, scaledvalue,
 856                                                                          lostr, scaledlobound,
 857                                                                          histr, scaledhibound);
 858                         pfree(valstr);
 859                         pfree(lostr);
 860                         pfree(histr);
 861                         return true;
 862                 }
 863
 864                 /*
 865                  * Built-in time types
 866                  */
 867                 case TIMESTAMPOID:
 868                 case ABSTIMEOID:
 869                 case DATEOID:
 870                 case INTERVALOID:
 871                 case RELTIMEOID:
 872                 case TINTERVALOID:
 873                 case TIMEOID:
 874                         *scaledvalue = convert_timevalue_to_scalar(value, valuetypid);
 875                         *scaledlobound = convert_timevalue_to_scalar(lobound, boundstypid);
 876                         *scaledhibound = convert_timevalue_to_scalar(hibound, boundstypid);
 877                         return true;
 878         }
 879         /* Don't know how to convert */
 880         return false;
 881 }
 882
 883 /*
 884  * Do convert_to_scalar()'s work for any numeric data type.
 885  */
 886 static double
 887 convert_numeric_to_scalar(Datum value, Oid typid)
 888 {
 889         switch (typid)
 890         {
 891                 case BOOLOID:
 892                         return (double) DatumGetUInt8(value);
 893                 case INT2OID:
 894                         return (double) DatumGetInt16(value);
 895                 case INT4OID:
 896                         return (double) DatumGetInt32(value);
 897                 case INT8OID:
 898                         return (double) (*i8tod((int64 *) DatumGetPointer(value)));
 899                 case FLOAT4OID:
 900                         return (double) (*DatumGetFloat32(value));
 901                 case FLOAT8OID:
 902                         return (double) (*DatumGetFloat64(value));
 903                 case NUMERICOID:
 904                         return (double) (*numeric_float8((Numeric) DatumGetPointer(value)));
 905                 case OIDOID:
 906                 case REGPROCOID:
 907                         /* we can treat OIDs as integers... */
 908                         return (double) DatumGetObjectId(value);
 909         }
 910         /* Can't get here unless someone tries to use scalarltsel/scalargtsel
 911          * on an operator with one numeric and one non-numeric operand.
 912          */
 913         elog(ERROR, "convert_numeric_to_scalar: unsupported type %u", typid);
 914         return 0;
 915 }
 916
 917 /*
 918  * Do convert_to_scalar()'s work for any character-string data type.
 919  *
 920  * String datatypes are converted to a scale that ranges from 0 to 1,
 921  * where we visualize the bytes of the string as fractional digits.
 922  *
 923  * We do not want the base to be 256, however, since that tends to
 924  * generate inflated selectivity estimates; few databases will have
 925  * occurrences of all 256 possible byte values at each position.
 926  * Instead, use the smallest and largest byte values seen in the bounds
 927  * as the estimated range for each byte, after some fudging to deal with
 928  * the fact that we probably aren't going to see the full range that way.
 929  *
 930  * An additional refinement is that we discard any common prefix of the
 931  * three strings before computing the scaled values.  This allows us to
 932  * "zoom in" when we encounter a narrow data range.  An example is a phone
 933  * number database where all the values begin with the same area code.
 934  */
 935 static void
 936 convert_string_to_scalar(unsigned char *value,
 937                                                  double *scaledvalue,
 938                                                  unsigned char *lobound,
 939                                                  double *scaledlobound,
 940                                                  unsigned char *hibound,
 941                                                  double *scaledhibound)
 942 {
 943         int                     rangelo,
 944                                 rangehi;
 945         unsigned char *sptr;
 946
 947         rangelo = rangehi = hibound[0];
 948         for (sptr = lobound; *sptr; sptr++)
 949         {
 950                 if (rangelo > *sptr)
 951                         rangelo = *sptr;
 952                 if (rangehi < *sptr)
 953                         rangehi = *sptr;
 954         }
 955         for (sptr = hibound; *sptr; sptr++)
 956         {
 957                 if (rangelo > *sptr)
 958                         rangelo = *sptr;
 959                 if (rangehi < *sptr)
 960                         rangehi = *sptr;
 961         }
 962         /* If range includes any upper-case ASCII chars, make it include all */
 963         if (rangelo <= 'Z' && rangehi >= 'A')
 964         {
 965                 if (rangelo > 'A')
 966                         rangelo = 'A';
 967                 if (rangehi < 'Z')
 968                         rangehi = 'Z';
 969         }
 970         /* Ditto lower-case */
 971         if (rangelo <= 'z' && rangehi >= 'a')
 972         {
 973                 if (rangelo > 'a')
 974                         rangelo = 'a';
 975                 if (rangehi < 'z')
 976                         rangehi = 'z';
 977         }
 978         /* Ditto digits */
 979         if (rangelo <= '9' && rangehi >= '0')
 980         {
 981                 if (rangelo > '0')
 982                         rangelo = '0';
 983                 if (rangehi < '9')
 984                         rangehi = '9';
 985         }
 986         /* If range includes less than 10 chars, assume we have not got enough
 987          * data, and make it include regular ASCII set.
 988          */
 989         if (rangehi - rangelo < 9)
 990         {
 991                 rangelo = ' ';
 992                 rangehi = 127;
 993         }
 994
 995         /*
 996          * Now strip any common prefix of the three strings.
 997          */
 998         while (*lobound)
 999         {
1000                 if (*lobound != *hibound || *lobound != *value)
1001                         break;
1002                 lobound++, hibound++, value++;
1003         }
1004
1005         /*
1006          * Now we can do the conversions.
1007          */
1008         *scaledvalue = convert_one_string_to_scalar(value, rangelo, rangehi);
1009         *scaledlobound = convert_one_string_to_scalar(lobound, rangelo, rangehi);
1010         *scaledhibound = convert_one_string_to_scalar(hibound, rangelo, rangehi);
1011 }
1012
1013 static double
1014 convert_one_string_to_scalar(unsigned char *value, int rangelo, int rangehi)
1015 {
1016         int                     slen = strlen((char *) value);
1017         double          num,
1018                                 denom,
1019                                 base;
1020
1021         if (slen <= 0)
1022                 return 0.0;                             /* empty string has scalar value 0 */
1023
1024         /* Since base is at least 10, need not consider more than about 20 chars */
1025         if (slen > 20)
1026                 slen = 20;
1027
1028         /* Convert initial characters to fraction */
1029         base = rangehi - rangelo + 1;
1030         num = 0.0;
1031         denom = base;
1032         while (slen-- > 0)
1033         {
1034                 int             ch = *value++;
1035
1036                 if (ch < rangelo)
1037                         ch = rangelo-1;
1038                 else if (ch > rangehi)
1039                         ch = rangehi+1;
1040                 num += ((double) (ch - rangelo)) / denom;
1041                 denom *= base;
1042         }
1043
1044         return num;
1045 }
1046
1047 /*
1048  * Convert a string-type Datum into a palloc'd, null-terminated string.
1049  *
1050  * If USE_LOCALE is defined, we must pass the string through strxfrm()
1051  * before continuing, so as to generate correct locale-specific results.
1052  */
1053 static unsigned char *
1054 convert_string_datum(Datum value, Oid typid)
1055 {
1056         char       *val;
1057 #ifdef USE_LOCALE
1058         char       *xfrmstr;
1059         size_t          xfrmsize;
1060         size_t          xfrmlen;
1061 #endif
1062
1063         switch (typid)
1064         {
1065                 case CHAROID:
1066                         val = (char *) palloc(2);
1067                         val[0] = DatumGetChar(value);
1068                         val[1] = '\0';
1069                         break;
1070                 case BPCHAROID:
1071                 case VARCHAROID:
1072                 case TEXTOID:
1073                 {
1074                         char       *str = (char *) VARDATA(DatumGetPointer(value));
1075                         int                     strlength = VARSIZE(DatumGetPointer(value)) - VARHDRSZ;
1076
1077                         val = (char *) palloc(strlength+1);
1078                         memcpy(val, str, strlength);
1079                         val[strlength] = '\0';
1080                         break;
1081                 }
1082                 case NAMEOID:
1083                 {
1084                         NameData   *nm = (NameData *) DatumGetPointer(value);
1085
1086                         val = pstrdup(NameStr(*nm));
1087                         break;
1088                 }
1089                 default:
1090                         /* Can't get here unless someone tries to use scalarltsel
1091                          * on an operator with one string and one non-string operand.
1092                          */
1093                         elog(ERROR, "convert_string_datum: unsupported type %u", typid);
1094                         return NULL;
1095         }
1096
1097 #ifdef USE_LOCALE
1098         /* Guess that transformed string is not much bigger than original */
1099         xfrmsize = strlen(val) + 32;            /* arbitrary pad value here... */
1100         xfrmstr = (char *) palloc(xfrmsize);
1101         xfrmlen = strxfrm(xfrmstr, val, xfrmsize);
1102         if (xfrmlen >= xfrmsize)
1103         {
1104                 /* Oops, didn't make it */
1105                 pfree(xfrmstr);
1106                 xfrmstr = (char *) palloc(xfrmlen + 1);
1107                 xfrmlen = strxfrm(xfrmstr, val, xfrmlen + 1);
1108         }
1109         pfree(val);
1110         val = xfrmstr;
1111 #endif
1112
1113         return (unsigned char *) val;
1114 }
1115
1116 /*
1117  * Do convert_to_scalar()'s work for any timevalue data type.
1118  */
1119 static double
1120 convert_timevalue_to_scalar(Datum value, Oid typid)
1121 {
1122         switch (typid)
1123         {
1124                 case TIMESTAMPOID:
1125                         return *((Timestamp *) DatumGetPointer(value));
1126                 case ABSTIMEOID:
1127                         return *abstime_timestamp(value);
1128                 case DATEOID:
1129                         return *date_timestamp(value);
1130                 case INTERVALOID:
1131                 {
1132                         Interval   *interval = (Interval *) DatumGetPointer(value);
1133
1134                         /*
1135                          * Convert the month part of Interval to days using
1136                          * assumed average month length of 365.25/12.0 days.  Not
1137                          * too accurate, but plenty good enough for our purposes.
1138                          */
1139                         return interval->time +
1140                                 interval->month * (365.25 / 12.0 * 24.0 * 60.0 * 60.0);
1141                 }
1142                 case RELTIMEOID:
1143                         return (RelativeTime) DatumGetInt32(value);
1144                 case TINTERVALOID:
1145                 {
1146                         TimeInterval interval = (TimeInterval) DatumGetPointer(value);
1147
1148                         if (interval->status != 0)
1149                                 return interval->data[1] - interval->data[0];
1150                         return 0;                       /* for lack of a better idea */
1151                 }
1152                 case TIMEOID:
1153                         return *((TimeADT *) DatumGetPointer(value));
1154         }
1155         /* Can't get here unless someone tries to use scalarltsel/scalargtsel
1156          * on an operator with one timevalue and one non-timevalue operand.
1157          */
1158         elog(ERROR, "convert_timevalue_to_scalar: unsupported type %u", typid);
1159         return 0;
1160 }
1161
1162
1163 /*
1164  * getattproperties
1165  *        Retrieve pg_attribute properties for an attribute,
1166  *        including type OID, type len, type byval flag, typmod.
1167  */
1168 static void
1169 getattproperties(Oid relid, AttrNumber attnum,
1170                                  Oid *typid, int *typlen, bool *typbyval, int32 *typmod)
1171 {
1172         HeapTuple       atp;
1173         Form_pg_attribute att_tup;
1174
1175         atp = SearchSysCacheTuple(ATTNUM,
1176                                                           ObjectIdGetDatum(relid),
1177                                                           Int16GetDatum(attnum),
1178                                                           0, 0);
1179         if (!HeapTupleIsValid(atp))
1180                 elog(ERROR, "getattproperties: no attribute tuple %u %d",
1181                          relid, (int) attnum);
1182         att_tup = (Form_pg_attribute) GETSTRUCT(atp);
1183
1184         *typid = att_tup->atttypid;
1185         *typlen = att_tup->attlen;
1186         *typbyval = att_tup->attbyval;
1187         *typmod = att_tup->atttypmod;
1188 }
1189
1190 /*
1191  * getattstatistics
1192  *        Retrieve the pg_statistic data for an attribute.
1193  *        Returns 'false' if no stats are available.
1194  *
1195  * Inputs:
1196  * 'relid' and 'attnum' are the relation and attribute number.
1197  * 'typid' and 'typmod' are the type and typmod of the column,
1198  * which the caller must already have looked up.
1199  *
1200  * Outputs:
1201  * The available stats are nullfrac, commonfrac, commonval, loval, hival.
1202  * The caller need not retrieve all five --- pass NULL pointers for the
1203  * unwanted values.
1204  *
1205  * commonval, loval, hival are returned as Datums holding the internal
1206  * representation of the values.  (Note that these should be pfree'd
1207  * after use if the data type is not by-value.)
1208  */
1209 static bool
1210 getattstatistics(Oid relid,
1211                                  AttrNumber attnum,
1212                                  Oid typid,
1213                                  int32 typmod,
1214                                  double *nullfrac,
1215                                  double *commonfrac,
1216                                  Datum *commonval,
1217                                  Datum *loval,
1218                                  Datum *hival)
1219 {
1220         HeapTuple       tuple;
1221         HeapTuple       typeTuple;
1222         FmgrInfo        inputproc;
1223         Oid                     typelem;
1224         bool            isnull;
1225
1226         /*
1227          * We assume that there will only be one entry in pg_statistic for the
1228          * given rel/att, so we search WITHOUT considering the staop column.
1229          * Someday, VACUUM might store more than one entry per rel/att,
1230          * corresponding to more than one possible sort ordering defined for
1231          * the column type.  However, to make that work we will need to figure
1232          * out which staop to search for --- it's not necessarily the one we
1233          * have at hand!  (For example, we might have a '>' operator rather
1234          * than the '<' operator that will appear in staop.)
1235          */
1236         tuple = SearchSysCacheTuple(STATRELID,
1237                                                                 ObjectIdGetDatum(relid),
1238                                                                 Int16GetDatum((int16) attnum),
1239                                                                 0,
1240                                                                 0);
1241         if (!HeapTupleIsValid(tuple))
1242         {
1243                 /* no such stats entry */
1244                 return false;
1245         }
1246
1247         if (nullfrac)
1248                 *nullfrac = ((Form_pg_statistic) GETSTRUCT(tuple))->stanullfrac;
1249         if (commonfrac)
1250                 *commonfrac = ((Form_pg_statistic) GETSTRUCT(tuple))->stacommonfrac;
1251
1252         /* Get the type input proc for the column datatype */
1253         typeTuple = SearchSysCacheTuple(TYPEOID,
1254                                                                         ObjectIdGetDatum(typid),
1255                                                                         0, 0, 0);
1256         if (!HeapTupleIsValid(typeTuple))
1257                 elog(ERROR, "getattstatistics: Cache lookup failed for type %u",
1258                          typid);
1259         fmgr_info(((Form_pg_type) GETSTRUCT(typeTuple))->typinput, &inputproc);
1260         typelem = ((Form_pg_type) GETSTRUCT(typeTuple))->typelem;
1261
1262         /*
1263          * Values are variable-length fields, so cannot access as struct
1264          * fields. Must do it the hard way with SysCacheGetAttr.
1265          */
1266         if (commonval)
1267         {
1268                 text       *val = (text *) SysCacheGetAttr(STATRELID, tuple,
1269                                                                                   Anum_pg_statistic_stacommonval,
1270                                                                                                    &isnull);
1271
1272                 if (isnull)
1273                 {
1274                         elog(DEBUG, "getattstatistics: stacommonval is null");
1275                         *commonval = PointerGetDatum(NULL);
1276                 }
1277                 else
1278                 {
1279                         char       *strval = textout(val);
1280
1281                         *commonval = (Datum)
1282                                 (*fmgr_faddr(&inputproc)) (strval, typelem, typmod);
1283                         pfree(strval);
1284                 }
1285         }
1286
1287         if (loval)
1288         {
1289                 text       *val = (text *) SysCacheGetAttr(STATRELID, tuple,
1290                                                                                           Anum_pg_statistic_staloval,
1291                                                                                                    &isnull);
1292
1293                 if (isnull)
1294                 {
1295                         elog(DEBUG, "getattstatistics: staloval is null");
1296                         *loval = PointerGetDatum(NULL);
1297                 }
1298                 else
1299                 {
1300                         char       *strval = textout(val);
1301
1302                         *loval = (Datum)
1303                                 (*fmgr_faddr(&inputproc)) (strval, typelem, typmod);
1304                         pfree(strval);
1305                 }
1306         }
1307
1308         if (hival)
1309         {
1310                 text       *val = (text *) SysCacheGetAttr(STATRELID, tuple,
1311                                                                                           Anum_pg_statistic_stahival,
1312                                                                                                    &isnull);
1313
1314                 if (isnull)
1315                 {
1316                         elog(DEBUG, "getattstatistics: stahival is null");
1317                         *hival = PointerGetDatum(NULL);
1318                 }
1319                 else
1320                 {
1321                         char       *strval = textout(val);
1322
1323                         *hival = (Datum)
1324                                 (*fmgr_faddr(&inputproc)) (strval, typelem, typmod);
1325                         pfree(strval);
1326                 }
1327         }
1328
1329         return true;
1330 }
1331
1332 /*-------------------------------------------------------------------------
1333  *
1334  * Pattern analysis functions
1335  *
1336  * These routines support analysis of LIKE and regular-expression patterns
1337  * by the planner/optimizer.  It's important that they agree with the
1338  * regular-expression code in backend/regex/ and the LIKE code in
1339  * backend/utils/adt/like.c.
1340  *
1341  * Note that the prefix-analysis functions are called from
1342  * backend/optimizer/path/indxpath.c as well as from routines in this file.
1343  *
1344  *-------------------------------------------------------------------------
1345  */
1346
1347 /*
1348  * Extract the fixed prefix, if any, for a pattern.
1349  * *prefix is set to a palloc'd prefix string,
1350  * or to NULL if no fixed prefix exists for the pattern.
1351  * *rest is set to point to the remainder of the pattern after the
1352  * portion describing the fixed prefix.
1353  * The return value distinguishes no fixed prefix, a partial prefix,
1354  * or an exact-match-only pattern.
1355  */
1356
1357 static Pattern_Prefix_Status
1358 like_fixed_prefix(char *patt, char **prefix, char **rest)
1359 {
1360         char       *match;
1361         int                     pos,
1362                                 match_pos;
1363
1364         *prefix = match = palloc(strlen(patt) + 1);
1365         match_pos = 0;
1366
1367         for (pos = 0; patt[pos]; pos++)
1368         {
1369                 /* % and _ are wildcard characters in LIKE */
1370                 if (patt[pos] == '%' ||
1371                         patt[pos] == '_')
1372                         break;
1373                 /* Backslash quotes the next character */
1374                 if (patt[pos] == '\\')
1375                 {
1376                         pos++;
1377                         if (patt[pos] == '\0')
1378                                 break;
1379                 }
1380
1381                 /*
1382                  * NOTE: this code used to think that %% meant a literal %, but
1383                  * textlike() itself does not think that, and the SQL92 spec
1384                  * doesn't say any such thing either.
1385                  */
1386                 match[match_pos++] = patt[pos];
1387         }
1388
1389         match[match_pos] = '\0';
1390         *rest = &patt[pos];
1391
1392         /* in LIKE, an empty pattern is an exact match! */
1393         if (patt[pos] == '\0')
1394                 return Pattern_Prefix_Exact;    /* reached end of pattern, so exact */
1395
1396         if (match_pos > 0)
1397                 return Pattern_Prefix_Partial;
1398
1399         pfree(match);
1400         *prefix = NULL;
1401         return Pattern_Prefix_None;
1402 }
1403
1404 static Pattern_Prefix_Status
1405 regex_fixed_prefix(char *patt, bool case_insensitive,
1406                                    char **prefix, char **rest)
1407 {
1408         char       *match;
1409         int                     pos,
1410                                 match_pos,
1411                                 paren_depth;
1412
1413         /* Pattern must be anchored left */
1414         if (patt[0] != '^')
1415         {
1416                 *prefix = NULL;
1417                 *rest = patt;
1418                 return Pattern_Prefix_None;
1419         }
1420
1421         /* If unquoted | is present at paren level 0 in pattern, then there
1422          * are multiple alternatives for the start of the string.
1423          */
1424         paren_depth = 0;
1425         for (pos = 1; patt[pos]; pos++)
1426         {
1427                 if (patt[pos] == '|' && paren_depth == 0)
1428                 {
1429                         *prefix = NULL;
1430                         *rest = patt;
1431                         return Pattern_Prefix_None;
1432                 }
1433                 else if (patt[pos] == '(')
1434                         paren_depth++;
1435                 else if (patt[pos] == ')' && paren_depth > 0)
1436                         paren_depth--;
1437                 else if (patt[pos] == '\\')
1438                 {
1439                         /* backslash quotes the next character */
1440                         pos++;
1441                         if (patt[pos] == '\0')
1442                                 break;
1443                 }
1444         }
1445
1446         /* OK, allocate space for pattern */
1447         *prefix = match = palloc(strlen(patt) + 1);
1448         match_pos = 0;
1449
1450         /* note start at pos 1 to skip leading ^ */
1451         for (pos = 1; patt[pos]; pos++)
1452         {
1453                 /*
1454                  * Check for characters that indicate multiple possible matches here.
1455                  * XXX I suspect isalpha() is not an adequately locale-sensitive
1456                  * test for characters that can vary under case folding?
1457                  */
1458                 if (patt[pos] == '.' ||
1459                         patt[pos] == '(' ||
1460                         patt[pos] == '[' ||
1461                         patt[pos] == '$' ||
1462                         (case_insensitive && isalpha(patt[pos])))
1463                         break;
1464                 /*
1465                  * Check for quantifiers.  Except for +, this means the preceding
1466                  * character is optional, so we must remove it from the prefix too!
1467                  */
1468                 if (patt[pos] == '*' ||
1469                         patt[pos] == '?' ||
1470                         patt[pos] == '{')
1471                 {
1472                         if (match_pos > 0)
1473                                 match_pos--;
1474                         pos--;
1475                         break;
1476                 }
1477                 if (patt[pos] == '+')
1478                 {
1479                         pos--;
1480                         break;
1481                 }
1482                 if (patt[pos] == '\\')
1483                 {
1484                         /* backslash quotes the next character */
1485                         pos++;
1486                         if (patt[pos] == '\0')
1487                                 break;
1488                 }
1489                 match[match_pos++] = patt[pos];
1490         }
1491
1492         match[match_pos] = '\0';
1493         *rest = &patt[pos];
1494
1495         if (patt[pos] == '$' && patt[pos + 1] == '\0')
1496         {
1497                 *rest = &patt[pos + 1];
1498                 return Pattern_Prefix_Exact;    /* pattern specifies exact match */
1499         }
1500
1501         if (match_pos > 0)
1502                 return Pattern_Prefix_Partial;
1503
1504         pfree(match);
1505         *prefix = NULL;
1506         return Pattern_Prefix_None;
1507 }
1508
1509 Pattern_Prefix_Status
1510 pattern_fixed_prefix(char *patt, Pattern_Type ptype,
1511                                          char **prefix, char **rest)
1512 {
1513         Pattern_Prefix_Status result;
1514
1515         switch (ptype)
1516         {
1517                 case Pattern_Type_Like:
1518                         result = like_fixed_prefix(patt, prefix, rest);
1519                         break;
1520                 case Pattern_Type_Regex:
1521                         result = regex_fixed_prefix(patt, false, prefix, rest);
1522                         break;
1523                 case Pattern_Type_Regex_IC:
1524                         result = regex_fixed_prefix(patt, true, prefix, rest);
1525                         break;
1526                 default:
1527                         elog(ERROR, "pattern_fixed_prefix: bogus ptype");
1528                         result = Pattern_Prefix_None; /* keep compiler quiet */
1529                         break;
1530         }
1531         return result;
1532 }
1533
1534 /*
1535  * Estimate the selectivity of a fixed prefix for a pattern match.
1536  *
1537  * A fixed prefix "foo" is estimated as the selectivity of the expression
1538  * "var >= 'foo' AND var < 'fop'" (see also indxqual.c).
1539  */
1540 static Selectivity
1541 prefix_selectivity(char *prefix,
1542                                    Oid relid,
1543                                    AttrNumber attno,
1544                                    Oid datatype)
1545 {
1546         Selectivity     prefixsel;
1547         Oid                     cmpopr;
1548         Datum           prefixcon;
1549         char       *greaterstr;
1550
1551         cmpopr = find_operator(">=", datatype);
1552         if (cmpopr == InvalidOid)
1553                 elog(ERROR, "prefix_selectivity: no >= operator for type %u",
1554                          datatype);
1555         prefixcon = string_to_datum(prefix, datatype);
1556         /* Assume scalargtsel is appropriate for all supported types */
1557         prefixsel = * scalargtsel(cmpopr, relid, attno,
1558                                                           prefixcon, SEL_CONSTANT|SEL_RIGHT);
1559         pfree(DatumGetPointer(prefixcon));
1560
1561         /*
1562          * If we can create a string larger than the prefix,
1563          * say "x < greaterstr".
1564          */
1565         greaterstr = make_greater_string(prefix, datatype);
1566         if (greaterstr)
1567         {
1568                 Selectivity             topsel;
1569
1570                 cmpopr = find_operator("<", datatype);
1571                 if (cmpopr == InvalidOid)
1572                         elog(ERROR, "prefix_selectivity: no < operator for type %u",
1573                                  datatype);
1574                 prefixcon = string_to_datum(greaterstr, datatype);
1575                 /* Assume scalarltsel is appropriate for all supported types */
1576                 topsel = * scalarltsel(cmpopr, relid, attno,
1577                                                            prefixcon, SEL_CONSTANT|SEL_RIGHT);
1578                 pfree(DatumGetPointer(prefixcon));
1579                 pfree(greaterstr);
1580
1581                 /*
1582                  * Merge the two selectivities in the same way as for
1583                  * a range query (see clauselist_selectivity()).
1584                  */
1585                 prefixsel = topsel + prefixsel - 1.0;
1586
1587                 /*
1588                  * A zero or slightly negative prefixsel should be converted into a
1589                  * small positive value; we probably are dealing with a very
1590                  * tight range and got a bogus result due to roundoff errors.
1591                  * However, if prefixsel is very negative, then we probably have
1592                  * default selectivity estimates on one or both sides of the
1593                  * range.  In that case, insert a not-so-wildly-optimistic
1594                  * default estimate.
1595                  */
1596                 if (prefixsel <= 0.0)
1597                 {
1598                         if (prefixsel < -0.01)
1599                         {
1600
1601                                 /*
1602                                  * No data available --- use a default estimate that
1603                                  * is small, but not real small.
1604                                  */
1605                                 prefixsel = 0.01;
1606                         }
1607                         else
1608                         {
1609
1610                                 /*
1611                                  * It's just roundoff error; use a small positive value
1612                                  */
1613                                 prefixsel = 1.0e-10;
1614                         }
1615                 }
1616         }
1617
1618         return prefixsel;
1619 }
1620
1621
1622 /*
1623  * Estimate the selectivity of a pattern of the specified type.
1624  * Note that any fixed prefix of the pattern will have been removed already.
1625  *
1626  * For now, we use a very simplistic approach: fixed characters reduce the
1627  * selectivity a good deal, character ranges reduce it a little,
1628  * wildcards (such as % for LIKE or .* for regex) increase it.
1629  */
1630
1631 #define FIXED_CHAR_SEL  0.04    /* about 1/25 */
1632 #define CHAR_RANGE_SEL  0.25
1633 #define ANY_CHAR_SEL    0.9             /* not 1, since it won't match end-of-string */
1634 #define FULL_WILDCARD_SEL 5.0
1635 #define PARTIAL_WILDCARD_SEL 2.0
1636
1637 static Selectivity
1638 like_selectivity(char *patt)
1639 {
1640         Selectivity             sel = 1.0;
1641         int                             pos;
1642
1643         /* Skip any leading %; it's already factored into initial sel */
1644         pos = (*patt == '%') ? 1 : 0;
1645         for (; patt[pos]; pos++)
1646         {
1647                 /* % and _ are wildcard characters in LIKE */
1648                 if (patt[pos] == '%')
1649                         sel *= FULL_WILDCARD_SEL;
1650                 else if (patt[pos] == '_')
1651                         sel *= ANY_CHAR_SEL;
1652                 else if (patt[pos] == '\\')
1653                 {
1654                         /* Backslash quotes the next character */
1655                         pos++;
1656                         if (patt[pos] == '\0')
1657                                 break;
1658                         sel *= FIXED_CHAR_SEL;
1659                 }
1660                 else
1661                         sel *= FIXED_CHAR_SEL;
1662         }
1663         /* Could get sel > 1 if multiple wildcards */
1664         if (sel > 1.0)
1665                 sel = 1.0;
1666         return sel;
1667 }
1668
1669 static Selectivity
1670 regex_selectivity_sub(char *patt, int pattlen, bool case_insensitive)
1671 {
1672         Selectivity             sel = 1.0;
1673         int                             paren_depth = 0;
1674         int                             paren_pos = 0; /* dummy init to keep compiler quiet */
1675         int                             pos;
1676
1677         for (pos = 0; pos < pattlen; pos++)
1678         {
1679                 if (patt[pos] == '(')
1680                 {
1681                         if (paren_depth == 0)
1682                                 paren_pos = pos; /* remember start of parenthesized item */
1683                         paren_depth++;
1684                 }
1685                 else if (patt[pos] == ')' && paren_depth > 0)
1686                 {
1687                         paren_depth--;
1688                         if (paren_depth == 0)
1689                                 sel *= regex_selectivity_sub(patt + (paren_pos + 1),
1690                                                                                          pos - (paren_pos + 1),
1691                                                                                          case_insensitive);
1692                 }
1693                 else if (patt[pos] == '|' && paren_depth == 0)
1694                 {
1695                         /*
1696                          * If unquoted | is present at paren level 0 in pattern,
1697                          * we have multiple alternatives; sum their probabilities.
1698                          */
1699                         sel += regex_selectivity_sub(patt + (pos + 1),
1700                                                                                  pattlen - (pos + 1),
1701                                                                                  case_insensitive);
1702                         break;                          /* rest of pattern is now processed */
1703                 }
1704                 else if (patt[pos] == '[')
1705                 {
1706                         bool    negclass = false;
1707
1708                         if (patt[++pos] == '^')
1709                         {
1710                                 negclass = true;
1711                                 pos++;
1712                         }
1713                         if (patt[pos] == ']') /* ']' at start of class is not special */
1714                                 pos++;
1715                         while (pos < pattlen && patt[pos] != ']')
1716                                 pos++;
1717                         if (paren_depth == 0)
1718                                 sel *= (negclass ? (1.0-CHAR_RANGE_SEL) : CHAR_RANGE_SEL);
1719                 }
1720                 else if (patt[pos] == '.')
1721                 {
1722                         if (paren_depth == 0)
1723                                 sel *= ANY_CHAR_SEL;
1724                 }
1725                 else if (patt[pos] == '*' ||
1726                                  patt[pos] == '?' ||
1727                                  patt[pos] == '+')
1728                 {
1729                         /* Ought to be smarter about quantifiers... */
1730                         if (paren_depth == 0)
1731                                 sel *= PARTIAL_WILDCARD_SEL;
1732                 }
1733                 else if (patt[pos] == '{')
1734                 {
1735                         while (pos < pattlen && patt[pos] != '}')
1736                                 pos++;
1737                         if (paren_depth == 0)
1738                                 sel *= PARTIAL_WILDCARD_SEL;
1739                 }
1740                 else if (patt[pos] == '\\')
1741                 {
1742                         /* backslash quotes the next character */
1743                         pos++;
1744                         if (pos >= pattlen)
1745                                 break;
1746                         if (paren_depth == 0)
1747                                 sel *= FIXED_CHAR_SEL;
1748                 }
1749                 else
1750                 {
1751                         if (paren_depth == 0)
1752                                 sel *= FIXED_CHAR_SEL;
1753                 }
1754         }
1755         /* Could get sel > 1 if multiple wildcards */
1756         if (sel > 1.0)
1757                 sel = 1.0;
1758         return sel;
1759 }
1760
1761 static Selectivity
1762 regex_selectivity(char *patt, bool case_insensitive)
1763 {
1764         Selectivity             sel;
1765         int                             pattlen = strlen(patt);
1766
1767         /* If patt doesn't end with $, consider it to have a trailing wildcard */
1768         if (pattlen > 0 && patt[pattlen-1] == '$' &&
1769                 (pattlen == 1 || patt[pattlen-2] != '\\'))
1770         {
1771                 /* has trailing $ */
1772                 sel = regex_selectivity_sub(patt, pattlen-1, case_insensitive);
1773         }
1774         else
1775         {
1776                 /* no trailing $ */
1777                 sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
1778                 sel *= FULL_WILDCARD_SEL;
1779                 if (sel > 1.0)
1780                         sel = 1.0;
1781         }
1782         return sel;
1783 }
1784
1785 static Selectivity
1786 pattern_selectivity(char *patt, Pattern_Type ptype)
1787 {
1788         Selectivity result;
1789
1790         switch (ptype)
1791         {
1792                 case Pattern_Type_Like:
1793                         result = like_selectivity(patt);
1794                         break;
1795                 case Pattern_Type_Regex:
1796                         result = regex_selectivity(patt, false);
1797                         break;
1798                 case Pattern_Type_Regex_IC:
1799                         result = regex_selectivity(patt, true);
1800                         break;
1801                 default:
1802                         elog(ERROR, "pattern_selectivity: bogus ptype");
1803                         result = 1.0;           /* keep compiler quiet */
1804                         break;
1805         }
1806         return result;
1807 }
1808
1809
1810 /*
1811  * Try to generate a string greater than the given string or any string it is
1812  * a prefix of.  If successful, return a palloc'd string; else return NULL.
1813  *
1814  * To work correctly in non-ASCII locales with weird collation orders,
1815  * we cannot simply increment "foo" to "fop" --- we have to check whether
1816  * we actually produced a string greater than the given one.  If not,
1817  * increment the righthand byte again and repeat.  If we max out the righthand
1818  * byte, truncate off the last character and start incrementing the next.
1819  * For example, if "z" were the last character in the sort order, then we
1820  * could produce "foo" as a string greater than "fonz".
1821  *
1822  * This could be rather slow in the worst case, but in most cases we won't
1823  * have to try more than one or two strings before succeeding.
1824  *
1825  * XXX in a sufficiently weird locale, this might produce incorrect results?
1826  * For example, in German I believe "ss" is treated specially --- if we are
1827  * given "foos" and return "foot", will this actually be greater than "fooss"?
1828  */
1829 char *
1830 make_greater_string(const char *str, Oid datatype)
1831 {
1832         char       *workstr;
1833         int                     len;
1834
1835         /*
1836          * Make a modifiable copy, which will be our return value if
1837          * successful
1838          */
1839         workstr = pstrdup((char *) str);
1840
1841         while ((len = strlen(workstr)) > 0)
1842         {
1843                 unsigned char *lastchar = (unsigned char *) (workstr + len - 1);
1844
1845                 /*
1846                  * Try to generate a larger string by incrementing the last byte.
1847                  */
1848                 while (*lastchar < (unsigned char) 255)
1849                 {
1850                         (*lastchar)++;
1851                         if (string_lessthan(str, workstr, datatype))
1852                                 return workstr; /* Success! */
1853                 }
1854
1855                 /*
1856                  * Truncate off the last character, which might be more than 1
1857                  * byte in MULTIBYTE case.
1858                  */
1859 #ifdef MULTIBYTE
1860                 len = pg_mbcliplen((const unsigned char *) workstr, len, len - 1);
1861                 workstr[len] = '\0';
1862 #else
1863                 *lastchar = '\0';
1864 #endif
1865         }
1866
1867         /* Failed... */
1868         pfree(workstr);
1869         return NULL;
1870 }
1871
1872 /*
1873  * Test whether two strings are "<" according to the rules of the given
1874  * datatype.  We do this the hard way, ie, actually calling the type's
1875  * "<" operator function, to ensure we get the right result...
1876  */
1877 static bool
1878 string_lessthan(const char *str1, const char *str2, Oid datatype)
1879 {
1880         Datum           datum1 = string_to_datum(str1, datatype);
1881         Datum           datum2 = string_to_datum(str2, datatype);
1882         bool            result;
1883
1884         switch (datatype)
1885         {
1886                 case TEXTOID:
1887                         result = text_lt((text *) datum1, (text *) datum2);
1888                         break;
1889
1890                 case BPCHAROID:
1891                         result = bpcharlt((char *) datum1, (char *) datum2);
1892                         break;
1893
1894                 case VARCHAROID:
1895                         result = varcharlt((char *) datum1, (char *) datum2);
1896                         break;
1897
1898                 case NAMEOID:
1899                         result = namelt((NameData *) datum1, (NameData *) datum2);
1900                         break;
1901
1902                 default:
1903                         elog(ERROR, "string_lessthan: unexpected datatype %u", datatype);
1904                         result = false;
1905                         break;
1906         }
1907
1908         pfree(DatumGetPointer(datum1));
1909         pfree(DatumGetPointer(datum2));
1910
1911         return result;
1912 }
1913
1914 /* See if there is a binary op of the given name for the given datatype */
1915 static Oid
1916 find_operator(const char *opname, Oid datatype)
1917 {
1918         HeapTuple       optup;
1919
1920         optup = SearchSysCacheTuple(OPERNAME,
1921                                                                 PointerGetDatum(opname),
1922                                                                 ObjectIdGetDatum(datatype),
1923                                                                 ObjectIdGetDatum(datatype),
1924                                                                 CharGetDatum('b'));
1925         if (!HeapTupleIsValid(optup))
1926                 return InvalidOid;
1927         return optup->t_data->t_oid;
1928 }
1929
1930 /*
1931  * Generate a Datum of the appropriate type from a C string.
1932  * Note that all of the supported types are pass-by-ref, so the
1933  * returned value should be pfree'd if no longer needed.
1934  */
1935 static Datum
1936 string_to_datum(const char *str, Oid datatype)
1937 {
1938
1939         /*
1940          * We cheat a little by assuming that textin() will do for bpchar and
1941          * varchar constants too...
1942          */
1943         if (datatype == NAMEOID)
1944                 return PointerGetDatum(namein((char *) str));
1945         else
1946                 return PointerGetDatum(textin((char *) str));
1947 }
1948
1949 /*-------------------------------------------------------------------------
1950  *
1951  * Index cost estimation functions
1952  *
1953  * genericcostestimate is a general-purpose estimator for use when we
1954  * don't have any better idea about how to estimate.  Index-type-specific
1955  * knowledge can be incorporated in the type-specific routines.
1956  *
1957  *-------------------------------------------------------------------------
1958  */
1959
1960 static void
1961 genericcostestimate(Query *root, RelOptInfo *rel,
1962                                         IndexOptInfo *index, List *indexQuals,
1963                                         Cost *indexStartupCost,
1964                                         Cost *indexTotalCost,
1965                                         Selectivity *indexSelectivity)
1966 {
1967         double          numIndexTuples;
1968         double          numIndexPages;
1969
1970         /* Estimate the fraction of main-table tuples that will be visited */
1971         *indexSelectivity = clauselist_selectivity(root, indexQuals,
1972                                                                                            lfirsti(rel->relids));
1973
1974         /* Estimate the number of index tuples that will be visited */
1975         numIndexTuples = *indexSelectivity * index->tuples;
1976
1977         /* Estimate the number of index pages that will be retrieved */
1978         numIndexPages = *indexSelectivity * index->pages;
1979
1980         /*
1981          * Always estimate at least one tuple and page are touched, even when
1982          * indexSelectivity estimate is tiny.
1983          */
1984         if (numIndexTuples < 1.0)
1985                 numIndexTuples = 1.0;
1986         if (numIndexPages < 1.0)
1987                 numIndexPages = 1.0;
1988
1989         /*
1990          * Compute the index access cost.
1991          *
1992          * Our generic assumption is that the index pages will be read
1993          * sequentially, so they have cost 1.0 each, not random_page_cost.
1994          * Also, we charge for evaluation of the indexquals at each index
1995          * tuple. All the costs are assumed to be paid incrementally during
1996          * the scan.
1997          */
1998         *indexStartupCost = 0;
1999         *indexTotalCost = numIndexPages +
2000                 (cpu_index_tuple_cost + cost_qual_eval(indexQuals)) * numIndexTuples;
2001 }
2002
2003 /*
2004  * For first cut, just use generic function for all index types.
2005  */
2006
2007 void
2008 btcostestimate(Query *root, RelOptInfo *rel,
2009                            IndexOptInfo *index, List *indexQuals,
2010                            Cost *indexStartupCost,
2011                            Cost *indexTotalCost,
2012                            Selectivity *indexSelectivity)
2013 {
2014         genericcostestimate(root, rel, index, indexQuals,
2015                                          indexStartupCost, indexTotalCost, indexSelectivity);
2016 }
2017
2018 void
2019 rtcostestimate(Query *root, RelOptInfo *rel,
2020                            IndexOptInfo *index, List *indexQuals,
2021                            Cost *indexStartupCost,
2022                            Cost *indexTotalCost,
2023                            Selectivity *indexSelectivity)
2024 {
2025         genericcostestimate(root, rel, index, indexQuals,
2026                                          indexStartupCost, indexTotalCost, indexSelectivity);
2027 }
2028
2029 void
2030 hashcostestimate(Query *root, RelOptInfo *rel,
2031                                  IndexOptInfo *index, List *indexQuals,
2032                                  Cost *indexStartupCost,
2033                                  Cost *indexTotalCost,
2034                                  Selectivity *indexSelectivity)
2035 {
2036         genericcostestimate(root, rel, index, indexQuals,
2037                                          indexStartupCost, indexTotalCost, indexSelectivity);
2038 }
2039
2040 void
2041 gistcostestimate(Query *root, RelOptInfo *rel,
2042                                  IndexOptInfo *index, List *indexQuals,
2043                                  Cost *indexStartupCost,
2044                                  Cost *indexTotalCost,
2045                                  Selectivity *indexSelectivity)
2046 {
2047         genericcostestimate(root, rel, index, indexQuals,
2048                                          indexStartupCost, indexTotalCost, indexSelectivity);
2049 }