]> granicus.if.org Git - postgresql/blob - src/backend/utils/adt/selfuncs.c
Reduce eqsel()'s fudge-factor for estimating the frequency of values
[postgresql] / src / backend / utils / adt / selfuncs.c
1 /*-------------------------------------------------------------------------
2  *
3  * selfuncs.c
4  *        Selectivity functions and index cost estimation functions for
5  *        standard operators and index access methods.
6  *
7  *        Selectivity routines are registered in the pg_operator catalog
8  *        in the "oprrest" and "oprjoin" attributes.
9  *
10  *        Index cost functions are registered in the pg_am catalog
11  *        in the "amcostestimate" attribute.
12  *
13  * Portions Copyright (c) 1996-2000, PostgreSQL, Inc
14  * Portions Copyright (c) 1994, Regents of the University of California
15  *
16  *
17  * IDENTIFICATION
18  *        $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.66 2000/05/26 17:19:15 tgl Exp $
19  *
20  *-------------------------------------------------------------------------
21  */
22
23 #include "postgres.h"
24
25 #include <ctype.h>
26 #include <math.h>
27
28 #include "access/heapam.h"
29 #include "catalog/catname.h"
30 #include "catalog/pg_operator.h"
31 #include "catalog/pg_proc.h"
32 #include "catalog/pg_statistic.h"
33 #include "catalog/pg_type.h"
34 #include "mb/pg_wchar.h"
35 #include "optimizer/cost.h"
36 #include "parser/parse_func.h"
37 #include "parser/parse_oper.h"
38 #include "utils/builtins.h"
39 #include "utils/int8.h"
40 #include "utils/lsyscache.h"
41 #include "utils/syscache.h"
42
43 /* N is not a valid var/constant or relation id */
44 #define NONVALUE(N)             ((N) == 0)
45
46 /* are we looking at a functional index selectivity request? */
47 #define FunctionalSelectivity(nIndKeys,attNum) ((attNum)==InvalidAttrNumber)
48
49 /* default selectivity estimate for equalities such as "A = b" */
50 #define DEFAULT_EQ_SEL  0.01
51
52 /* default selectivity estimate for inequalities such as "A < b" */
53 #define DEFAULT_INEQ_SEL  (1.0 / 3.0)
54
55 /* default selectivity estimate for pattern-match operators such as LIKE */
56 #define DEFAULT_MATCH_SEL       0.01
57
58 /* "fudge factor" for estimating frequency of not-most-common values */
59 #define NOT_MOST_COMMON_RATIO  0.1
60
61 static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
62                                                           Datum lobound, Datum hibound, Oid boundstypid,
63                                                           double *scaledlobound, double *scaledhibound);
64 static double convert_numeric_to_scalar(Datum value, Oid typid);
65 static void convert_string_to_scalar(unsigned char *value,
66                                                                          double *scaledvalue,
67                                                                          unsigned char *lobound,
68                                                                          double *scaledlobound,
69                                                                          unsigned char *hibound,
70                                                                          double *scaledhibound);
71 static double convert_one_string_to_scalar(unsigned char *value,
72                                                                                    int rangelo, int rangehi);
73 static unsigned char * convert_string_datum(Datum value, Oid typid);
74 static double convert_timevalue_to_scalar(Datum value, Oid typid);
75 static void getattproperties(Oid relid, AttrNumber attnum,
76                                  Oid *typid,
77                                  int *typlen,
78                                  bool *typbyval,
79                                  int32 *typmod);
80 static bool getattstatistics(Oid relid, AttrNumber attnum,
81                                  Oid typid, int32 typmod,
82                                  double *nullfrac,
83                                  double *commonfrac,
84                                  Datum *commonval,
85                                  Datum *loval,
86                                  Datum *hival);
87 static Selectivity prefix_selectivity(char *prefix,
88                                                                           Oid relid,
89                                                                           AttrNumber attno,
90                                                                           Oid datatype);
91 static Selectivity pattern_selectivity(char *patt, Pattern_Type ptype);
92 static bool string_lessthan(const char *str1, const char *str2,
93                                 Oid datatype);
94 static Oid      find_operator(const char *opname, Oid datatype);
95 static Datum string_to_datum(const char *str, Oid datatype);
96
97
98 /*
99  *              eqsel                   - Selectivity of "=" for any data types.
100  *
101  * Note: this routine is also used to estimate selectivity for some
102  * operators that are not "=" but have comparable selectivity behavior,
103  * such as "~=" (geometric approximate-match).  Even for "=", we must
104  * keep in mind that the left and right datatypes may differ, so the type
105  * of the given constant "value" may be different from the type of the
106  * attribute.
107  */
108 float64
109 eqsel(Oid opid,
110           Oid relid,
111           AttrNumber attno,
112           Datum value,
113           int32 flag)
114 {
115         float64         result;
116
117         result = (float64) palloc(sizeof(float64data));
118         if (NONVALUE(attno) || NONVALUE(relid))
119                 *result = DEFAULT_EQ_SEL;
120         else
121         {
122                 Oid                     typid;
123                 int                     typlen;
124                 bool            typbyval;
125                 int32           typmod;
126                 double          nullfrac;
127                 double          commonfrac;
128                 Datum           commonval;
129                 double          selec;
130
131                 /* get info about the attribute */
132                 getattproperties(relid, attno,
133                                                  &typid, &typlen, &typbyval, &typmod);
134
135                 /* get stats for the attribute, if available */
136                 if (getattstatistics(relid, attno, typid, typmod,
137                                                          &nullfrac, &commonfrac, &commonval,
138                                                          NULL, NULL))
139                 {
140                         if (flag & SEL_CONSTANT)
141                         {
142
143                                 /*
144                                  * Is the constant "=" to the column's most common value?
145                                  * (Although the operator may not really be "=", we will
146                                  * assume that seeing whether it returns TRUE for the most
147                                  * common value is useful information. If you don't like
148                                  * it, maybe you shouldn't be using eqsel for your
149                                  * operator...)
150                                  */
151                                 RegProcedure eqproc = get_opcode(opid);
152                                 bool            mostcommon;
153
154                                 if (eqproc == (RegProcedure) NULL)
155                                         elog(ERROR, "eqsel: no procedure for operator %u",
156                                                  opid);
157
158                                 /* be careful to apply operator right way 'round */
159                                 if (flag & SEL_RIGHT)
160                                         mostcommon = (bool)
161                                                 DatumGetUInt8(fmgr(eqproc, commonval, value));
162                                 else
163                                         mostcommon = (bool)
164                                                 DatumGetUInt8(fmgr(eqproc, value, commonval));
165
166                                 if (mostcommon)
167                                 {
168
169                                         /*
170                                          * Constant is "=" to the most common value.  We know
171                                          * selectivity exactly (or as exactly as VACUUM could
172                                          * calculate it, anyway).
173                                          */
174                                         selec = commonfrac;
175                                 }
176                                 else
177                                 {
178
179                                         /*
180                                          * Comparison is against a constant that is neither
181                                          * the most common value nor null.      Its selectivity
182                                          * cannot be more than this:
183                                          */
184                                         selec = 1.0 - commonfrac - nullfrac;
185                                         if (selec > commonfrac)
186                                                 selec = commonfrac;
187
188                                         /*
189                                          * and in fact it's probably less, so we should apply
190                                          * a fudge factor.      The only case where we don't is
191                                          * for a boolean column, where indeed we have
192                                          * estimated the less-common value's frequency
193                                          * exactly!
194                                          */
195                                         if (typid != BOOLOID)
196                                                 selec *= NOT_MOST_COMMON_RATIO;
197                                 }
198                         }
199                         else
200                         {
201
202                                 /*
203                                  * Search is for a value that we do not know a priori, but
204                                  * we will assume it is not NULL.  Selectivity cannot be
205                                  * more than this:
206                                  */
207                                 selec = 1.0 - nullfrac;
208                                 if (selec > commonfrac)
209                                         selec = commonfrac;
210
211                                 /*
212                                  * and in fact it's probably less, so apply a fudge
213                                  * factor.
214                                  */
215                                 selec *= NOT_MOST_COMMON_RATIO;
216                         }
217
218                         /* result should be in range, but make sure... */
219                         if (selec < 0.0)
220                                 selec = 0.0;
221                         else if (selec > 1.0)
222                                 selec = 1.0;
223
224                         if (!typbyval)
225                                 pfree(DatumGetPointer(commonval));
226                 }
227                 else
228                 {
229
230                         /*
231                          * No VACUUM ANALYZE stats available, so make a guess using
232                          * the disbursion stat (if we have that, which is unlikely for
233                          * a normal attribute; but for a system attribute we may be
234                          * able to estimate it).
235                          */
236                         selec = get_attdisbursion(relid, attno, 0.01);
237                 }
238
239                 *result = (float64data) selec;
240         }
241         return result;
242 }
243
244 /*
245  *              neqsel                  - Selectivity of "!=" for any data types.
246  *
247  * This routine is also used for some operators that are not "!="
248  * but have comparable selectivity behavior.  See above comments
249  * for eqsel().
250  */
251 float64
252 neqsel(Oid opid,
253            Oid relid,
254            AttrNumber attno,
255            Datum value,
256            int32 flag)
257 {
258         float64         result;
259
260         result = eqsel(opid, relid, attno, value, flag);
261         *result = 1.0 - *result;
262         return result;
263 }
264
265 /*
266  *              scalarltsel             - Selectivity of "<" (also "<=") for scalars.
267  *
268  * This routine works for any datatype (or pair of datatypes) known to
269  * convert_to_scalar().  If it is applied to some other datatype,
270  * it will return a default estimate.
271  */
272 float64
273 scalarltsel(Oid opid,
274                         Oid relid,
275                         AttrNumber attno,
276                         Datum value,
277                         int32 flag)
278 {
279         float64         result;
280
281         result = (float64) palloc(sizeof(float64data));
282         if (!(flag & SEL_CONSTANT) || NONVALUE(attno) || NONVALUE(relid))
283                 *result = DEFAULT_INEQ_SEL;
284         else
285         {
286                 HeapTuple       oprtuple;
287                 Oid                     ltype,
288                                         rtype,
289                                         contype;
290                 Oid                     typid;
291                 int                     typlen;
292                 bool            typbyval;
293                 int32           typmod;
294                 Datum           hival,
295                                         loval;
296                 double          val,
297                                         high,
298                                         low,
299                                         numerator,
300                                         denominator;
301
302                 /*
303                  * Get left and right datatypes of the operator so we know what
304                  * type the constant is.
305                  */
306                 oprtuple = get_operator_tuple(opid);
307                 if (!HeapTupleIsValid(oprtuple))
308                         elog(ERROR, "scalarltsel: no tuple for operator %u", opid);
309                 ltype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprleft;
310                 rtype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprright;
311                 contype = (flag & SEL_RIGHT) ? rtype : ltype;
312
313                 /* Now get info and stats about the attribute */
314                 getattproperties(relid, attno,
315                                                  &typid, &typlen, &typbyval, &typmod);
316
317                 if (!getattstatistics(relid, attno, typid, typmod,
318                                                           NULL, NULL, NULL,
319                                                           &loval, &hival))
320                 {
321                         /* no stats available, so default result */
322                         *result = DEFAULT_INEQ_SEL;
323                         return result;
324                 }
325
326                 /* Convert the values to a uniform comparison scale. */
327                 if (!convert_to_scalar(value, contype, &val,
328                                                            loval, hival, typid,
329                                                            &low, &high))
330                 {
331
332                         /*
333                          * Ideally we'd produce an error here, on the grounds that the
334                          * given operator shouldn't have scalarltsel registered as its
335                          * selectivity func unless we can deal with its operand types.
336                          * But currently, all manner of stuff is invoking scalarltsel,
337                          * so give a default estimate until that can be fixed.
338                          */
339                         if (!typbyval)
340                         {
341                                 pfree(DatumGetPointer(hival));
342                                 pfree(DatumGetPointer(loval));
343                         }
344                         *result = DEFAULT_INEQ_SEL;
345                         return result;
346                 }
347
348                 /* release temp storage if needed */
349                 if (!typbyval)
350                 {
351                         pfree(DatumGetPointer(hival));
352                         pfree(DatumGetPointer(loval));
353                 }
354
355                 if (high <= low)
356                 {
357
358                         /*
359                          * If we trusted the stats fully, we could return a small or
360                          * large selec depending on which side of the single data
361                          * point the constant is on.  But it seems better to assume
362                          * that the stats are wrong and return a default...
363                          */
364                         *result = DEFAULT_INEQ_SEL;
365                 }
366                 else if (val < low || val > high)
367                 {
368
369                         /*
370                          * If given value is outside the statistical range, return a
371                          * small or large value; but not 0.0/1.0 since there is a
372                          * chance the stats are out of date.
373                          */
374                         if (flag & SEL_RIGHT)
375                                 *result = (val < low) ? 0.001 : 0.999;
376                         else
377                                 *result = (val < low) ? 0.999 : 0.001;
378                 }
379                 else
380                 {
381                         denominator = high - low;
382                         if (flag & SEL_RIGHT)
383                                 numerator = val - low;
384                         else
385                                 numerator = high - val;
386                         *result = numerator / denominator;
387                 }
388         }
389         return result;
390 }
391
392 /*
393  *              scalargtsel             - Selectivity of ">" (also ">=") for integers.
394  *
395  * See above comments for scalarltsel.
396  */
397 float64
398 scalargtsel(Oid opid,
399                         Oid relid,
400                         AttrNumber attno,
401                         Datum value,
402                         int32 flag)
403 {
404         float64         result;
405
406         /*
407          * Compute selectivity of "<", then invert --- but only if we were
408          * able to produce a non-default estimate.
409          */
410         result = scalarltsel(opid, relid, attno, value, flag);
411         if (*result != DEFAULT_INEQ_SEL)
412                 *result = 1.0 - *result;
413         return result;
414 }
415
416 /*
417  * patternsel                   - Generic code for pattern-match selectivity.
418  */
419 static float64
420 patternsel(Oid opid,
421                    Pattern_Type ptype,
422                    Oid relid,
423                    AttrNumber attno,
424                    Datum value,
425                    int32 flag)
426 {
427         float64         result;
428
429         result = (float64) palloc(sizeof(float64data));
430         /* Must have a constant for the pattern, or cannot learn anything */
431         if ((flag & (SEL_CONSTANT | SEL_RIGHT)) != (SEL_CONSTANT | SEL_RIGHT))
432                 *result = DEFAULT_MATCH_SEL;
433         else
434         {
435                 HeapTuple       oprtuple;
436                 Oid                     ltype,
437                                         rtype;
438                 char       *patt;
439                 Pattern_Prefix_Status pstatus;
440                 char       *prefix;
441                 char       *rest;
442
443                 /*
444                  * Get left and right datatypes of the operator so we know what
445                  * type the attribute is.
446                  */
447                 oprtuple = get_operator_tuple(opid);
448                 if (!HeapTupleIsValid(oprtuple))
449                         elog(ERROR, "patternsel: no tuple for operator %u", opid);
450                 ltype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprleft;
451                 rtype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprright;
452
453                 /* the right-hand const is type text for all supported operators */
454                 Assert(rtype == TEXTOID);
455                 patt = textout((text *) DatumGetPointer(value));
456
457                 /* divide pattern into fixed prefix and remainder */
458                 pstatus = pattern_fixed_prefix(patt, ptype, &prefix, &rest);
459
460                 if (pstatus == Pattern_Prefix_Exact)
461                 {
462                         /* Pattern specifies an exact match, so pretend operator is '=' */
463                         Oid             eqopr = find_operator("=", ltype);
464                         Datum   eqcon;
465
466                         if (eqopr == InvalidOid)
467                                 elog(ERROR, "patternsel: no = operator for type %u", ltype);
468                         eqcon = string_to_datum(prefix, ltype);
469                         result = eqsel(eqopr, relid, attno, eqcon, SEL_CONSTANT|SEL_RIGHT);
470                         pfree(DatumGetPointer(eqcon));
471                 }
472                 else
473                 {
474                         /*
475                          * Not exact-match pattern.  We estimate selectivity of the
476                          * fixed prefix and remainder of pattern separately, then
477                          * combine the two.
478                          */
479                         Selectivity prefixsel;
480                         Selectivity restsel;
481                         Selectivity selec;
482
483                         if (pstatus == Pattern_Prefix_Partial)
484                                 prefixsel = prefix_selectivity(prefix, relid, attno, ltype);
485                         else
486                                 prefixsel = 1.0;
487                         restsel = pattern_selectivity(rest, ptype);
488                         selec = prefixsel * restsel;
489                         /* result should be in range, but make sure... */
490                         if (selec < 0.0)
491                                 selec = 0.0;
492                         else if (selec > 1.0)
493                                 selec = 1.0;
494                         *result = (float64data) selec;
495                 }
496                 if (prefix)
497                         pfree(prefix);
498                 pfree(patt);
499         }
500         return result;
501 }
502
503 /*
504  *              regexeqsel              - Selectivity of regular-expression pattern match.
505  */
506 float64
507 regexeqsel(Oid opid,
508                    Oid relid,
509                    AttrNumber attno,
510                    Datum value,
511                    int32 flag)
512 {
513         return patternsel(opid, Pattern_Type_Regex, relid, attno, value, flag);
514 }
515
516 /*
517  *              icregexeqsel    - Selectivity of case-insensitive regex match.
518  */
519 float64
520 icregexeqsel(Oid opid,
521                          Oid relid,
522                          AttrNumber attno,
523                          Datum value,
524                          int32 flag)
525 {
526         return patternsel(opid, Pattern_Type_Regex_IC, relid, attno, value, flag);
527 }
528
529 /*
530  *              likesel                 - Selectivity of LIKE pattern match.
531  */
532 float64
533 likesel(Oid opid,
534                 Oid relid,
535                 AttrNumber attno,
536                 Datum value,
537                 int32 flag)
538 {
539         return patternsel(opid, Pattern_Type_Like, relid, attno, value, flag);
540 }
541
542 /*
543  *              regexnesel              - Selectivity of regular-expression pattern non-match.
544  */
545 float64
546 regexnesel(Oid opid,
547                    Oid relid,
548                    AttrNumber attno,
549                    Datum value,
550                    int32 flag)
551 {
552         float64         result;
553
554         result = patternsel(opid, Pattern_Type_Regex, relid, attno, value, flag);
555         *result = 1.0 - *result;
556         return result;
557 }
558
559 /*
560  *              icregexnesel    - Selectivity of case-insensitive regex non-match.
561  */
562 float64
563 icregexnesel(Oid opid,
564                          Oid relid,
565                          AttrNumber attno,
566                          Datum value,
567                          int32 flag)
568 {
569         float64         result;
570
571         result = patternsel(opid, Pattern_Type_Regex_IC, relid, attno, value, flag);
572         *result = 1.0 - *result;
573         return result;
574 }
575
576 /*
577  *              nlikesel                - Selectivity of LIKE pattern non-match.
578  */
579 float64
580 nlikesel(Oid opid,
581                  Oid relid,
582                  AttrNumber attno,
583                  Datum value,
584                  int32 flag)
585 {
586         float64         result;
587
588         result = patternsel(opid, Pattern_Type_Like, relid, attno, value, flag);
589         *result = 1.0 - *result;
590         return result;
591 }
592
593 /*
594  *              eqjoinsel               - Join selectivity of "="
595  */
596 float64
597 eqjoinsel(Oid opid,
598                   Oid relid1,
599                   AttrNumber attno1,
600                   Oid relid2,
601                   AttrNumber attno2)
602 {
603         float64         result;
604         float64data num1,
605                                 num2,
606                                 min;
607         bool            unknown1 = NONVALUE(relid1) || NONVALUE(attno1);
608         bool            unknown2 = NONVALUE(relid2) || NONVALUE(attno2);
609
610         result = (float64) palloc(sizeof(float64data));
611         if (unknown1 && unknown2)
612                 *result = DEFAULT_EQ_SEL;
613         else
614         {
615                 num1 = unknown1 ? 1.0 : get_attdisbursion(relid1, attno1, 0.01);
616                 num2 = unknown2 ? 1.0 : get_attdisbursion(relid2, attno2, 0.01);
617
618                 /*
619                  * The join selectivity cannot be more than num2, since each tuple
620                  * in table 1 could match no more than num2 fraction of tuples in
621                  * table 2 (and that's only if the table-1 tuple matches the most
622                  * common value in table 2, so probably it's less).  By the same
623                  * reasoning it is not more than num1. The min is therefore an
624                  * upper bound.
625                  *
626                  * If we know the disbursion of only one side, use it; the reasoning
627                  * above still works.
628                  *
629                  * XXX can we make a better estimate here?      Using the nullfrac
630                  * statistic might be helpful, for example.  Assuming the operator
631                  * is strict (does not succeed for null inputs) then the
632                  * selectivity couldn't be more than (1-nullfrac1)*(1-nullfrac2),
633                  * which might be usefully small if there are many nulls.  How
634                  * about applying the operator to the most common values?
635                  */
636                 min = (num1 < num2) ? num1 : num2;
637                 *result = min;
638         }
639         return result;
640 }
641
642 /*
643  *              neqjoinsel              - Join selectivity of "!="
644  */
645 float64
646 neqjoinsel(Oid opid,
647                    Oid relid1,
648                    AttrNumber attno1,
649                    Oid relid2,
650                    AttrNumber attno2)
651 {
652         float64         result;
653
654         result = eqjoinsel(opid, relid1, attno1, relid2, attno2);
655         *result = 1.0 - *result;
656         return result;
657 }
658
659 /*
660  *              scalarltjoinsel - Join selectivity of "<" and "<=" for scalars
661  */
662 float64
663 scalarltjoinsel(Oid opid,
664                                 Oid relid1,
665                                 AttrNumber attno1,
666                                 Oid relid2,
667                                 AttrNumber attno2)
668 {
669         float64         result;
670
671         result = (float64) palloc(sizeof(float64data));
672         *result = DEFAULT_INEQ_SEL;
673         return result;
674 }
675
676 /*
677  *              scalargtjoinsel - Join selectivity of ">" and ">=" for scalars
678  */
679 float64
680 scalargtjoinsel(Oid opid,
681                                 Oid relid1,
682                                 AttrNumber attno1,
683                                 Oid relid2,
684                                 AttrNumber attno2)
685 {
686         float64         result;
687
688         result = (float64) palloc(sizeof(float64data));
689         *result = DEFAULT_INEQ_SEL;
690         return result;
691 }
692
693 /*
694  *              regexeqjoinsel  - Join selectivity of regular-expression pattern match.
695  */
696 float64
697 regexeqjoinsel(Oid opid,
698                            Oid relid1,
699                            AttrNumber attno1,
700                            Oid relid2,
701                            AttrNumber attno2)
702 {
703         float64         result;
704
705         result = (float64) palloc(sizeof(float64data));
706         *result = DEFAULT_MATCH_SEL;
707         return result;
708 }
709
710 /*
711  *              icregexeqjoinsel        - Join selectivity of case-insensitive regex match.
712  */
713 float64
714 icregexeqjoinsel(Oid opid,
715                                  Oid relid1,
716                                  AttrNumber attno1,
717                                  Oid relid2,
718                                  AttrNumber attno2)
719 {
720         float64         result;
721
722         result = (float64) palloc(sizeof(float64data));
723         *result = DEFAULT_MATCH_SEL;
724         return result;
725 }
726
727 /*
728  *              likejoinsel                     - Join selectivity of LIKE pattern match.
729  */
730 float64
731 likejoinsel(Oid opid,
732                         Oid relid1,
733                         AttrNumber attno1,
734                         Oid relid2,
735                         AttrNumber attno2)
736 {
737         float64         result;
738
739         result = (float64) palloc(sizeof(float64data));
740         *result = DEFAULT_MATCH_SEL;
741         return result;
742 }
743
744 /*
745  *              regexnejoinsel  - Join selectivity of regex non-match.
746  */
747 float64
748 regexnejoinsel(Oid opid,
749                            Oid relid1,
750                            AttrNumber attno1,
751                            Oid relid2,
752                            AttrNumber attno2)
753 {
754         float64         result;
755
756         result = regexeqjoinsel(opid, relid1, attno1, relid2, attno2);
757         *result = 1.0 - *result;
758         return result;
759 }
760
761 /*
762  *              icregexnejoinsel        - Join selectivity of case-insensitive regex non-match.
763  */
764 float64
765 icregexnejoinsel(Oid opid,
766                                  Oid relid1,
767                                  AttrNumber attno1,
768                                  Oid relid2,
769                                  AttrNumber attno2)
770 {
771         float64         result;
772
773         result = icregexeqjoinsel(opid, relid1, attno1, relid2, attno2);
774         *result = 1.0 - *result;
775         return result;
776 }
777
778 /*
779  *              nlikejoinsel            - Join selectivity of LIKE pattern non-match.
780  */
781 float64
782 nlikejoinsel(Oid opid,
783                          Oid relid1,
784                          AttrNumber attno1,
785                          Oid relid2,
786                          AttrNumber attno2)
787 {
788         float64         result;
789
790         result = likejoinsel(opid, relid1, attno1, relid2, attno2);
791         *result = 1.0 - *result;
792         return result;
793 }
794
795
796 /*
797  * convert_to_scalar
798  *        Convert non-NULL values of the indicated types to the comparison
799  *        scale needed by scalarltsel()/scalargtsel().
800  *        Returns "true" if successful.
801  *
802  * All numeric datatypes are simply converted to their equivalent
803  * "double" values.
804  *
805  * String datatypes are converted by convert_string_to_scalar(),
806  * which is explained below.  The reason why this routine deals with
807  * three values at a time, not just one, is that we need it for strings.
808  *
809  * The several datatypes representing absolute times are all converted
810  * to Timestamp, which is actually a double, and then we just use that
811  * double value.  Note this will give bad results for the various "special"
812  * values of Timestamp --- what can we do with those?
813  *
814  * The several datatypes representing relative times (intervals) are all
815  * converted to measurements expressed in seconds.
816  */
817 static bool
818 convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
819                                   Datum lobound, Datum hibound, Oid boundstypid,
820                                   double *scaledlobound, double *scaledhibound)
821 {
822         switch (valuetypid)
823         {
824
825                 /*
826                  * Built-in numeric types
827                  */
828                 case BOOLOID:
829                 case INT2OID:
830                 case INT4OID:
831                 case INT8OID:
832                 case FLOAT4OID:
833                 case FLOAT8OID:
834                 case NUMERICOID:
835                 case OIDOID:
836                 case REGPROCOID:
837                         *scaledvalue = convert_numeric_to_scalar(value, valuetypid);
838                         *scaledlobound = convert_numeric_to_scalar(lobound, boundstypid);
839                         *scaledhibound = convert_numeric_to_scalar(hibound, boundstypid);
840                         return true;
841
842                 /*
843                  * Built-in string types
844                  */
845                 case CHAROID:
846                 case BPCHAROID:
847                 case VARCHAROID:
848                 case TEXTOID:
849                 case NAMEOID:
850                 {
851                         unsigned char *valstr = convert_string_datum(value, valuetypid);
852                         unsigned char *lostr = convert_string_datum(lobound, boundstypid);
853                         unsigned char *histr = convert_string_datum(hibound, boundstypid);
854
855                         convert_string_to_scalar(valstr, scaledvalue,
856                                                                          lostr, scaledlobound,
857                                                                          histr, scaledhibound);
858                         pfree(valstr);
859                         pfree(lostr);
860                         pfree(histr);
861                         return true;
862                 }
863
864                 /*
865                  * Built-in time types
866                  */
867                 case TIMESTAMPOID:
868                 case ABSTIMEOID:
869                 case DATEOID:
870                 case INTERVALOID:
871                 case RELTIMEOID:
872                 case TINTERVALOID:
873                 case TIMEOID:
874                         *scaledvalue = convert_timevalue_to_scalar(value, valuetypid);
875                         *scaledlobound = convert_timevalue_to_scalar(lobound, boundstypid);
876                         *scaledhibound = convert_timevalue_to_scalar(hibound, boundstypid);
877                         return true;
878         }
879         /* Don't know how to convert */
880         return false;
881 }
882
883 /*
884  * Do convert_to_scalar()'s work for any numeric data type.
885  */
886 static double
887 convert_numeric_to_scalar(Datum value, Oid typid)
888 {
889         switch (typid)
890         {
891                 case BOOLOID:
892                         return (double) DatumGetUInt8(value);
893                 case INT2OID:
894                         return (double) DatumGetInt16(value);
895                 case INT4OID:
896                         return (double) DatumGetInt32(value);
897                 case INT8OID:
898                         return (double) (*i8tod((int64 *) DatumGetPointer(value)));
899                 case FLOAT4OID:
900                         return (double) (*DatumGetFloat32(value));
901                 case FLOAT8OID:
902                         return (double) (*DatumGetFloat64(value));
903                 case NUMERICOID:
904                         return (double) (*numeric_float8((Numeric) DatumGetPointer(value)));
905                 case OIDOID:
906                 case REGPROCOID:
907                         /* we can treat OIDs as integers... */
908                         return (double) DatumGetObjectId(value);
909         }
910         /* Can't get here unless someone tries to use scalarltsel/scalargtsel
911          * on an operator with one numeric and one non-numeric operand.
912          */
913         elog(ERROR, "convert_numeric_to_scalar: unsupported type %u", typid);
914         return 0;
915 }
916
917 /*
918  * Do convert_to_scalar()'s work for any character-string data type.
919  *
920  * String datatypes are converted to a scale that ranges from 0 to 1,
921  * where we visualize the bytes of the string as fractional digits.
922  *
923  * We do not want the base to be 256, however, since that tends to
924  * generate inflated selectivity estimates; few databases will have
925  * occurrences of all 256 possible byte values at each position.
926  * Instead, use the smallest and largest byte values seen in the bounds
927  * as the estimated range for each byte, after some fudging to deal with
928  * the fact that we probably aren't going to see the full range that way.
929  *
930  * An additional refinement is that we discard any common prefix of the
931  * three strings before computing the scaled values.  This allows us to
932  * "zoom in" when we encounter a narrow data range.  An example is a phone
933  * number database where all the values begin with the same area code.
934  */
935 static void
936 convert_string_to_scalar(unsigned char *value,
937                                                  double *scaledvalue,
938                                                  unsigned char *lobound,
939                                                  double *scaledlobound,
940                                                  unsigned char *hibound,
941                                                  double *scaledhibound)
942 {
943         int                     rangelo,
944                                 rangehi;
945         unsigned char *sptr;
946
947         rangelo = rangehi = hibound[0];
948         for (sptr = lobound; *sptr; sptr++)
949         {
950                 if (rangelo > *sptr)
951                         rangelo = *sptr;
952                 if (rangehi < *sptr)
953                         rangehi = *sptr;
954         }
955         for (sptr = hibound; *sptr; sptr++)
956         {
957                 if (rangelo > *sptr)
958                         rangelo = *sptr;
959                 if (rangehi < *sptr)
960                         rangehi = *sptr;
961         }
962         /* If range includes any upper-case ASCII chars, make it include all */
963         if (rangelo <= 'Z' && rangehi >= 'A')
964         {
965                 if (rangelo > 'A')
966                         rangelo = 'A';
967                 if (rangehi < 'Z')
968                         rangehi = 'Z';
969         }
970         /* Ditto lower-case */
971         if (rangelo <= 'z' && rangehi >= 'a')
972         {
973                 if (rangelo > 'a')
974                         rangelo = 'a';
975                 if (rangehi < 'z')
976                         rangehi = 'z';
977         }
978         /* Ditto digits */
979         if (rangelo <= '9' && rangehi >= '0')
980         {
981                 if (rangelo > '0')
982                         rangelo = '0';
983                 if (rangehi < '9')
984                         rangehi = '9';
985         }
986         /* If range includes less than 10 chars, assume we have not got enough
987          * data, and make it include regular ASCII set.
988          */
989         if (rangehi - rangelo < 9)
990         {
991                 rangelo = ' ';
992                 rangehi = 127;
993         }
994
995         /*
996          * Now strip any common prefix of the three strings.
997          */
998         while (*lobound)
999         {
1000                 if (*lobound != *hibound || *lobound != *value)
1001                         break;
1002                 lobound++, hibound++, value++;
1003         }
1004
1005         /*
1006          * Now we can do the conversions.
1007          */
1008         *scaledvalue = convert_one_string_to_scalar(value, rangelo, rangehi);
1009         *scaledlobound = convert_one_string_to_scalar(lobound, rangelo, rangehi);
1010         *scaledhibound = convert_one_string_to_scalar(hibound, rangelo, rangehi);
1011 }
1012
1013 static double
1014 convert_one_string_to_scalar(unsigned char *value, int rangelo, int rangehi)
1015 {
1016         int                     slen = strlen((char *) value);
1017         double          num,
1018                                 denom,
1019                                 base;
1020
1021         if (slen <= 0)
1022                 return 0.0;                             /* empty string has scalar value 0 */
1023
1024         /* Since base is at least 10, need not consider more than about 20 chars */
1025         if (slen > 20)
1026                 slen = 20;
1027
1028         /* Convert initial characters to fraction */
1029         base = rangehi - rangelo + 1;
1030         num = 0.0;
1031         denom = base;
1032         while (slen-- > 0)
1033         {
1034                 int             ch = *value++;
1035
1036                 if (ch < rangelo)
1037                         ch = rangelo-1;
1038                 else if (ch > rangehi)
1039                         ch = rangehi+1;
1040                 num += ((double) (ch - rangelo)) / denom;
1041                 denom *= base;
1042         }
1043
1044         return num;
1045 }
1046
1047 /*
1048  * Convert a string-type Datum into a palloc'd, null-terminated string.
1049  *
1050  * If USE_LOCALE is defined, we must pass the string through strxfrm()
1051  * before continuing, so as to generate correct locale-specific results.
1052  */
1053 static unsigned char *
1054 convert_string_datum(Datum value, Oid typid)
1055 {
1056         char       *val;
1057 #ifdef USE_LOCALE
1058         char       *xfrmstr;
1059         size_t          xfrmsize;
1060         size_t          xfrmlen;
1061 #endif
1062
1063         switch (typid)
1064         {
1065                 case CHAROID:
1066                         val = (char *) palloc(2);
1067                         val[0] = DatumGetChar(value);
1068                         val[1] = '\0';
1069                         break;
1070                 case BPCHAROID:
1071                 case VARCHAROID:
1072                 case TEXTOID:
1073                 {
1074                         char       *str = (char *) VARDATA(DatumGetPointer(value));
1075                         int                     strlength = VARSIZE(DatumGetPointer(value)) - VARHDRSZ;
1076
1077                         val = (char *) palloc(strlength+1);
1078                         memcpy(val, str, strlength);
1079                         val[strlength] = '\0';
1080                         break;
1081                 }
1082                 case NAMEOID:
1083                 {
1084                         NameData   *nm = (NameData *) DatumGetPointer(value);
1085
1086                         val = pstrdup(NameStr(*nm));
1087                         break;
1088                 }
1089                 default:
1090                         /* Can't get here unless someone tries to use scalarltsel
1091                          * on an operator with one string and one non-string operand.
1092                          */
1093                         elog(ERROR, "convert_string_datum: unsupported type %u", typid);
1094                         return NULL;
1095         }
1096
1097 #ifdef USE_LOCALE
1098         /* Guess that transformed string is not much bigger than original */
1099         xfrmsize = strlen(val) + 32;            /* arbitrary pad value here... */
1100         xfrmstr = (char *) palloc(xfrmsize);
1101         xfrmlen = strxfrm(xfrmstr, val, xfrmsize);
1102         if (xfrmlen >= xfrmsize)
1103         {
1104                 /* Oops, didn't make it */
1105                 pfree(xfrmstr);
1106                 xfrmstr = (char *) palloc(xfrmlen + 1);
1107                 xfrmlen = strxfrm(xfrmstr, val, xfrmlen + 1);
1108         }
1109         pfree(val);
1110         val = xfrmstr;
1111 #endif
1112
1113         return (unsigned char *) val;
1114 }
1115
1116 /*
1117  * Do convert_to_scalar()'s work for any timevalue data type.
1118  */
1119 static double
1120 convert_timevalue_to_scalar(Datum value, Oid typid)
1121 {
1122         switch (typid)
1123         {
1124                 case TIMESTAMPOID:
1125                         return *((Timestamp *) DatumGetPointer(value));
1126                 case ABSTIMEOID:
1127                         return *abstime_timestamp(value);
1128                 case DATEOID:
1129                         return *date_timestamp(value);
1130                 case INTERVALOID:
1131                 {
1132                         Interval   *interval = (Interval *) DatumGetPointer(value);
1133
1134                         /*
1135                          * Convert the month part of Interval to days using
1136                          * assumed average month length of 365.25/12.0 days.  Not
1137                          * too accurate, but plenty good enough for our purposes.
1138                          */
1139                         return interval->time +
1140                                 interval->month * (365.25 / 12.0 * 24.0 * 60.0 * 60.0);
1141                 }
1142                 case RELTIMEOID:
1143                         return (RelativeTime) DatumGetInt32(value);
1144                 case TINTERVALOID:
1145                 {
1146                         TimeInterval interval = (TimeInterval) DatumGetPointer(value);
1147
1148                         if (interval->status != 0)
1149                                 return interval->data[1] - interval->data[0];
1150                         return 0;                       /* for lack of a better idea */
1151                 }
1152                 case TIMEOID:
1153                         return *((TimeADT *) DatumGetPointer(value));
1154         }
1155         /* Can't get here unless someone tries to use scalarltsel/scalargtsel
1156          * on an operator with one timevalue and one non-timevalue operand.
1157          */
1158         elog(ERROR, "convert_timevalue_to_scalar: unsupported type %u", typid);
1159         return 0;
1160 }
1161
1162
1163 /*
1164  * getattproperties
1165  *        Retrieve pg_attribute properties for an attribute,
1166  *        including type OID, type len, type byval flag, typmod.
1167  */
1168 static void
1169 getattproperties(Oid relid, AttrNumber attnum,
1170                                  Oid *typid, int *typlen, bool *typbyval, int32 *typmod)
1171 {
1172         HeapTuple       atp;
1173         Form_pg_attribute att_tup;
1174
1175         atp = SearchSysCacheTuple(ATTNUM,
1176                                                           ObjectIdGetDatum(relid),
1177                                                           Int16GetDatum(attnum),
1178                                                           0, 0);
1179         if (!HeapTupleIsValid(atp))
1180                 elog(ERROR, "getattproperties: no attribute tuple %u %d",
1181                          relid, (int) attnum);
1182         att_tup = (Form_pg_attribute) GETSTRUCT(atp);
1183
1184         *typid = att_tup->atttypid;
1185         *typlen = att_tup->attlen;
1186         *typbyval = att_tup->attbyval;
1187         *typmod = att_tup->atttypmod;
1188 }
1189
1190 /*
1191  * getattstatistics
1192  *        Retrieve the pg_statistic data for an attribute.
1193  *        Returns 'false' if no stats are available.
1194  *
1195  * Inputs:
1196  * 'relid' and 'attnum' are the relation and attribute number.
1197  * 'typid' and 'typmod' are the type and typmod of the column,
1198  * which the caller must already have looked up.
1199  *
1200  * Outputs:
1201  * The available stats are nullfrac, commonfrac, commonval, loval, hival.
1202  * The caller need not retrieve all five --- pass NULL pointers for the
1203  * unwanted values.
1204  *
1205  * commonval, loval, hival are returned as Datums holding the internal
1206  * representation of the values.  (Note that these should be pfree'd
1207  * after use if the data type is not by-value.)
1208  */
1209 static bool
1210 getattstatistics(Oid relid,
1211                                  AttrNumber attnum,
1212                                  Oid typid,
1213                                  int32 typmod,
1214                                  double *nullfrac,
1215                                  double *commonfrac,
1216                                  Datum *commonval,
1217                                  Datum *loval,
1218                                  Datum *hival)
1219 {
1220         HeapTuple       tuple;
1221         HeapTuple       typeTuple;
1222         FmgrInfo        inputproc;
1223         Oid                     typelem;
1224         bool            isnull;
1225
1226         /*
1227          * We assume that there will only be one entry in pg_statistic for the
1228          * given rel/att, so we search WITHOUT considering the staop column.
1229          * Someday, VACUUM might store more than one entry per rel/att,
1230          * corresponding to more than one possible sort ordering defined for
1231          * the column type.  However, to make that work we will need to figure
1232          * out which staop to search for --- it's not necessarily the one we
1233          * have at hand!  (For example, we might have a '>' operator rather
1234          * than the '<' operator that will appear in staop.)
1235          */
1236         tuple = SearchSysCacheTuple(STATRELID,
1237                                                                 ObjectIdGetDatum(relid),
1238                                                                 Int16GetDatum((int16) attnum),
1239                                                                 0,
1240                                                                 0);
1241         if (!HeapTupleIsValid(tuple))
1242         {
1243                 /* no such stats entry */
1244                 return false;
1245         }
1246
1247         if (nullfrac)
1248                 *nullfrac = ((Form_pg_statistic) GETSTRUCT(tuple))->stanullfrac;
1249         if (commonfrac)
1250                 *commonfrac = ((Form_pg_statistic) GETSTRUCT(tuple))->stacommonfrac;
1251
1252         /* Get the type input proc for the column datatype */
1253         typeTuple = SearchSysCacheTuple(TYPEOID,
1254                                                                         ObjectIdGetDatum(typid),
1255                                                                         0, 0, 0);
1256         if (!HeapTupleIsValid(typeTuple))
1257                 elog(ERROR, "getattstatistics: Cache lookup failed for type %u",
1258                          typid);
1259         fmgr_info(((Form_pg_type) GETSTRUCT(typeTuple))->typinput, &inputproc);
1260         typelem = ((Form_pg_type) GETSTRUCT(typeTuple))->typelem;
1261
1262         /*
1263          * Values are variable-length fields, so cannot access as struct
1264          * fields. Must do it the hard way with SysCacheGetAttr.
1265          */
1266         if (commonval)
1267         {
1268                 text       *val = (text *) SysCacheGetAttr(STATRELID, tuple,
1269                                                                                   Anum_pg_statistic_stacommonval,
1270                                                                                                    &isnull);
1271
1272                 if (isnull)
1273                 {
1274                         elog(DEBUG, "getattstatistics: stacommonval is null");
1275                         *commonval = PointerGetDatum(NULL);
1276                 }
1277                 else
1278                 {
1279                         char       *strval = textout(val);
1280
1281                         *commonval = (Datum)
1282                                 (*fmgr_faddr(&inputproc)) (strval, typelem, typmod);
1283                         pfree(strval);
1284                 }
1285         }
1286
1287         if (loval)
1288         {
1289                 text       *val = (text *) SysCacheGetAttr(STATRELID, tuple,
1290                                                                                           Anum_pg_statistic_staloval,
1291                                                                                                    &isnull);
1292
1293                 if (isnull)
1294                 {
1295                         elog(DEBUG, "getattstatistics: staloval is null");
1296                         *loval = PointerGetDatum(NULL);
1297                 }
1298                 else
1299                 {
1300                         char       *strval = textout(val);
1301
1302                         *loval = (Datum)
1303                                 (*fmgr_faddr(&inputproc)) (strval, typelem, typmod);
1304                         pfree(strval);
1305                 }
1306         }
1307
1308         if (hival)
1309         {
1310                 text       *val = (text *) SysCacheGetAttr(STATRELID, tuple,
1311                                                                                           Anum_pg_statistic_stahival,
1312                                                                                                    &isnull);
1313
1314                 if (isnull)
1315                 {
1316                         elog(DEBUG, "getattstatistics: stahival is null");
1317                         *hival = PointerGetDatum(NULL);
1318                 }
1319                 else
1320                 {
1321                         char       *strval = textout(val);
1322
1323                         *hival = (Datum)
1324                                 (*fmgr_faddr(&inputproc)) (strval, typelem, typmod);
1325                         pfree(strval);
1326                 }
1327         }
1328
1329         return true;
1330 }
1331
1332 /*-------------------------------------------------------------------------
1333  *
1334  * Pattern analysis functions
1335  *
1336  * These routines support analysis of LIKE and regular-expression patterns
1337  * by the planner/optimizer.  It's important that they agree with the
1338  * regular-expression code in backend/regex/ and the LIKE code in
1339  * backend/utils/adt/like.c.
1340  *
1341  * Note that the prefix-analysis functions are called from
1342  * backend/optimizer/path/indxpath.c as well as from routines in this file.
1343  *
1344  *-------------------------------------------------------------------------
1345  */
1346
1347 /*
1348  * Extract the fixed prefix, if any, for a pattern.
1349  * *prefix is set to a palloc'd prefix string,
1350  * or to NULL if no fixed prefix exists for the pattern.
1351  * *rest is set to point to the remainder of the pattern after the
1352  * portion describing the fixed prefix.
1353  * The return value distinguishes no fixed prefix, a partial prefix,
1354  * or an exact-match-only pattern.
1355  */
1356
1357 static Pattern_Prefix_Status
1358 like_fixed_prefix(char *patt, char **prefix, char **rest)
1359 {
1360         char       *match;
1361         int                     pos,
1362                                 match_pos;
1363
1364         *prefix = match = palloc(strlen(patt) + 1);
1365         match_pos = 0;
1366
1367         for (pos = 0; patt[pos]; pos++)
1368         {
1369                 /* % and _ are wildcard characters in LIKE */
1370                 if (patt[pos] == '%' ||
1371                         patt[pos] == '_')
1372                         break;
1373                 /* Backslash quotes the next character */
1374                 if (patt[pos] == '\\')
1375                 {
1376                         pos++;
1377                         if (patt[pos] == '\0')
1378                                 break;
1379                 }
1380
1381                 /*
1382                  * NOTE: this code used to think that %% meant a literal %, but
1383                  * textlike() itself does not think that, and the SQL92 spec
1384                  * doesn't say any such thing either.
1385                  */
1386                 match[match_pos++] = patt[pos];
1387         }
1388
1389         match[match_pos] = '\0';
1390         *rest = &patt[pos];
1391
1392         /* in LIKE, an empty pattern is an exact match! */
1393         if (patt[pos] == '\0')
1394                 return Pattern_Prefix_Exact;    /* reached end of pattern, so exact */
1395
1396         if (match_pos > 0)
1397                 return Pattern_Prefix_Partial;
1398
1399         pfree(match);
1400         *prefix = NULL;
1401         return Pattern_Prefix_None;
1402 }
1403
1404 static Pattern_Prefix_Status
1405 regex_fixed_prefix(char *patt, bool case_insensitive,
1406                                    char **prefix, char **rest)
1407 {
1408         char       *match;
1409         int                     pos,
1410                                 match_pos,
1411                                 paren_depth;
1412
1413         /* Pattern must be anchored left */
1414         if (patt[0] != '^')
1415         {
1416                 *prefix = NULL;
1417                 *rest = patt;
1418                 return Pattern_Prefix_None;
1419         }
1420
1421         /* If unquoted | is present at paren level 0 in pattern, then there
1422          * are multiple alternatives for the start of the string.
1423          */
1424         paren_depth = 0;
1425         for (pos = 1; patt[pos]; pos++)
1426         {
1427                 if (patt[pos] == '|' && paren_depth == 0)
1428                 {
1429                         *prefix = NULL;
1430                         *rest = patt;
1431                         return Pattern_Prefix_None;
1432                 }
1433                 else if (patt[pos] == '(')
1434                         paren_depth++;
1435                 else if (patt[pos] == ')' && paren_depth > 0)
1436                         paren_depth--;
1437                 else if (patt[pos] == '\\')
1438                 {
1439                         /* backslash quotes the next character */
1440                         pos++;
1441                         if (patt[pos] == '\0')
1442                                 break;
1443                 }
1444         }
1445
1446         /* OK, allocate space for pattern */
1447         *prefix = match = palloc(strlen(patt) + 1);
1448         match_pos = 0;
1449
1450         /* note start at pos 1 to skip leading ^ */
1451         for (pos = 1; patt[pos]; pos++)
1452         {
1453                 /*
1454                  * Check for characters that indicate multiple possible matches here.
1455                  * XXX I suspect isalpha() is not an adequately locale-sensitive
1456                  * test for characters that can vary under case folding?
1457                  */
1458                 if (patt[pos] == '.' ||
1459                         patt[pos] == '(' ||
1460                         patt[pos] == '[' ||
1461                         patt[pos] == '$' ||
1462                         (case_insensitive && isalpha(patt[pos])))
1463                         break;
1464                 /*
1465                  * Check for quantifiers.  Except for +, this means the preceding
1466                  * character is optional, so we must remove it from the prefix too!
1467                  */
1468                 if (patt[pos] == '*' ||
1469                         patt[pos] == '?' ||
1470                         patt[pos] == '{')
1471                 {
1472                         if (match_pos > 0)
1473                                 match_pos--;
1474                         pos--;
1475                         break;
1476                 }
1477                 if (patt[pos] == '+')
1478                 {
1479                         pos--;
1480                         break;
1481                 }
1482                 if (patt[pos] == '\\')
1483                 {
1484                         /* backslash quotes the next character */
1485                         pos++;
1486                         if (patt[pos] == '\0')
1487                                 break;
1488                 }
1489                 match[match_pos++] = patt[pos];
1490         }
1491
1492         match[match_pos] = '\0';
1493         *rest = &patt[pos];
1494
1495         if (patt[pos] == '$' && patt[pos + 1] == '\0')
1496         {
1497                 *rest = &patt[pos + 1];
1498                 return Pattern_Prefix_Exact;    /* pattern specifies exact match */
1499         }
1500
1501         if (match_pos > 0)
1502                 return Pattern_Prefix_Partial;
1503
1504         pfree(match);
1505         *prefix = NULL;
1506         return Pattern_Prefix_None;
1507 }
1508
1509 Pattern_Prefix_Status
1510 pattern_fixed_prefix(char *patt, Pattern_Type ptype,
1511                                          char **prefix, char **rest)
1512 {
1513         Pattern_Prefix_Status result;
1514
1515         switch (ptype)
1516         {
1517                 case Pattern_Type_Like:
1518                         result = like_fixed_prefix(patt, prefix, rest);
1519                         break;
1520                 case Pattern_Type_Regex:
1521                         result = regex_fixed_prefix(patt, false, prefix, rest);
1522                         break;
1523                 case Pattern_Type_Regex_IC:
1524                         result = regex_fixed_prefix(patt, true, prefix, rest);
1525                         break;
1526                 default:
1527                         elog(ERROR, "pattern_fixed_prefix: bogus ptype");
1528                         result = Pattern_Prefix_None; /* keep compiler quiet */
1529                         break;
1530         }
1531         return result;
1532 }
1533
1534 /*
1535  * Estimate the selectivity of a fixed prefix for a pattern match.
1536  *
1537  * A fixed prefix "foo" is estimated as the selectivity of the expression
1538  * "var >= 'foo' AND var < 'fop'" (see also indxqual.c).
1539  */
1540 static Selectivity
1541 prefix_selectivity(char *prefix,
1542                                    Oid relid,
1543                                    AttrNumber attno,
1544                                    Oid datatype)
1545 {
1546         Selectivity     prefixsel;
1547         Oid                     cmpopr;
1548         Datum           prefixcon;
1549         char       *greaterstr;
1550
1551         cmpopr = find_operator(">=", datatype);
1552         if (cmpopr == InvalidOid)
1553                 elog(ERROR, "prefix_selectivity: no >= operator for type %u",
1554                          datatype);
1555         prefixcon = string_to_datum(prefix, datatype);
1556         /* Assume scalargtsel is appropriate for all supported types */
1557         prefixsel = * scalargtsel(cmpopr, relid, attno,
1558                                                           prefixcon, SEL_CONSTANT|SEL_RIGHT);
1559         pfree(DatumGetPointer(prefixcon));
1560
1561         /*
1562          * If we can create a string larger than the prefix,
1563          * say "x < greaterstr".
1564          */
1565         greaterstr = make_greater_string(prefix, datatype);
1566         if (greaterstr)
1567         {
1568                 Selectivity             topsel;
1569
1570                 cmpopr = find_operator("<", datatype);
1571                 if (cmpopr == InvalidOid)
1572                         elog(ERROR, "prefix_selectivity: no < operator for type %u",
1573                                  datatype);
1574                 prefixcon = string_to_datum(greaterstr, datatype);
1575                 /* Assume scalarltsel is appropriate for all supported types */
1576                 topsel = * scalarltsel(cmpopr, relid, attno,
1577                                                            prefixcon, SEL_CONSTANT|SEL_RIGHT);
1578                 pfree(DatumGetPointer(prefixcon));
1579                 pfree(greaterstr);
1580
1581                 /*
1582                  * Merge the two selectivities in the same way as for
1583                  * a range query (see clauselist_selectivity()).
1584                  */
1585                 prefixsel = topsel + prefixsel - 1.0;
1586
1587                 /*
1588                  * A zero or slightly negative prefixsel should be converted into a
1589                  * small positive value; we probably are dealing with a very
1590                  * tight range and got a bogus result due to roundoff errors.
1591                  * However, if prefixsel is very negative, then we probably have
1592                  * default selectivity estimates on one or both sides of the
1593                  * range.  In that case, insert a not-so-wildly-optimistic
1594                  * default estimate.
1595                  */
1596                 if (prefixsel <= 0.0)
1597                 {
1598                         if (prefixsel < -0.01)
1599                         {
1600
1601                                 /*
1602                                  * No data available --- use a default estimate that
1603                                  * is small, but not real small.
1604                                  */
1605                                 prefixsel = 0.01;
1606                         }
1607                         else
1608                         {
1609
1610                                 /*
1611                                  * It's just roundoff error; use a small positive value
1612                                  */
1613                                 prefixsel = 1.0e-10;
1614                         }
1615                 }
1616         }
1617
1618         return prefixsel;
1619 }
1620
1621
1622 /*
1623  * Estimate the selectivity of a pattern of the specified type.
1624  * Note that any fixed prefix of the pattern will have been removed already.
1625  *
1626  * For now, we use a very simplistic approach: fixed characters reduce the
1627  * selectivity a good deal, character ranges reduce it a little,
1628  * wildcards (such as % for LIKE or .* for regex) increase it.
1629  */
1630
1631 #define FIXED_CHAR_SEL  0.04    /* about 1/25 */
1632 #define CHAR_RANGE_SEL  0.25
1633 #define ANY_CHAR_SEL    0.9             /* not 1, since it won't match end-of-string */
1634 #define FULL_WILDCARD_SEL 5.0
1635 #define PARTIAL_WILDCARD_SEL 2.0
1636
1637 static Selectivity
1638 like_selectivity(char *patt)
1639 {
1640         Selectivity             sel = 1.0;
1641         int                             pos;
1642
1643         /* Skip any leading %; it's already factored into initial sel */
1644         pos = (*patt == '%') ? 1 : 0;
1645         for (; patt[pos]; pos++)
1646         {
1647                 /* % and _ are wildcard characters in LIKE */
1648                 if (patt[pos] == '%')
1649                         sel *= FULL_WILDCARD_SEL;
1650                 else if (patt[pos] == '_')
1651                         sel *= ANY_CHAR_SEL;
1652                 else if (patt[pos] == '\\')
1653                 {
1654                         /* Backslash quotes the next character */
1655                         pos++;
1656                         if (patt[pos] == '\0')
1657                                 break;
1658                         sel *= FIXED_CHAR_SEL;
1659                 }
1660                 else
1661                         sel *= FIXED_CHAR_SEL;
1662         }
1663         /* Could get sel > 1 if multiple wildcards */
1664         if (sel > 1.0)
1665                 sel = 1.0;
1666         return sel;
1667 }
1668
1669 static Selectivity
1670 regex_selectivity_sub(char *patt, int pattlen, bool case_insensitive)
1671 {
1672         Selectivity             sel = 1.0;
1673         int                             paren_depth = 0;
1674         int                             paren_pos = 0; /* dummy init to keep compiler quiet */
1675         int                             pos;
1676
1677         for (pos = 0; pos < pattlen; pos++)
1678         {
1679                 if (patt[pos] == '(')
1680                 {
1681                         if (paren_depth == 0)
1682                                 paren_pos = pos; /* remember start of parenthesized item */
1683                         paren_depth++;
1684                 }
1685                 else if (patt[pos] == ')' && paren_depth > 0)
1686                 {
1687                         paren_depth--;
1688                         if (paren_depth == 0)
1689                                 sel *= regex_selectivity_sub(patt + (paren_pos + 1),
1690                                                                                          pos - (paren_pos + 1),
1691                                                                                          case_insensitive);
1692                 }
1693                 else if (patt[pos] == '|' && paren_depth == 0)
1694                 {
1695                         /*
1696                          * If unquoted | is present at paren level 0 in pattern,
1697                          * we have multiple alternatives; sum their probabilities.
1698                          */
1699                         sel += regex_selectivity_sub(patt + (pos + 1),
1700                                                                                  pattlen - (pos + 1),
1701                                                                                  case_insensitive);
1702                         break;                          /* rest of pattern is now processed */
1703                 }
1704                 else if (patt[pos] == '[')
1705                 {
1706                         bool    negclass = false;
1707
1708                         if (patt[++pos] == '^')
1709                         {
1710                                 negclass = true;
1711                                 pos++;
1712                         }
1713                         if (patt[pos] == ']') /* ']' at start of class is not special */
1714                                 pos++;
1715                         while (pos < pattlen && patt[pos] != ']')
1716                                 pos++;
1717                         if (paren_depth == 0)
1718                                 sel *= (negclass ? (1.0-CHAR_RANGE_SEL) : CHAR_RANGE_SEL);
1719                 }
1720                 else if (patt[pos] == '.')
1721                 {
1722                         if (paren_depth == 0)
1723                                 sel *= ANY_CHAR_SEL;
1724                 }
1725                 else if (patt[pos] == '*' ||
1726                                  patt[pos] == '?' ||
1727                                  patt[pos] == '+')
1728                 {
1729                         /* Ought to be smarter about quantifiers... */
1730                         if (paren_depth == 0)
1731                                 sel *= PARTIAL_WILDCARD_SEL;
1732                 }
1733                 else if (patt[pos] == '{')
1734                 {
1735                         while (pos < pattlen && patt[pos] != '}')
1736                                 pos++;
1737                         if (paren_depth == 0)
1738                                 sel *= PARTIAL_WILDCARD_SEL;
1739                 }
1740                 else if (patt[pos] == '\\')
1741                 {
1742                         /* backslash quotes the next character */
1743                         pos++;
1744                         if (pos >= pattlen)
1745                                 break;
1746                         if (paren_depth == 0)
1747                                 sel *= FIXED_CHAR_SEL;
1748                 }
1749                 else
1750                 {
1751                         if (paren_depth == 0)
1752                                 sel *= FIXED_CHAR_SEL;
1753                 }
1754         }
1755         /* Could get sel > 1 if multiple wildcards */
1756         if (sel > 1.0)
1757                 sel = 1.0;
1758         return sel;
1759 }
1760
1761 static Selectivity
1762 regex_selectivity(char *patt, bool case_insensitive)
1763 {
1764         Selectivity             sel;
1765         int                             pattlen = strlen(patt);
1766
1767         /* If patt doesn't end with $, consider it to have a trailing wildcard */
1768         if (pattlen > 0 && patt[pattlen-1] == '$' &&
1769                 (pattlen == 1 || patt[pattlen-2] != '\\'))
1770         {
1771                 /* has trailing $ */
1772                 sel = regex_selectivity_sub(patt, pattlen-1, case_insensitive);
1773         }
1774         else
1775         {
1776                 /* no trailing $ */
1777                 sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
1778                 sel *= FULL_WILDCARD_SEL;
1779                 if (sel > 1.0)
1780                         sel = 1.0;
1781         }
1782         return sel;
1783 }
1784
1785 static Selectivity
1786 pattern_selectivity(char *patt, Pattern_Type ptype)
1787 {
1788         Selectivity result;
1789
1790         switch (ptype)
1791         {
1792                 case Pattern_Type_Like:
1793                         result = like_selectivity(patt);
1794                         break;
1795                 case Pattern_Type_Regex:
1796                         result = regex_selectivity(patt, false);
1797                         break;
1798                 case Pattern_Type_Regex_IC:
1799                         result = regex_selectivity(patt, true);
1800                         break;
1801                 default:
1802                         elog(ERROR, "pattern_selectivity: bogus ptype");
1803                         result = 1.0;           /* keep compiler quiet */
1804                         break;
1805         }
1806         return result;
1807 }
1808
1809
1810 /*
1811  * Try to generate a string greater than the given string or any string it is
1812  * a prefix of.  If successful, return a palloc'd string; else return NULL.
1813  *
1814  * To work correctly in non-ASCII locales with weird collation orders,
1815  * we cannot simply increment "foo" to "fop" --- we have to check whether
1816  * we actually produced a string greater than the given one.  If not,
1817  * increment the righthand byte again and repeat.  If we max out the righthand
1818  * byte, truncate off the last character and start incrementing the next.
1819  * For example, if "z" were the last character in the sort order, then we
1820  * could produce "foo" as a string greater than "fonz".
1821  *
1822  * This could be rather slow in the worst case, but in most cases we won't
1823  * have to try more than one or two strings before succeeding.
1824  *
1825  * XXX in a sufficiently weird locale, this might produce incorrect results?
1826  * For example, in German I believe "ss" is treated specially --- if we are
1827  * given "foos" and return "foot", will this actually be greater than "fooss"?
1828  */
1829 char *
1830 make_greater_string(const char *str, Oid datatype)
1831 {
1832         char       *workstr;
1833         int                     len;
1834
1835         /*
1836          * Make a modifiable copy, which will be our return value if
1837          * successful
1838          */
1839         workstr = pstrdup((char *) str);
1840
1841         while ((len = strlen(workstr)) > 0)
1842         {
1843                 unsigned char *lastchar = (unsigned char *) (workstr + len - 1);
1844
1845                 /*
1846                  * Try to generate a larger string by incrementing the last byte.
1847                  */
1848                 while (*lastchar < (unsigned char) 255)
1849                 {
1850                         (*lastchar)++;
1851                         if (string_lessthan(str, workstr, datatype))
1852                                 return workstr; /* Success! */
1853                 }
1854
1855                 /*
1856                  * Truncate off the last character, which might be more than 1
1857                  * byte in MULTIBYTE case.
1858                  */
1859 #ifdef MULTIBYTE
1860                 len = pg_mbcliplen((const unsigned char *) workstr, len, len - 1);
1861                 workstr[len] = '\0';
1862 #else
1863                 *lastchar = '\0';
1864 #endif
1865         }
1866
1867         /* Failed... */
1868         pfree(workstr);
1869         return NULL;
1870 }
1871
1872 /*
1873  * Test whether two strings are "<" according to the rules of the given
1874  * datatype.  We do this the hard way, ie, actually calling the type's
1875  * "<" operator function, to ensure we get the right result...
1876  */
1877 static bool
1878 string_lessthan(const char *str1, const char *str2, Oid datatype)
1879 {
1880         Datum           datum1 = string_to_datum(str1, datatype);
1881         Datum           datum2 = string_to_datum(str2, datatype);
1882         bool            result;
1883
1884         switch (datatype)
1885         {
1886                 case TEXTOID:
1887                         result = text_lt((text *) datum1, (text *) datum2);
1888                         break;
1889
1890                 case BPCHAROID:
1891                         result = bpcharlt((char *) datum1, (char *) datum2);
1892                         break;
1893
1894                 case VARCHAROID:
1895                         result = varcharlt((char *) datum1, (char *) datum2);
1896                         break;
1897
1898                 case NAMEOID:
1899                         result = namelt((NameData *) datum1, (NameData *) datum2);
1900                         break;
1901
1902                 default:
1903                         elog(ERROR, "string_lessthan: unexpected datatype %u", datatype);
1904                         result = false;
1905                         break;
1906         }
1907
1908         pfree(DatumGetPointer(datum1));
1909         pfree(DatumGetPointer(datum2));
1910
1911         return result;
1912 }
1913
1914 /* See if there is a binary op of the given name for the given datatype */
1915 static Oid
1916 find_operator(const char *opname, Oid datatype)
1917 {
1918         HeapTuple       optup;
1919
1920         optup = SearchSysCacheTuple(OPERNAME,
1921                                                                 PointerGetDatum(opname),
1922                                                                 ObjectIdGetDatum(datatype),
1923                                                                 ObjectIdGetDatum(datatype),
1924                                                                 CharGetDatum('b'));
1925         if (!HeapTupleIsValid(optup))
1926                 return InvalidOid;
1927         return optup->t_data->t_oid;
1928 }
1929
1930 /*
1931  * Generate a Datum of the appropriate type from a C string.
1932  * Note that all of the supported types are pass-by-ref, so the
1933  * returned value should be pfree'd if no longer needed.
1934  */
1935 static Datum
1936 string_to_datum(const char *str, Oid datatype)
1937 {
1938
1939         /*
1940          * We cheat a little by assuming that textin() will do for bpchar and
1941          * varchar constants too...
1942          */
1943         if (datatype == NAMEOID)
1944                 return PointerGetDatum(namein((char *) str));
1945         else
1946                 return PointerGetDatum(textin((char *) str));
1947 }
1948
1949 /*-------------------------------------------------------------------------
1950  *
1951  * Index cost estimation functions
1952  *
1953  * genericcostestimate is a general-purpose estimator for use when we
1954  * don't have any better idea about how to estimate.  Index-type-specific
1955  * knowledge can be incorporated in the type-specific routines.
1956  *
1957  *-------------------------------------------------------------------------
1958  */
1959
1960 static void
1961 genericcostestimate(Query *root, RelOptInfo *rel,
1962                                         IndexOptInfo *index, List *indexQuals,
1963                                         Cost *indexStartupCost,
1964                                         Cost *indexTotalCost,
1965                                         Selectivity *indexSelectivity)
1966 {
1967         double          numIndexTuples;
1968         double          numIndexPages;
1969
1970         /* Estimate the fraction of main-table tuples that will be visited */
1971         *indexSelectivity = clauselist_selectivity(root, indexQuals,
1972                                                                                            lfirsti(rel->relids));
1973
1974         /* Estimate the number of index tuples that will be visited */
1975         numIndexTuples = *indexSelectivity * index->tuples;
1976
1977         /* Estimate the number of index pages that will be retrieved */
1978         numIndexPages = *indexSelectivity * index->pages;
1979
1980         /*
1981          * Always estimate at least one tuple and page are touched, even when
1982          * indexSelectivity estimate is tiny.
1983          */
1984         if (numIndexTuples < 1.0)
1985                 numIndexTuples = 1.0;
1986         if (numIndexPages < 1.0)
1987                 numIndexPages = 1.0;
1988
1989         /*
1990          * Compute the index access cost.
1991          *
1992          * Our generic assumption is that the index pages will be read
1993          * sequentially, so they have cost 1.0 each, not random_page_cost.
1994          * Also, we charge for evaluation of the indexquals at each index
1995          * tuple. All the costs are assumed to be paid incrementally during
1996          * the scan.
1997          */
1998         *indexStartupCost = 0;
1999         *indexTotalCost = numIndexPages +
2000                 (cpu_index_tuple_cost + cost_qual_eval(indexQuals)) * numIndexTuples;
2001 }
2002
2003 /*
2004  * For first cut, just use generic function for all index types.
2005  */
2006
2007 void
2008 btcostestimate(Query *root, RelOptInfo *rel,
2009                            IndexOptInfo *index, List *indexQuals,
2010                            Cost *indexStartupCost,
2011                            Cost *indexTotalCost,
2012                            Selectivity *indexSelectivity)
2013 {
2014         genericcostestimate(root, rel, index, indexQuals,
2015                                          indexStartupCost, indexTotalCost, indexSelectivity);
2016 }
2017
2018 void
2019 rtcostestimate(Query *root, RelOptInfo *rel,
2020                            IndexOptInfo *index, List *indexQuals,
2021                            Cost *indexStartupCost,
2022                            Cost *indexTotalCost,
2023                            Selectivity *indexSelectivity)
2024 {
2025         genericcostestimate(root, rel, index, indexQuals,
2026                                          indexStartupCost, indexTotalCost, indexSelectivity);
2027 }
2028
2029 void
2030 hashcostestimate(Query *root, RelOptInfo *rel,
2031                                  IndexOptInfo *index, List *indexQuals,
2032                                  Cost *indexStartupCost,
2033                                  Cost *indexTotalCost,
2034                                  Selectivity *indexSelectivity)
2035 {
2036         genericcostestimate(root, rel, index, indexQuals,
2037                                          indexStartupCost, indexTotalCost, indexSelectivity);
2038 }
2039
2040 void
2041 gistcostestimate(Query *root, RelOptInfo *rel,
2042                                  IndexOptInfo *index, List *indexQuals,
2043                                  Cost *indexStartupCost,
2044                                  Cost *indexTotalCost,
2045                                  Selectivity *indexSelectivity)
2046 {
2047         genericcostestimate(root, rel, index, indexQuals,
2048                                          indexStartupCost, indexTotalCost, indexSelectivity);
2049 }