]> granicus.if.org Git - postgresql/blob - src/backend/utils/adt/selfuncs.c
Move ltree parentsel() selectivity function into /contrib/ltree.
[postgresql] / src / backend / utils / adt / selfuncs.c
1 /*-------------------------------------------------------------------------
2  *
3  * selfuncs.c
4  *        Selectivity functions and index cost estimation functions for
5  *        standard operators and index access methods.
6  *
7  *        Selectivity routines are registered in the pg_operator catalog
8  *        in the "oprrest" and "oprjoin" attributes.
9  *
10  *        Index cost functions are registered in the pg_am catalog
11  *        in the "amcostestimate" attribute.
12  *
13  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
14  * Portions Copyright (c) 1994, Regents of the University of California
15  *
16  *
17  * IDENTIFICATION
18  *        $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.201 2006/04/26 22:32:56 momjian Exp $
19  *
20  *-------------------------------------------------------------------------
21  */
22
23 /*----------
24  * Operator selectivity estimation functions are called to estimate the
25  * selectivity of WHERE clauses whose top-level operator is their operator.
26  * We divide the problem into two cases:
27  *              Restriction clause estimation: the clause involves vars of just
28  *                      one relation.
29  *              Join clause estimation: the clause involves vars of multiple rels.
30  * Join selectivity estimation is far more difficult and usually less accurate
31  * than restriction estimation.
32  *
33  * When dealing with the inner scan of a nestloop join, we consider the
34  * join's joinclauses as restriction clauses for the inner relation, and
35  * treat vars of the outer relation as parameters (a/k/a constants of unknown
36  * values).  So, restriction estimators need to be able to accept an argument
37  * telling which relation is to be treated as the variable.
38  *
39  * The call convention for a restriction estimator (oprrest function) is
40  *
41  *              Selectivity oprrest (PlannerInfo *root,
42  *                                                       Oid operator,
43  *                                                       List *args,
44  *                                                       int varRelid);
45  *
46  * root: general information about the query (rtable and RelOptInfo lists
47  * are particularly important for the estimator).
48  * operator: OID of the specific operator in question.
49  * args: argument list from the operator clause.
50  * varRelid: if not zero, the relid (rtable index) of the relation to
51  * be treated as the variable relation.  May be zero if the args list
52  * is known to contain vars of only one relation.
53  *
54  * This is represented at the SQL level (in pg_proc) as
55  *
56  *              float8 oprrest (internal, oid, internal, int4);
57  *
58  * The call convention for a join estimator (oprjoin function) is similar
59  * except that varRelid is not needed, and instead the join type is
60  * supplied:
61  *
62  *              Selectivity oprjoin (PlannerInfo *root,
63  *                                                       Oid operator,
64  *                                                       List *args,
65  *                                                       JoinType jointype);
66  *
67  *              float8 oprjoin (internal, oid, internal, int2);
68  *
69  * (We deliberately make the SQL signature different to facilitate
70  * catching errors.)
71  *----------
72  */
73
74 #include "postgres.h"
75
76 #include <ctype.h>
77 #include <math.h>
78
79 #include "access/heapam.h"
80 #include "access/nbtree.h"
81 #include "access/tuptoaster.h"
82 #include "catalog/pg_namespace.h"
83 #include "catalog/pg_opclass.h"
84 #include "catalog/pg_operator.h"
85 #include "catalog/pg_proc.h"
86 #include "catalog/pg_statistic.h"
87 #include "catalog/pg_type.h"
88 #include "mb/pg_wchar.h"
89 #include "nodes/makefuncs.h"
90 #include "optimizer/clauses.h"
91 #include "optimizer/cost.h"
92 #include "optimizer/pathnode.h"
93 #include "optimizer/paths.h"
94 #include "optimizer/plancat.h"
95 #include "optimizer/prep.h"
96 #include "optimizer/restrictinfo.h"
97 #include "optimizer/tlist.h"
98 #include "optimizer/var.h"
99 #include "parser/parse_expr.h"
100 #include "parser/parse_func.h"
101 #include "parser/parse_oper.h"
102 #include "parser/parsetree.h"
103 #include "utils/builtins.h"
104 #include "utils/date.h"
105 #include "utils/datum.h"
106 #include "utils/int8.h"
107 #include "utils/lsyscache.h"
108 #include "utils/nabstime.h"
109 #include "utils/pg_locale.h"
110 #include "utils/selfuncs.h"
111 #include "utils/syscache.h"
112
113
114 static double mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
115                                                           Datum constval, double *sumcommonp);
116 static double ineq_histogram_selectivity(VariableStatData *vardata,
117                                                                                  FmgrInfo *opproc, bool isgt,
118                                                                                  Datum constval, Oid consttype);
119 static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
120                                   Datum lobound, Datum hibound, Oid boundstypid,
121                                   double *scaledlobound, double *scaledhibound);
122 static double convert_numeric_to_scalar(Datum value, Oid typid);
123 static void convert_string_to_scalar(char *value,
124                                                  double *scaledvalue,
125                                                  char *lobound,
126                                                  double *scaledlobound,
127                                                  char *hibound,
128                                                  double *scaledhibound);
129 static void convert_bytea_to_scalar(Datum value,
130                                                 double *scaledvalue,
131                                                 Datum lobound,
132                                                 double *scaledlobound,
133                                                 Datum hibound,
134                                                 double *scaledhibound);
135 static double convert_one_string_to_scalar(char *value,
136                                                          int rangelo, int rangehi);
137 static double convert_one_bytea_to_scalar(unsigned char *value, int valuelen,
138                                                         int rangelo, int rangehi);
139 static char *convert_string_datum(Datum value, Oid typid);
140 static double convert_timevalue_to_scalar(Datum value, Oid typid);
141 static void get_join_variables(PlannerInfo *root, List *args,
142                                    VariableStatData *vardata1,
143                                    VariableStatData *vardata2);
144 static void examine_variable(PlannerInfo *root, Node *node, int varRelid,
145                                  VariableStatData *vardata);
146 static double get_variable_numdistinct(VariableStatData *vardata);
147 static bool get_variable_maximum(PlannerInfo *root, VariableStatData *vardata,
148                                          Oid sortop, Datum *max);
149 static Selectivity prefix_selectivity(VariableStatData *vardata,
150                                                                           Oid opclass, Const *prefixcon);
151 static Selectivity pattern_selectivity(Const *patt, Pattern_Type ptype);
152 static Datum string_to_datum(const char *str, Oid datatype);
153 static Const *string_to_const(const char *str, Oid datatype);
154 static Const *string_to_bytea_const(const char *str, size_t str_len);
155
156
157 /*
158  *              eqsel                   - Selectivity of "=" for any data types.
159  *
160  * Note: this routine is also used to estimate selectivity for some
161  * operators that are not "=" but have comparable selectivity behavior,
162  * such as "~=" (geometric approximate-match).  Even for "=", we must
163  * keep in mind that the left and right datatypes may differ.
164  */
165 Datum
166 eqsel(PG_FUNCTION_ARGS)
167 {
168         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
169         Oid                     operator = PG_GETARG_OID(1);
170         List       *args = (List *) PG_GETARG_POINTER(2);
171         int                     varRelid = PG_GETARG_INT32(3);
172         VariableStatData vardata;
173         Node       *other;
174         bool            varonleft;
175         Datum      *values;
176         int                     nvalues;
177         float4     *numbers;
178         int                     nnumbers;
179         double          selec;
180
181         /*
182          * If expression is not variable = something or something = variable, then
183          * punt and return a default estimate.
184          */
185         if (!get_restriction_variable(root, args, varRelid,
186                                                                   &vardata, &other, &varonleft))
187                 PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
188
189         /*
190          * If the something is a NULL constant, assume operator is strict and
191          * return zero, ie, operator will never return TRUE.
192          */
193         if (IsA(other, Const) &&
194                 ((Const *) other)->constisnull)
195         {
196                 ReleaseVariableStats(vardata);
197                 PG_RETURN_FLOAT8(0.0);
198         }
199
200         if (HeapTupleIsValid(vardata.statsTuple))
201         {
202                 Form_pg_statistic stats;
203
204                 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
205
206                 if (IsA(other, Const))
207                 {
208                         /* Variable is being compared to a known non-null constant */
209                         Datum           constval = ((Const *) other)->constvalue;
210                         bool            match = false;
211                         int                     i;
212
213                         /*
214                          * Is the constant "=" to any of the column's most common values?
215                          * (Although the given operator may not really be "=", we will
216                          * assume that seeing whether it returns TRUE is an appropriate
217                          * test.  If you don't like this, maybe you shouldn't be using
218                          * eqsel for your operator...)
219                          */
220                         if (get_attstatsslot(vardata.statsTuple,
221                                                                  vardata.atttype, vardata.atttypmod,
222                                                                  STATISTIC_KIND_MCV, InvalidOid,
223                                                                  &values, &nvalues,
224                                                                  &numbers, &nnumbers))
225                         {
226                                 FmgrInfo        eqproc;
227
228                                 fmgr_info(get_opcode(operator), &eqproc);
229
230                                 for (i = 0; i < nvalues; i++)
231                                 {
232                                         /* be careful to apply operator right way 'round */
233                                         if (varonleft)
234                                                 match = DatumGetBool(FunctionCall2(&eqproc,
235                                                                                                                    values[i],
236                                                                                                                    constval));
237                                         else
238                                                 match = DatumGetBool(FunctionCall2(&eqproc,
239                                                                                                                    constval,
240                                                                                                                    values[i]));
241                                         if (match)
242                                                 break;
243                                 }
244                         }
245                         else
246                         {
247                                 /* no most-common-value info available */
248                                 values = NULL;
249                                 numbers = NULL;
250                                 i = nvalues = nnumbers = 0;
251                         }
252
253                         if (match)
254                         {
255                                 /*
256                                  * Constant is "=" to this common value.  We know selectivity
257                                  * exactly (or as exactly as VACUUM could calculate it,
258                                  * anyway).
259                                  */
260                                 selec = numbers[i];
261                         }
262                         else
263                         {
264                                 /*
265                                  * Comparison is against a constant that is neither NULL nor
266                                  * any of the common values.  Its selectivity cannot be more
267                                  * than this:
268                                  */
269                                 double          sumcommon = 0.0;
270                                 double          otherdistinct;
271
272                                 for (i = 0; i < nnumbers; i++)
273                                         sumcommon += numbers[i];
274                                 selec = 1.0 - sumcommon - stats->stanullfrac;
275                                 CLAMP_PROBABILITY(selec);
276
277                                 /*
278                                  * and in fact it's probably a good deal less. We approximate
279                                  * that all the not-common values share this remaining
280                                  * fraction equally, so we divide by the number of other
281                                  * distinct values.
282                                  */
283                                 otherdistinct = get_variable_numdistinct(&vardata)
284                                         - nnumbers;
285                                 if (otherdistinct > 1)
286                                         selec /= otherdistinct;
287
288                                 /*
289                                  * Another cross-check: selectivity shouldn't be estimated as
290                                  * more than the least common "most common value".
291                                  */
292                                 if (nnumbers > 0 && selec > numbers[nnumbers - 1])
293                                         selec = numbers[nnumbers - 1];
294                         }
295
296                         free_attstatsslot(vardata.atttype, values, nvalues,
297                                                           numbers, nnumbers);
298                 }
299                 else
300                 {
301                         double          ndistinct;
302
303                         /*
304                          * Search is for a value that we do not know a priori, but we will
305                          * assume it is not NULL.  Estimate the selectivity as non-null
306                          * fraction divided by number of distinct values, so that we get a
307                          * result averaged over all possible values whether common or
308                          * uncommon.  (Essentially, we are assuming that the not-yet-known
309                          * comparison value is equally likely to be any of the possible
310                          * values, regardless of their frequency in the table.  Is that a
311                          * good idea?)
312                          */
313                         selec = 1.0 - stats->stanullfrac;
314                         ndistinct = get_variable_numdistinct(&vardata);
315                         if (ndistinct > 1)
316                                 selec /= ndistinct;
317
318                         /*
319                          * Cross-check: selectivity should never be estimated as more than
320                          * the most common value's.
321                          */
322                         if (get_attstatsslot(vardata.statsTuple,
323                                                                  vardata.atttype, vardata.atttypmod,
324                                                                  STATISTIC_KIND_MCV, InvalidOid,
325                                                                  NULL, NULL,
326                                                                  &numbers, &nnumbers))
327                         {
328                                 if (nnumbers > 0 && selec > numbers[0])
329                                         selec = numbers[0];
330                                 free_attstatsslot(vardata.atttype, NULL, 0, numbers, nnumbers);
331                         }
332                 }
333         }
334         else
335         {
336                 /*
337                  * No VACUUM ANALYZE stats available, so make a guess using estimated
338                  * number of distinct values and assuming they are equally common.
339                  * (The guess is unlikely to be very good, but we do know a few
340                  * special cases.)
341                  */
342                 selec = 1.0 / get_variable_numdistinct(&vardata);
343         }
344
345         ReleaseVariableStats(vardata);
346
347         /* result should be in range, but make sure... */
348         CLAMP_PROBABILITY(selec);
349
350         PG_RETURN_FLOAT8((float8) selec);
351 }
352
353 /*
354  *              neqsel                  - Selectivity of "!=" for any data types.
355  *
356  * This routine is also used for some operators that are not "!="
357  * but have comparable selectivity behavior.  See above comments
358  * for eqsel().
359  */
360 Datum
361 neqsel(PG_FUNCTION_ARGS)
362 {
363         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
364         Oid                     operator = PG_GETARG_OID(1);
365         List       *args = (List *) PG_GETARG_POINTER(2);
366         int                     varRelid = PG_GETARG_INT32(3);
367         Oid                     eqop;
368         float8          result;
369
370         /*
371          * We want 1 - eqsel() where the equality operator is the one associated
372          * with this != operator, that is, its negator.
373          */
374         eqop = get_negator(operator);
375         if (eqop)
376         {
377                 result = DatumGetFloat8(DirectFunctionCall4(eqsel,
378                                                                                                         PointerGetDatum(root),
379                                                                                                         ObjectIdGetDatum(eqop),
380                                                                                                         PointerGetDatum(args),
381                                                                                                         Int32GetDatum(varRelid)));
382         }
383         else
384         {
385                 /* Use default selectivity (should we raise an error instead?) */
386                 result = DEFAULT_EQ_SEL;
387         }
388         result = 1.0 - result;
389         PG_RETURN_FLOAT8(result);
390 }
391
392 /*
393  *      scalarineqsel           - Selectivity of "<", "<=", ">", ">=" for scalars.
394  *
395  * This is the guts of both scalarltsel and scalargtsel.  The caller has
396  * commuted the clause, if necessary, so that we can treat the variable as
397  * being on the left.  The caller must also make sure that the other side
398  * of the clause is a non-null Const, and dissect same into a value and
399  * datatype.
400  *
401  * This routine works for any datatype (or pair of datatypes) known to
402  * convert_to_scalar().  If it is applied to some other datatype,
403  * it will return a default estimate.
404  */
405 static double
406 scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
407                           VariableStatData *vardata, Datum constval, Oid consttype)
408 {
409         Form_pg_statistic stats;
410         FmgrInfo        opproc;
411         double          mcv_selec,
412                                 hist_selec,
413                                 sumcommon;
414         double          selec;
415
416         if (!HeapTupleIsValid(vardata->statsTuple))
417         {
418                 /* no stats available, so default result */
419                 return DEFAULT_INEQ_SEL;
420         }
421         stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
422
423         fmgr_info(get_opcode(operator), &opproc);
424
425         /*
426          * If we have most-common-values info, add up the fractions of the MCV
427          * entries that satisfy MCV OP CONST.  These fractions contribute directly
428          * to the result selectivity.  Also add up the total fraction represented
429          * by MCV entries.
430          */
431         mcv_selec = mcv_selectivity(vardata, &opproc, constval,
432                                                                 &sumcommon);
433
434         /*
435          * If there is a histogram, determine which bin the constant falls in, and
436          * compute the resulting contribution to selectivity.
437          */
438         hist_selec = ineq_histogram_selectivity(vardata, &opproc, isgt,
439                                                                                         constval, consttype);
440
441         /*
442          * Now merge the results from the MCV and histogram calculations,
443          * realizing that the histogram covers only the non-null values that are
444          * not listed in MCV.
445          */
446         selec = 1.0 - stats->stanullfrac - sumcommon;
447
448         if (hist_selec > 0.0)
449                 selec *= hist_selec;
450         else
451         {
452                 /*
453                  * If no histogram but there are values not accounted for by MCV,
454                  * arbitrarily assume half of them will match.
455                  */
456                 selec *= 0.5;
457         }
458
459         selec += mcv_selec;
460
461         /* result should be in range, but make sure... */
462         CLAMP_PROBABILITY(selec);
463
464         return selec;
465 }
466
467 /*
468  *      mcv_selectivity                         - Examine the MCV list for scalarineqsel
469  *
470  * Determine the fraction of the variable's MCV population that satisfies
471  * the predicate (VAR OP CONST), as well as the fraction of the total column
472  * population represented by the MCV list.  This code will work for any
473  * boolean-returning predicate operator.
474  *
475  * The function result is the MCV selectivity, and the fraction of the
476  * total population is returned into *sumcommonp.  Zeroes are returned
477  * if there is no MCV list.
478  */
479 static double
480 mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc, Datum constval,
481                                 double *sumcommonp)
482 {
483         double          mcv_selec,
484                                 sumcommon;
485         Datum      *values;
486         int                     nvalues;
487         float4     *numbers;
488         int                     nnumbers;
489         int                     i;
490
491         mcv_selec = 0.0;
492         sumcommon = 0.0;
493
494         /*
495          * If we have most-common-values info, add up the fractions of the MCV
496          * entries that satisfy MCV OP CONST.  Also add up the total fraction
497          * represented by MCV entries.
498          */
499         if (HeapTupleIsValid(vardata->statsTuple) &&
500                 get_attstatsslot(vardata->statsTuple,
501                                                  vardata->atttype, vardata->atttypmod,
502                                                  STATISTIC_KIND_MCV, InvalidOid,
503                                                  &values, &nvalues,
504                                                  &numbers, &nnumbers))
505         {
506                 for (i = 0; i < nvalues; i++)
507                 {
508                         if (DatumGetBool(FunctionCall2(opproc,
509                                                                                    values[i],
510                                                                                    constval)))
511                                 mcv_selec += numbers[i];
512                         sumcommon += numbers[i];
513                 }
514                 free_attstatsslot(vardata->atttype, values, nvalues,
515                                                   numbers, nnumbers);
516         }
517
518         *sumcommonp = sumcommon;
519         return mcv_selec;
520 }
521
522 /*
523  *      ineq_histogram_selectivity      - Examine the histogram for scalarineqsel
524  *
525  * Determine the fraction of the variable's histogram population that
526  * satisfies the inequality condition, ie, VAR < CONST or VAR > CONST.
527  *
528  * Returns zero if there is no histogram (valid results will always be
529  * greater than zero).
530  *
531  * Note that the result disregards both the most-common-values (if any) and
532  * null entries.  The caller is expected to combine this result with
533  * statistics for those portions of the column population.
534  */
535 static double
536 ineq_histogram_selectivity(VariableStatData *vardata,
537                                                    FmgrInfo *opproc, bool isgt,
538                                                    Datum constval, Oid consttype)
539 {
540         double          hist_selec;
541         Datum      *values;
542         int                     nvalues;
543         int                     i;
544
545         hist_selec = 0.0;
546
547         /*
548          * Someday, VACUUM might store more than one histogram per rel/att,
549          * corresponding to more than one possible sort ordering defined for the
550          * column type.  However, to make that work we will need to figure out
551          * which staop to search for --- it's not necessarily the one we have at
552          * hand!  (For example, we might have a '<=' operator rather than the '<'
553          * operator that will appear in staop.)  For now, assume that whatever
554          * appears in pg_statistic is sorted the same way our operator sorts, or
555          * the reverse way if isgt is TRUE.
556          */
557         if (HeapTupleIsValid(vardata->statsTuple) &&
558                 get_attstatsslot(vardata->statsTuple,
559                                                  vardata->atttype, vardata->atttypmod,
560                                                  STATISTIC_KIND_HISTOGRAM, InvalidOid,
561                                                  &values, &nvalues,
562                                                  NULL, NULL))
563         {
564                 if (nvalues > 1)
565                 {
566                         double          histfrac;
567                         bool            ltcmp;
568
569                         ltcmp = DatumGetBool(FunctionCall2(opproc,
570                                                                                            values[0],
571                                                                                            constval));
572                         if (isgt)
573                                 ltcmp = !ltcmp;
574                         if (!ltcmp)
575                         {
576                                 /* Constant is below lower histogram boundary. */
577                                 histfrac = 0.0;
578                         }
579                         else
580                         {
581                                 /*
582                                  * Scan to find proper location.  This could be made faster by
583                                  * using a binary-search method, but it's probably not worth
584                                  * the trouble for typical histogram sizes.
585                                  */
586                                 for (i = 1; i < nvalues; i++)
587                                 {
588                                         ltcmp = DatumGetBool(FunctionCall2(opproc,
589                                                                                                            values[i],
590                                                                                                            constval));
591                                         if (isgt)
592                                                 ltcmp = !ltcmp;
593                                         if (!ltcmp)
594                                                 break;
595                                 }
596                                 if (i >= nvalues)
597                                 {
598                                         /* Constant is above upper histogram boundary. */
599                                         histfrac = 1.0;
600                                 }
601                                 else
602                                 {
603                                         double          val,
604                                                                 high,
605                                                                 low;
606                                         double          binfrac;
607
608                                         /*
609                                          * We have values[i-1] < constant < values[i].
610                                          *
611                                          * Convert the constant and the two nearest bin boundary
612                                          * values to a uniform comparison scale, and do a linear
613                                          * interpolation within this bin.
614                                          */
615                                         if (convert_to_scalar(constval, consttype, &val,
616                                                                                   values[i - 1], values[i],
617                                                                                   vardata->vartype,
618                                                                                   &low, &high))
619                                         {
620                                                 if (high <= low)
621                                                 {
622                                                         /* cope if bin boundaries appear identical */
623                                                         binfrac = 0.5;
624                                                 }
625                                                 else if (val <= low)
626                                                         binfrac = 0.0;
627                                                 else if (val >= high)
628                                                         binfrac = 1.0;
629                                                 else
630                                                 {
631                                                         binfrac = (val - low) / (high - low);
632
633                                                         /*
634                                                          * Watch out for the possibility that we got a NaN
635                                                          * or Infinity from the division.  This can happen
636                                                          * despite the previous checks, if for example
637                                                          * "low" is -Infinity.
638                                                          */
639                                                         if (isnan(binfrac) ||
640                                                                 binfrac < 0.0 || binfrac > 1.0)
641                                                                 binfrac = 0.5;
642                                                 }
643                                         }
644                                         else
645                                         {
646                                                 /*
647                                                  * Ideally we'd produce an error here, on the grounds
648                                                  * that the given operator shouldn't have scalarXXsel
649                                                  * registered as its selectivity func unless we can
650                                                  * deal with its operand types.  But currently, all
651                                                  * manner of stuff is invoking scalarXXsel, so give a
652                                                  * default estimate until that can be fixed.
653                                                  */
654                                                 binfrac = 0.5;
655                                         }
656
657                                         /*
658                                          * Now, compute the overall selectivity across the values
659                                          * represented by the histogram.  We have i-1 full bins
660                                          * and binfrac partial bin below the constant.
661                                          */
662                                         histfrac = (double) (i - 1) + binfrac;
663                                         histfrac /= (double) (nvalues - 1);
664                                 }
665                         }
666
667                         /*
668                          * Now histfrac = fraction of histogram entries below the
669                          * constant.
670                          *
671                          * Account for "<" vs ">"
672                          */
673                         hist_selec = isgt ? (1.0 - histfrac) : histfrac;
674
675                         /*
676                          * The histogram boundaries are only approximate to begin with,
677                          * and may well be out of date anyway.  Therefore, don't believe
678                          * extremely small or large selectivity estimates.
679                          */
680                         if (hist_selec < 0.0001)
681                                 hist_selec = 0.0001;
682                         else if (hist_selec > 0.9999)
683                                 hist_selec = 0.9999;
684                 }
685
686                 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
687         }
688
689         return hist_selec;
690 }
691
692 /*
693  *              scalarltsel             - Selectivity of "<" (also "<=") for scalars.
694  */
695 Datum
696 scalarltsel(PG_FUNCTION_ARGS)
697 {
698         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
699         Oid                     operator = PG_GETARG_OID(1);
700         List       *args = (List *) PG_GETARG_POINTER(2);
701         int                     varRelid = PG_GETARG_INT32(3);
702         VariableStatData vardata;
703         Node       *other;
704         bool            varonleft;
705         Datum           constval;
706         Oid                     consttype;
707         bool            isgt;
708         double          selec;
709
710         /*
711          * If expression is not variable op something or something op variable,
712          * then punt and return a default estimate.
713          */
714         if (!get_restriction_variable(root, args, varRelid,
715                                                                   &vardata, &other, &varonleft))
716                 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
717
718         /*
719          * Can't do anything useful if the something is not a constant, either.
720          */
721         if (!IsA(other, Const))
722         {
723                 ReleaseVariableStats(vardata);
724                 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
725         }
726
727         /*
728          * If the constant is NULL, assume operator is strict and return zero, ie,
729          * operator will never return TRUE.
730          */
731         if (((Const *) other)->constisnull)
732         {
733                 ReleaseVariableStats(vardata);
734                 PG_RETURN_FLOAT8(0.0);
735         }
736         constval = ((Const *) other)->constvalue;
737         consttype = ((Const *) other)->consttype;
738
739         /*
740          * Force the var to be on the left to simplify logic in scalarineqsel.
741          */
742         if (varonleft)
743         {
744                 /* we have var < other */
745                 isgt = false;
746         }
747         else
748         {
749                 /* we have other < var, commute to make var > other */
750                 operator = get_commutator(operator);
751                 if (!operator)
752                 {
753                         /* Use default selectivity (should we raise an error instead?) */
754                         ReleaseVariableStats(vardata);
755                         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
756                 }
757                 isgt = true;
758         }
759
760         selec = scalarineqsel(root, operator, isgt, &vardata, constval, consttype);
761
762         ReleaseVariableStats(vardata);
763
764         PG_RETURN_FLOAT8((float8) selec);
765 }
766
767 /*
768  *              scalargtsel             - Selectivity of ">" (also ">=") for integers.
769  */
770 Datum
771 scalargtsel(PG_FUNCTION_ARGS)
772 {
773         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
774         Oid                     operator = PG_GETARG_OID(1);
775         List       *args = (List *) PG_GETARG_POINTER(2);
776         int                     varRelid = PG_GETARG_INT32(3);
777         VariableStatData vardata;
778         Node       *other;
779         bool            varonleft;
780         Datum           constval;
781         Oid                     consttype;
782         bool            isgt;
783         double          selec;
784
785         /*
786          * If expression is not variable op something or something op variable,
787          * then punt and return a default estimate.
788          */
789         if (!get_restriction_variable(root, args, varRelid,
790                                                                   &vardata, &other, &varonleft))
791                 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
792
793         /*
794          * Can't do anything useful if the something is not a constant, either.
795          */
796         if (!IsA(other, Const))
797         {
798                 ReleaseVariableStats(vardata);
799                 PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
800         }
801
802         /*
803          * If the constant is NULL, assume operator is strict and return zero, ie,
804          * operator will never return TRUE.
805          */
806         if (((Const *) other)->constisnull)
807         {
808                 ReleaseVariableStats(vardata);
809                 PG_RETURN_FLOAT8(0.0);
810         }
811         constval = ((Const *) other)->constvalue;
812         consttype = ((Const *) other)->consttype;
813
814         /*
815          * Force the var to be on the left to simplify logic in scalarineqsel.
816          */
817         if (varonleft)
818         {
819                 /* we have var > other */
820                 isgt = true;
821         }
822         else
823         {
824                 /* we have other > var, commute to make var < other */
825                 operator = get_commutator(operator);
826                 if (!operator)
827                 {
828                         /* Use default selectivity (should we raise an error instead?) */
829                         ReleaseVariableStats(vardata);
830                         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
831                 }
832                 isgt = false;
833         }
834
835         selec = scalarineqsel(root, operator, isgt, &vardata, constval, consttype);
836
837         ReleaseVariableStats(vardata);
838
839         PG_RETURN_FLOAT8((float8) selec);
840 }
841
842 /*
843  * patternsel                   - Generic code for pattern-match selectivity.
844  */
845 static double
846 patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
847 {
848         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
849         Oid                     operator = PG_GETARG_OID(1);
850         List       *args = (List *) PG_GETARG_POINTER(2);
851         int                     varRelid = PG_GETARG_INT32(3);
852         VariableStatData vardata;
853         Node       *variable;
854         Node       *other;
855         bool            varonleft;
856         Datum           constval;
857         Oid                     consttype;
858         Oid                     vartype;
859         Oid                     opclass;
860         Pattern_Prefix_Status pstatus;
861         Const      *patt = NULL;
862         Const      *prefix = NULL;
863         Const      *rest = NULL;
864         double          result;
865
866         /*
867          * If expression is not variable op constant, then punt and return a
868          * default estimate.
869          */
870         if (!get_restriction_variable(root, args, varRelid,
871                                                                   &vardata, &other, &varonleft))
872                 return DEFAULT_MATCH_SEL;
873         if (!varonleft || !IsA(other, Const))
874         {
875                 ReleaseVariableStats(vardata);
876                 return DEFAULT_MATCH_SEL;
877         }
878         variable = (Node *) linitial(args);
879
880         /*
881          * If the constant is NULL, assume operator is strict and return zero, ie,
882          * operator will never return TRUE.
883          */
884         if (((Const *) other)->constisnull)
885         {
886                 ReleaseVariableStats(vardata);
887                 return 0.0;
888         }
889         constval = ((Const *) other)->constvalue;
890         consttype = ((Const *) other)->consttype;
891
892         /*
893          * The right-hand const is type text or bytea for all supported operators.
894          * We do not expect to see binary-compatible types here, since
895          * const-folding should have relabeled the const to exactly match the
896          * operator's declared type.
897          */
898         if (consttype != TEXTOID && consttype != BYTEAOID)
899         {
900                 ReleaseVariableStats(vardata);
901                 return DEFAULT_MATCH_SEL;
902         }
903
904         /*
905          * Similarly, the exposed type of the left-hand side should be one of
906          * those we know.  (Do not look at vardata.atttype, which might be
907          * something binary-compatible but different.)  We can use it to choose
908          * the index opclass from which we must draw the comparison operators.
909          *
910          * NOTE: It would be more correct to use the PATTERN opclasses than the
911          * simple ones, but at the moment ANALYZE will not generate statistics for
912          * the PATTERN operators.  But our results are so approximate anyway that
913          * it probably hardly matters.
914          */
915         vartype = vardata.vartype;
916
917         switch (vartype)
918         {
919                 case TEXTOID:
920                         opclass = TEXT_BTREE_OPS_OID;
921                         break;
922                 case VARCHAROID:
923                         opclass = VARCHAR_BTREE_OPS_OID;
924                         break;
925                 case BPCHAROID:
926                         opclass = BPCHAR_BTREE_OPS_OID;
927                         break;
928                 case NAMEOID:
929                         opclass = NAME_BTREE_OPS_OID;
930                         break;
931                 case BYTEAOID:
932                         opclass = BYTEA_BTREE_OPS_OID;
933                         break;
934                 default:
935                         ReleaseVariableStats(vardata);
936                         return DEFAULT_MATCH_SEL;
937         }
938
939         /* divide pattern into fixed prefix and remainder */
940         patt = (Const *) other;
941         pstatus = pattern_fixed_prefix(patt, ptype, &prefix, &rest);
942
943         /*
944          * If necessary, coerce the prefix constant to the right type. (The "rest"
945          * constant need not be changed.)
946          */
947         if (prefix && prefix->consttype != vartype)
948         {
949                 char       *prefixstr;
950
951                 switch (prefix->consttype)
952                 {
953                         case TEXTOID:
954                                 prefixstr = DatumGetCString(DirectFunctionCall1(textout,
955                                                                                                                 prefix->constvalue));
956                                 break;
957                         case BYTEAOID:
958                                 prefixstr = DatumGetCString(DirectFunctionCall1(byteaout,
959                                                                                                                 prefix->constvalue));
960                                 break;
961                         default:
962                                 elog(ERROR, "unrecognized consttype: %u",
963                                          prefix->consttype);
964                                 ReleaseVariableStats(vardata);
965                                 return DEFAULT_MATCH_SEL;
966                 }
967                 prefix = string_to_const(prefixstr, vartype);
968                 pfree(prefixstr);
969         }
970
971         if (pstatus == Pattern_Prefix_Exact)
972         {
973                 /*
974                  * Pattern specifies an exact match, so pretend operator is '='
975                  */
976                 Oid                     eqopr = get_opclass_member(opclass, InvalidOid,
977                                                                                            BTEqualStrategyNumber);
978                 List       *eqargs;
979
980                 if (eqopr == InvalidOid)
981                         elog(ERROR, "no = operator for opclass %u", opclass);
982                 eqargs = list_make2(variable, prefix);
983                 result = DatumGetFloat8(DirectFunctionCall4(eqsel,
984                                                                                                         PointerGetDatum(root),
985                                                                                                         ObjectIdGetDatum(eqopr),
986                                                                                                         PointerGetDatum(eqargs),
987                                                                                                         Int32GetDatum(varRelid)));
988         }
989         else
990         {
991                 /*
992                  * Not exact-match pattern.  We estimate selectivity of the fixed
993                  * prefix and remainder of pattern separately, then combine the two
994                  * to get an estimate of the selectivity for the part of the column
995                  * population represented by the histogram.  We then add up data for
996                  * any most-common-values values; these are not in the histogram
997                  * population, and we can get exact answers for them by applying
998                  * the pattern operator, so there's no reason to approximate.
999                  * (If the MCVs cover a significant part of the total population,
1000                  * this gives us a big leg up in accuracy.)
1001                  */
1002                 Selectivity prefixsel;
1003                 Selectivity restsel;
1004                 Selectivity selec;
1005                 FmgrInfo        opproc;
1006                 double          nullfrac,
1007                                         mcv_selec,
1008                                         sumcommon;
1009
1010                 if (HeapTupleIsValid(vardata.statsTuple))
1011                         nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac;
1012                 else
1013                         nullfrac = 0.0;
1014
1015                 if (pstatus == Pattern_Prefix_Partial)
1016                         prefixsel = prefix_selectivity(&vardata, opclass, prefix);
1017                 else
1018                         prefixsel = 1.0;
1019                 restsel = pattern_selectivity(rest, ptype);
1020                 selec = prefixsel * restsel;
1021
1022                 /*
1023                  * If we have most-common-values info, add up the fractions of the MCV
1024                  * entries that satisfy MCV OP PATTERN.  These fractions contribute
1025                  * directly to the result selectivity.  Also add up the total fraction
1026                  * represented by MCV entries.
1027                  */
1028                 fmgr_info(get_opcode(operator), &opproc);
1029                 mcv_selec = mcv_selectivity(&vardata, &opproc, constval,
1030                                                                         &sumcommon);
1031
1032                 /*
1033                  * Now merge the results from the MCV and histogram calculations,
1034                  * realizing that the histogram covers only the non-null values that
1035                  * are not listed in MCV.
1036                  */
1037                 selec *= 1.0 - nullfrac - sumcommon;
1038                 selec += mcv_selec;
1039
1040                 /* result should be in range, but make sure... */
1041                 CLAMP_PROBABILITY(selec);
1042                 result = selec;
1043         }
1044
1045         if (prefix)
1046         {
1047                 pfree(DatumGetPointer(prefix->constvalue));
1048                 pfree(prefix);
1049         }
1050
1051         ReleaseVariableStats(vardata);
1052
1053         return result;
1054 }
1055
1056 /*
1057  *              regexeqsel              - Selectivity of regular-expression pattern match.
1058  */
1059 Datum
1060 regexeqsel(PG_FUNCTION_ARGS)
1061 {
1062         PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex));
1063 }
1064
1065 /*
1066  *              icregexeqsel    - Selectivity of case-insensitive regex match.
1067  */
1068 Datum
1069 icregexeqsel(PG_FUNCTION_ARGS)
1070 {
1071         PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC));
1072 }
1073
1074 /*
1075  *              likesel                 - Selectivity of LIKE pattern match.
1076  */
1077 Datum
1078 likesel(PG_FUNCTION_ARGS)
1079 {
1080         PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like));
1081 }
1082
1083 /*
1084  *              iclikesel                       - Selectivity of ILIKE pattern match.
1085  */
1086 Datum
1087 iclikesel(PG_FUNCTION_ARGS)
1088 {
1089         PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC));
1090 }
1091
1092 /*
1093  *              regexnesel              - Selectivity of regular-expression pattern non-match.
1094  */
1095 Datum
1096 regexnesel(PG_FUNCTION_ARGS)
1097 {
1098         double          result;
1099
1100         result = patternsel(fcinfo, Pattern_Type_Regex);
1101         result = 1.0 - result;
1102         PG_RETURN_FLOAT8(result);
1103 }
1104
1105 /*
1106  *              icregexnesel    - Selectivity of case-insensitive regex non-match.
1107  */
1108 Datum
1109 icregexnesel(PG_FUNCTION_ARGS)
1110 {
1111         double          result;
1112
1113         result = patternsel(fcinfo, Pattern_Type_Regex_IC);
1114         result = 1.0 - result;
1115         PG_RETURN_FLOAT8(result);
1116 }
1117
1118 /*
1119  *              nlikesel                - Selectivity of LIKE pattern non-match.
1120  */
1121 Datum
1122 nlikesel(PG_FUNCTION_ARGS)
1123 {
1124         double          result;
1125
1126         result = patternsel(fcinfo, Pattern_Type_Like);
1127         result = 1.0 - result;
1128         PG_RETURN_FLOAT8(result);
1129 }
1130
1131 /*
1132  *              icnlikesel              - Selectivity of ILIKE pattern non-match.
1133  */
1134 Datum
1135 icnlikesel(PG_FUNCTION_ARGS)
1136 {
1137         double          result;
1138
1139         result = patternsel(fcinfo, Pattern_Type_Like_IC);
1140         result = 1.0 - result;
1141         PG_RETURN_FLOAT8(result);
1142 }
1143
1144 /*
1145  *              booltestsel             - Selectivity of BooleanTest Node.
1146  */
1147 Selectivity
1148 booltestsel(PlannerInfo *root, BoolTestType booltesttype, Node *arg,
1149                         int varRelid, JoinType jointype)
1150 {
1151         VariableStatData vardata;
1152         double          selec;
1153
1154         examine_variable(root, arg, varRelid, &vardata);
1155
1156         if (HeapTupleIsValid(vardata.statsTuple))
1157         {
1158                 Form_pg_statistic stats;
1159                 double          freq_null;
1160                 Datum      *values;
1161                 int                     nvalues;
1162                 float4     *numbers;
1163                 int                     nnumbers;
1164
1165                 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
1166                 freq_null = stats->stanullfrac;
1167
1168                 if (get_attstatsslot(vardata.statsTuple,
1169                                                          vardata.atttype, vardata.atttypmod,
1170                                                          STATISTIC_KIND_MCV, InvalidOid,
1171                                                          &values, &nvalues,
1172                                                          &numbers, &nnumbers)
1173                         && nnumbers > 0)
1174                 {
1175                         double          freq_true;
1176                         double          freq_false;
1177
1178                         /*
1179                          * Get first MCV frequency and derive frequency for true.
1180                          */
1181                         if (DatumGetBool(values[0]))
1182                                 freq_true = numbers[0];
1183                         else
1184                                 freq_true = 1.0 - numbers[0] - freq_null;
1185
1186                         /*
1187                          * Next derive frequency for false. Then use these as appropriate
1188                          * to derive frequency for each case.
1189                          */
1190                         freq_false = 1.0 - freq_true - freq_null;
1191
1192                         switch (booltesttype)
1193                         {
1194                                 case IS_UNKNOWN:
1195                                         /* select only NULL values */
1196                                         selec = freq_null;
1197                                         break;
1198                                 case IS_NOT_UNKNOWN:
1199                                         /* select non-NULL values */
1200                                         selec = 1.0 - freq_null;
1201                                         break;
1202                                 case IS_TRUE:
1203                                         /* select only TRUE values */
1204                                         selec = freq_true;
1205                                         break;
1206                                 case IS_NOT_TRUE:
1207                                         /* select non-TRUE values */
1208                                         selec = 1.0 - freq_true;
1209                                         break;
1210                                 case IS_FALSE:
1211                                         /* select only FALSE values */
1212                                         selec = freq_false;
1213                                         break;
1214                                 case IS_NOT_FALSE:
1215                                         /* select non-FALSE values */
1216                                         selec = 1.0 - freq_false;
1217                                         break;
1218                                 default:
1219                                         elog(ERROR, "unrecognized booltesttype: %d",
1220                                                  (int) booltesttype);
1221                                         selec = 0.0;    /* Keep compiler quiet */
1222                                         break;
1223                         }
1224
1225                         free_attstatsslot(vardata.atttype, values, nvalues,
1226                                                           numbers, nnumbers);
1227                 }
1228                 else
1229                 {
1230                         /*
1231                          * No most-common-value info available. Still have null fraction
1232                          * information, so use it for IS [NOT] UNKNOWN. Otherwise adjust
1233                          * for null fraction and assume an even split for boolean tests.
1234                          */
1235                         switch (booltesttype)
1236                         {
1237                                 case IS_UNKNOWN:
1238
1239                                         /*
1240                                          * Use freq_null directly.
1241                                          */
1242                                         selec = freq_null;
1243                                         break;
1244                                 case IS_NOT_UNKNOWN:
1245
1246                                         /*
1247                                          * Select not unknown (not null) values. Calculate from
1248                                          * freq_null.
1249                                          */
1250                                         selec = 1.0 - freq_null;
1251                                         break;
1252                                 case IS_TRUE:
1253                                 case IS_NOT_TRUE:
1254                                 case IS_FALSE:
1255                                 case IS_NOT_FALSE:
1256                                         selec = (1.0 - freq_null) / 2.0;
1257                                         break;
1258                                 default:
1259                                         elog(ERROR, "unrecognized booltesttype: %d",
1260                                                  (int) booltesttype);
1261                                         selec = 0.0;    /* Keep compiler quiet */
1262                                         break;
1263                         }
1264                 }
1265         }
1266         else
1267         {
1268                 /*
1269                  * If we can't get variable statistics for the argument, perhaps
1270                  * clause_selectivity can do something with it.  We ignore the
1271                  * possibility of a NULL value when using clause_selectivity, and just
1272                  * assume the value is either TRUE or FALSE.
1273                  */
1274                 switch (booltesttype)
1275                 {
1276                         case IS_UNKNOWN:
1277                                 selec = DEFAULT_UNK_SEL;
1278                                 break;
1279                         case IS_NOT_UNKNOWN:
1280                                 selec = DEFAULT_NOT_UNK_SEL;
1281                                 break;
1282                         case IS_TRUE:
1283                         case IS_NOT_FALSE:
1284                                 selec = (double) clause_selectivity(root, arg,
1285                                                                                                         varRelid, jointype);
1286                                 break;
1287                         case IS_FALSE:
1288                         case IS_NOT_TRUE:
1289                                 selec = 1.0 - (double) clause_selectivity(root, arg,
1290                                                                                                                   varRelid, jointype);
1291                                 break;
1292                         default:
1293                                 elog(ERROR, "unrecognized booltesttype: %d",
1294                                          (int) booltesttype);
1295                                 selec = 0.0;    /* Keep compiler quiet */
1296                                 break;
1297                 }
1298         }
1299
1300         ReleaseVariableStats(vardata);
1301
1302         /* result should be in range, but make sure... */
1303         CLAMP_PROBABILITY(selec);
1304
1305         return (Selectivity) selec;
1306 }
1307
1308 /*
1309  *              nulltestsel             - Selectivity of NullTest Node.
1310  */
1311 Selectivity
1312 nulltestsel(PlannerInfo *root, NullTestType nulltesttype,
1313                         Node *arg, int varRelid)
1314 {
1315         VariableStatData vardata;
1316         double          selec;
1317
1318         examine_variable(root, arg, varRelid, &vardata);
1319
1320         if (HeapTupleIsValid(vardata.statsTuple))
1321         {
1322                 Form_pg_statistic stats;
1323                 double          freq_null;
1324
1325                 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
1326                 freq_null = stats->stanullfrac;
1327
1328                 switch (nulltesttype)
1329                 {
1330                         case IS_NULL:
1331
1332                                 /*
1333                                  * Use freq_null directly.
1334                                  */
1335                                 selec = freq_null;
1336                                 break;
1337                         case IS_NOT_NULL:
1338
1339                                 /*
1340                                  * Select not unknown (not null) values. Calculate from
1341                                  * freq_null.
1342                                  */
1343                                 selec = 1.0 - freq_null;
1344                                 break;
1345                         default:
1346                                 elog(ERROR, "unrecognized nulltesttype: %d",
1347                                          (int) nulltesttype);
1348                                 return (Selectivity) 0; /* keep compiler quiet */
1349                 }
1350         }
1351         else
1352         {
1353                 /*
1354                  * No VACUUM ANALYZE stats available, so make a guess
1355                  */
1356                 switch (nulltesttype)
1357                 {
1358                         case IS_NULL:
1359                                 selec = DEFAULT_UNK_SEL;
1360                                 break;
1361                         case IS_NOT_NULL:
1362                                 selec = DEFAULT_NOT_UNK_SEL;
1363                                 break;
1364                         default:
1365                                 elog(ERROR, "unrecognized nulltesttype: %d",
1366                                          (int) nulltesttype);
1367                                 return (Selectivity) 0; /* keep compiler quiet */
1368                 }
1369         }
1370
1371         ReleaseVariableStats(vardata);
1372
1373         /* result should be in range, but make sure... */
1374         CLAMP_PROBABILITY(selec);
1375
1376         return (Selectivity) selec;
1377 }
1378
1379 /*
1380  *              scalararraysel          - Selectivity of ScalarArrayOpExpr Node.
1381  */
1382 Selectivity
1383 scalararraysel(PlannerInfo *root,
1384                            ScalarArrayOpExpr *clause,
1385                            bool is_join_clause,
1386                            int varRelid, JoinType jointype)
1387 {
1388         Oid                     operator = clause->opno;
1389         bool            useOr = clause->useOr;
1390         Node       *leftop;
1391         Node       *rightop;
1392         RegProcedure oprsel;
1393         FmgrInfo        oprselproc;
1394         Datum           selarg4;
1395         Selectivity     s1;
1396
1397         /*
1398          * First, look up the underlying operator's selectivity estimator.
1399          * Punt if it hasn't got one.
1400          */
1401         if (is_join_clause)
1402         {
1403                 oprsel = get_oprjoin(operator);
1404                 selarg4 = Int16GetDatum(jointype);
1405
1406         }
1407         else
1408         {
1409                 oprsel = get_oprrest(operator);
1410                 selarg4 = Int32GetDatum(varRelid);
1411         }
1412         if (!oprsel)
1413                 return (Selectivity) 0.5;
1414         fmgr_info(oprsel, &oprselproc);
1415
1416         /*
1417          * We consider three cases:
1418          *
1419          * 1. rightop is an Array constant: deconstruct the array, apply the
1420          * operator's selectivity function for each array element, and merge
1421          * the results in the same way that clausesel.c does for AND/OR
1422          * combinations.
1423          *
1424          * 2. rightop is an ARRAY[] construct: apply the operator's selectivity
1425          * function for each element of the ARRAY[] construct, and merge.
1426          *
1427          * 3. otherwise, make a guess ...
1428          */
1429         Assert(list_length(clause->args) == 2);
1430         leftop = (Node *) linitial(clause->args);
1431         rightop = (Node *) lsecond(clause->args);
1432
1433         if (rightop && IsA(rightop, Const))
1434         {
1435                 Datum           arraydatum = ((Const *) rightop)->constvalue;
1436                 bool            arrayisnull = ((Const *) rightop)->constisnull;
1437                 ArrayType  *arrayval;
1438                 int16           elmlen;
1439                 bool            elmbyval;
1440                 char            elmalign;
1441                 int                     num_elems;
1442                 Datum      *elem_values;
1443                 bool       *elem_nulls;
1444                 int                     i;
1445
1446                 if (arrayisnull)                /* qual can't succeed if null array */
1447                         return (Selectivity) 0.0;
1448                 arrayval = DatumGetArrayTypeP(arraydatum);
1449                 get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
1450                                                          &elmlen, &elmbyval, &elmalign);
1451                 deconstruct_array(arrayval,
1452                                                   ARR_ELEMTYPE(arrayval),
1453                                                   elmlen, elmbyval, elmalign,
1454                                                   &elem_values, &elem_nulls, &num_elems);
1455                 s1 = useOr ? 0.0 : 1.0;
1456                 for (i = 0; i < num_elems; i++)
1457                 {
1458                         List    *args;
1459                         Selectivity s2;
1460
1461                         args = list_make2(leftop,
1462                                                           makeConst(ARR_ELEMTYPE(arrayval),
1463                                                                                 elmlen,
1464                                                                                 elem_values[i],
1465                                                                                 elem_nulls[i],
1466                                                                                 elmbyval));
1467                         s2 = DatumGetFloat8(FunctionCall4(&oprselproc,
1468                                                                                           PointerGetDatum(root),
1469                                                                                           ObjectIdGetDatum(operator),
1470                                                                                           PointerGetDatum(args),
1471                                                                                           selarg4));
1472                         if (useOr)
1473                                 s1 = s1 + s2 - s1 * s2;
1474                         else
1475                                 s1 = s1 * s2;
1476                 }
1477         }
1478         else if (rightop && IsA(rightop, ArrayExpr) &&
1479                          !((ArrayExpr *) rightop)->multidims)
1480         {
1481                 ArrayExpr  *arrayexpr = (ArrayExpr *) rightop;
1482                 int16           elmlen;
1483                 bool            elmbyval;
1484                 ListCell   *l;
1485
1486                 get_typlenbyval(arrayexpr->element_typeid,
1487                                                 &elmlen, &elmbyval);
1488                 s1 = useOr ? 0.0 : 1.0;
1489                 foreach(l, arrayexpr->elements)
1490                 {
1491                         List    *args;
1492                         Selectivity s2;
1493
1494                         args = list_make2(leftop, lfirst(l));
1495                         s2 = DatumGetFloat8(FunctionCall4(&oprselproc,
1496                                                                                           PointerGetDatum(root),
1497                                                                                           ObjectIdGetDatum(operator),
1498                                                                                           PointerGetDatum(args),
1499                                                                                           selarg4));
1500                         if (useOr)
1501                                 s1 = s1 + s2 - s1 * s2;
1502                         else
1503                                 s1 = s1 * s2;
1504                 }
1505         }
1506         else
1507         {
1508                 CaseTestExpr *dummyexpr;
1509                 List    *args;
1510                 Selectivity s2;
1511                 int             i;
1512
1513                 /*
1514                  * We need a dummy rightop to pass to the operator selectivity
1515                  * routine.  It can be pretty much anything that doesn't look like
1516                  * a constant; CaseTestExpr is a convenient choice.
1517                  */
1518                 dummyexpr = makeNode(CaseTestExpr);
1519                 dummyexpr->typeId = get_element_type(exprType(rightop));
1520                 dummyexpr->typeMod = -1;
1521                 args = list_make2(leftop, dummyexpr);
1522                 s2 = DatumGetFloat8(FunctionCall4(&oprselproc,
1523                                                                                   PointerGetDatum(root),
1524                                                                                   ObjectIdGetDatum(operator),
1525                                                                                   PointerGetDatum(args),
1526                                                                                   selarg4));
1527                 s1 = useOr ? 0.0 : 1.0;
1528                 /*
1529                  * Arbitrarily assume 10 elements in the eventual array value
1530                  */
1531                 for (i = 0; i < 10; i++)
1532                 {
1533                         if (useOr)
1534                                 s1 = s1 + s2 - s1 * s2;
1535                         else
1536                                 s1 = s1 * s2;
1537                 }
1538         }
1539
1540         /* result should be in range, but make sure... */
1541         CLAMP_PROBABILITY(s1);
1542
1543         return s1;
1544 }
1545
1546 /*
1547  *              rowcomparesel           - Selectivity of RowCompareExpr Node.
1548  *
1549  * We estimate RowCompare selectivity by considering just the first (high
1550  * order) columns, which makes it equivalent to an ordinary OpExpr.  While
1551  * this estimate could be refined by considering additional columns, it
1552  * seems unlikely that we could do a lot better without multi-column
1553  * statistics.
1554  */
1555 Selectivity
1556 rowcomparesel(PlannerInfo *root,
1557                           RowCompareExpr *clause,
1558                           int varRelid, JoinType jointype)
1559 {
1560         Selectivity s1;
1561         Oid                     opno = linitial_oid(clause->opnos);
1562         List       *opargs;
1563         bool            is_join_clause;
1564
1565         /* Build equivalent arg list for single operator */
1566         opargs = list_make2(linitial(clause->largs), linitial(clause->rargs));
1567
1568         /* Decide if it's a join clause, same as for OpExpr */
1569         if (varRelid != 0)
1570         {
1571                 /*
1572                  * If we are considering a nestloop join then all clauses are
1573                  * restriction clauses, since we are only interested in the one
1574                  * relation.
1575                  */
1576                 is_join_clause = false;
1577         }
1578         else
1579         {
1580                 /*
1581                  * Otherwise, it's a join if there's more than one relation used.
1582                  * Notice we ignore the low-order columns here.
1583                  */
1584                 is_join_clause = (NumRelids((Node *) opargs) > 1);
1585         }
1586
1587         if (is_join_clause)
1588         {
1589                 /* Estimate selectivity for a join clause. */
1590                 s1 = join_selectivity(root, opno,
1591                                                           opargs,
1592                                                           jointype);
1593         }
1594         else
1595         {
1596                 /* Estimate selectivity for a restriction clause. */
1597                 s1 = restriction_selectivity(root, opno,
1598                                                                          opargs,
1599                                                                          varRelid);
1600         }
1601
1602         return s1;
1603 }
1604
1605 /*
1606  *              eqjoinsel               - Join selectivity of "="
1607  */
1608 Datum
1609 eqjoinsel(PG_FUNCTION_ARGS)
1610 {
1611         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
1612         Oid                     operator = PG_GETARG_OID(1);
1613         List       *args = (List *) PG_GETARG_POINTER(2);
1614         JoinType        jointype = (JoinType) PG_GETARG_INT16(3);
1615         double          selec;
1616         VariableStatData vardata1;
1617         VariableStatData vardata2;
1618         double          nd1;
1619         double          nd2;
1620         Form_pg_statistic stats1 = NULL;
1621         Form_pg_statistic stats2 = NULL;
1622         bool            have_mcvs1 = false;
1623         Datum      *values1 = NULL;
1624         int                     nvalues1 = 0;
1625         float4     *numbers1 = NULL;
1626         int                     nnumbers1 = 0;
1627         bool            have_mcvs2 = false;
1628         Datum      *values2 = NULL;
1629         int                     nvalues2 = 0;
1630         float4     *numbers2 = NULL;
1631         int                     nnumbers2 = 0;
1632
1633         get_join_variables(root, args, &vardata1, &vardata2);
1634
1635         nd1 = get_variable_numdistinct(&vardata1);
1636         nd2 = get_variable_numdistinct(&vardata2);
1637
1638         if (HeapTupleIsValid(vardata1.statsTuple))
1639         {
1640                 stats1 = (Form_pg_statistic) GETSTRUCT(vardata1.statsTuple);
1641                 have_mcvs1 = get_attstatsslot(vardata1.statsTuple,
1642                                                                           vardata1.atttype,
1643                                                                           vardata1.atttypmod,
1644                                                                           STATISTIC_KIND_MCV,
1645                                                                           InvalidOid,
1646                                                                           &values1, &nvalues1,
1647                                                                           &numbers1, &nnumbers1);
1648         }
1649
1650         if (HeapTupleIsValid(vardata2.statsTuple))
1651         {
1652                 stats2 = (Form_pg_statistic) GETSTRUCT(vardata2.statsTuple);
1653                 have_mcvs2 = get_attstatsslot(vardata2.statsTuple,
1654                                                                           vardata2.atttype,
1655                                                                           vardata2.atttypmod,
1656                                                                           STATISTIC_KIND_MCV,
1657                                                                           InvalidOid,
1658                                                                           &values2, &nvalues2,
1659                                                                           &numbers2, &nnumbers2);
1660         }
1661
1662         if (have_mcvs1 && have_mcvs2)
1663         {
1664                 /*
1665                  * We have most-common-value lists for both relations.  Run through
1666                  * the lists to see which MCVs actually join to each other with the
1667                  * given operator.      This allows us to determine the exact join
1668                  * selectivity for the portion of the relations represented by the MCV
1669                  * lists.  We still have to estimate for the remaining population, but
1670                  * in a skewed distribution this gives us a big leg up in accuracy.
1671                  * For motivation see the analysis in Y. Ioannidis and S.
1672                  * Christodoulakis, "On the propagation of errors in the size of join
1673                  * results", Technical Report 1018, Computer Science Dept., University
1674                  * of Wisconsin, Madison, March 1991 (available from ftp.cs.wisc.edu).
1675                  */
1676                 FmgrInfo        eqproc;
1677                 bool       *hasmatch1;
1678                 bool       *hasmatch2;
1679                 double          nullfrac1 = stats1->stanullfrac;
1680                 double          nullfrac2 = stats2->stanullfrac;
1681                 double          matchprodfreq,
1682                                         matchfreq1,
1683                                         matchfreq2,
1684                                         unmatchfreq1,
1685                                         unmatchfreq2,
1686                                         otherfreq1,
1687                                         otherfreq2,
1688                                         totalsel1,
1689                                         totalsel2;
1690                 int                     i,
1691                                         nmatches;
1692
1693                 fmgr_info(get_opcode(operator), &eqproc);
1694                 hasmatch1 = (bool *) palloc0(nvalues1 * sizeof(bool));
1695                 hasmatch2 = (bool *) palloc0(nvalues2 * sizeof(bool));
1696
1697                 /*
1698                  * If we are doing any variant of JOIN_IN, pretend all the values of
1699                  * the righthand relation are unique (ie, act as if it's been
1700                  * DISTINCT'd).
1701                  *
1702                  * NOTE: it might seem that we should unique-ify the lefthand input
1703                  * when considering JOIN_REVERSE_IN.  But this is not so, because the
1704                  * join clause we've been handed has not been commuted from the way
1705                  * the parser originally wrote it.      We know that the unique side of
1706                  * the IN clause is *always* on the right.
1707                  *
1708                  * NOTE: it would be dangerous to try to be smart about JOIN_LEFT or
1709                  * JOIN_RIGHT here, because we do not have enough information to
1710                  * determine which var is really on which side of the join. Perhaps
1711                  * someday we should pass in more information.
1712                  */
1713                 if (jointype == JOIN_IN ||
1714                         jointype == JOIN_REVERSE_IN ||
1715                         jointype == JOIN_UNIQUE_INNER ||
1716                         jointype == JOIN_UNIQUE_OUTER)
1717                 {
1718                         float4          oneovern = 1.0 / nd2;
1719
1720                         for (i = 0; i < nvalues2; i++)
1721                                 numbers2[i] = oneovern;
1722                         nullfrac2 = oneovern;
1723                 }
1724
1725                 /*
1726                  * Note we assume that each MCV will match at most one member of the
1727                  * other MCV list.      If the operator isn't really equality, there could
1728                  * be multiple matches --- but we don't look for them, both for speed
1729                  * and because the math wouldn't add up...
1730                  */
1731                 matchprodfreq = 0.0;
1732                 nmatches = 0;
1733                 for (i = 0; i < nvalues1; i++)
1734                 {
1735                         int                     j;
1736
1737                         for (j = 0; j < nvalues2; j++)
1738                         {
1739                                 if (hasmatch2[j])
1740                                         continue;
1741                                 if (DatumGetBool(FunctionCall2(&eqproc,
1742                                                                                            values1[i],
1743                                                                                            values2[j])))
1744                                 {
1745                                         hasmatch1[i] = hasmatch2[j] = true;
1746                                         matchprodfreq += numbers1[i] * numbers2[j];
1747                                         nmatches++;
1748                                         break;
1749                                 }
1750                         }
1751                 }
1752                 CLAMP_PROBABILITY(matchprodfreq);
1753                 /* Sum up frequencies of matched and unmatched MCVs */
1754                 matchfreq1 = unmatchfreq1 = 0.0;
1755                 for (i = 0; i < nvalues1; i++)
1756                 {
1757                         if (hasmatch1[i])
1758                                 matchfreq1 += numbers1[i];
1759                         else
1760                                 unmatchfreq1 += numbers1[i];
1761                 }
1762                 CLAMP_PROBABILITY(matchfreq1);
1763                 CLAMP_PROBABILITY(unmatchfreq1);
1764                 matchfreq2 = unmatchfreq2 = 0.0;
1765                 for (i = 0; i < nvalues2; i++)
1766                 {
1767                         if (hasmatch2[i])
1768                                 matchfreq2 += numbers2[i];
1769                         else
1770                                 unmatchfreq2 += numbers2[i];
1771                 }
1772                 CLAMP_PROBABILITY(matchfreq2);
1773                 CLAMP_PROBABILITY(unmatchfreq2);
1774                 pfree(hasmatch1);
1775                 pfree(hasmatch2);
1776
1777                 /*
1778                  * Compute total frequency of non-null values that are not in the MCV
1779                  * lists.
1780                  */
1781                 otherfreq1 = 1.0 - nullfrac1 - matchfreq1 - unmatchfreq1;
1782                 otherfreq2 = 1.0 - nullfrac2 - matchfreq2 - unmatchfreq2;
1783                 CLAMP_PROBABILITY(otherfreq1);
1784                 CLAMP_PROBABILITY(otherfreq2);
1785
1786                 /*
1787                  * We can estimate the total selectivity from the point of view of
1788                  * relation 1 as: the known selectivity for matched MCVs, plus
1789                  * unmatched MCVs that are assumed to match against random members of
1790                  * relation 2's non-MCV population, plus non-MCV values that are
1791                  * assumed to match against random members of relation 2's unmatched
1792                  * MCVs plus non-MCV values.
1793                  */
1794                 totalsel1 = matchprodfreq;
1795                 if (nd2 > nvalues2)
1796                         totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - nvalues2);
1797                 if (nd2 > nmatches)
1798                         totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) /
1799                                 (nd2 - nmatches);
1800                 /* Same estimate from the point of view of relation 2. */
1801                 totalsel2 = matchprodfreq;
1802                 if (nd1 > nvalues1)
1803                         totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - nvalues1);
1804                 if (nd1 > nmatches)
1805                         totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) /
1806                                 (nd1 - nmatches);
1807
1808                 /*
1809                  * Use the smaller of the two estimates.  This can be justified in
1810                  * essentially the same terms as given below for the no-stats case: to
1811                  * a first approximation, we are estimating from the point of view of
1812                  * the relation with smaller nd.
1813                  */
1814                 selec = (totalsel1 < totalsel2) ? totalsel1 : totalsel2;
1815         }
1816         else
1817         {
1818                 /*
1819                  * We do not have MCV lists for both sides.  Estimate the join
1820                  * selectivity as MIN(1/nd1,1/nd2)*(1-nullfrac1)*(1-nullfrac2). This
1821                  * is plausible if we assume that the join operator is strict and the
1822                  * non-null values are about equally distributed: a given non-null
1823                  * tuple of rel1 will join to either zero or N2*(1-nullfrac2)/nd2 rows
1824                  * of rel2, so total join rows are at most
1825                  * N1*(1-nullfrac1)*N2*(1-nullfrac2)/nd2 giving a join selectivity of
1826                  * not more than (1-nullfrac1)*(1-nullfrac2)/nd2. By the same logic it
1827                  * is not more than (1-nullfrac1)*(1-nullfrac2)/nd1, so the expression
1828                  * with MIN() is an upper bound.  Using the MIN() means we estimate
1829                  * from the point of view of the relation with smaller nd (since the
1830                  * larger nd is determining the MIN).  It is reasonable to assume that
1831                  * most tuples in this rel will have join partners, so the bound is
1832                  * probably reasonably tight and should be taken as-is.
1833                  *
1834                  * XXX Can we be smarter if we have an MCV list for just one side? It
1835                  * seems that if we assume equal distribution for the other side, we
1836                  * end up with the same answer anyway.
1837                  */
1838                 double          nullfrac1 = stats1 ? stats1->stanullfrac : 0.0;
1839                 double          nullfrac2 = stats2 ? stats2->stanullfrac : 0.0;
1840
1841                 selec = (1.0 - nullfrac1) * (1.0 - nullfrac2);
1842                 if (nd1 > nd2)
1843                         selec /= nd1;
1844                 else
1845                         selec /= nd2;
1846         }
1847
1848         if (have_mcvs1)
1849                 free_attstatsslot(vardata1.atttype, values1, nvalues1,
1850                                                   numbers1, nnumbers1);
1851         if (have_mcvs2)
1852                 free_attstatsslot(vardata2.atttype, values2, nvalues2,
1853                                                   numbers2, nnumbers2);
1854
1855         ReleaseVariableStats(vardata1);
1856         ReleaseVariableStats(vardata2);
1857
1858         CLAMP_PROBABILITY(selec);
1859
1860         PG_RETURN_FLOAT8((float8) selec);
1861 }
1862
1863 /*
1864  *              neqjoinsel              - Join selectivity of "!="
1865  */
1866 Datum
1867 neqjoinsel(PG_FUNCTION_ARGS)
1868 {
1869         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
1870         Oid                     operator = PG_GETARG_OID(1);
1871         List       *args = (List *) PG_GETARG_POINTER(2);
1872         JoinType        jointype = (JoinType) PG_GETARG_INT16(3);
1873         Oid                     eqop;
1874         float8          result;
1875
1876         /*
1877          * We want 1 - eqjoinsel() where the equality operator is the one
1878          * associated with this != operator, that is, its negator.
1879          */
1880         eqop = get_negator(operator);
1881         if (eqop)
1882         {
1883                 result = DatumGetFloat8(DirectFunctionCall4(eqjoinsel,
1884                                                                                                         PointerGetDatum(root),
1885                                                                                                         ObjectIdGetDatum(eqop),
1886                                                                                                         PointerGetDatum(args),
1887                                                                                                         Int16GetDatum(jointype)));
1888         }
1889         else
1890         {
1891                 /* Use default selectivity (should we raise an error instead?) */
1892                 result = DEFAULT_EQ_SEL;
1893         }
1894         result = 1.0 - result;
1895         PG_RETURN_FLOAT8(result);
1896 }
1897
1898 /*
1899  *              scalarltjoinsel - Join selectivity of "<" and "<=" for scalars
1900  */
1901 Datum
1902 scalarltjoinsel(PG_FUNCTION_ARGS)
1903 {
1904         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
1905 }
1906
1907 /*
1908  *              scalargtjoinsel - Join selectivity of ">" and ">=" for scalars
1909  */
1910 Datum
1911 scalargtjoinsel(PG_FUNCTION_ARGS)
1912 {
1913         PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
1914 }
1915
1916 /*
1917  *              regexeqjoinsel  - Join selectivity of regular-expression pattern match.
1918  */
1919 Datum
1920 regexeqjoinsel(PG_FUNCTION_ARGS)
1921 {
1922         PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
1923 }
1924
1925 /*
1926  *              icregexeqjoinsel        - Join selectivity of case-insensitive regex match.
1927  */
1928 Datum
1929 icregexeqjoinsel(PG_FUNCTION_ARGS)
1930 {
1931         PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
1932 }
1933
1934 /*
1935  *              likejoinsel                     - Join selectivity of LIKE pattern match.
1936  */
1937 Datum
1938 likejoinsel(PG_FUNCTION_ARGS)
1939 {
1940         PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
1941 }
1942
1943 /*
1944  *              iclikejoinsel                   - Join selectivity of ILIKE pattern match.
1945  */
1946 Datum
1947 iclikejoinsel(PG_FUNCTION_ARGS)
1948 {
1949         PG_RETURN_FLOAT8(DEFAULT_MATCH_SEL);
1950 }
1951
1952 /*
1953  *              regexnejoinsel  - Join selectivity of regex non-match.
1954  */
1955 Datum
1956 regexnejoinsel(PG_FUNCTION_ARGS)
1957 {
1958         float8          result;
1959
1960         result = DatumGetFloat8(regexeqjoinsel(fcinfo));
1961         result = 1.0 - result;
1962         PG_RETURN_FLOAT8(result);
1963 }
1964
1965 /*
1966  *              icregexnejoinsel        - Join selectivity of case-insensitive regex non-match.
1967  */
1968 Datum
1969 icregexnejoinsel(PG_FUNCTION_ARGS)
1970 {
1971         float8          result;
1972
1973         result = DatumGetFloat8(icregexeqjoinsel(fcinfo));
1974         result = 1.0 - result;
1975         PG_RETURN_FLOAT8(result);
1976 }
1977
1978 /*
1979  *              nlikejoinsel            - Join selectivity of LIKE pattern non-match.
1980  */
1981 Datum
1982 nlikejoinsel(PG_FUNCTION_ARGS)
1983 {
1984         float8          result;
1985
1986         result = DatumGetFloat8(likejoinsel(fcinfo));
1987         result = 1.0 - result;
1988         PG_RETURN_FLOAT8(result);
1989 }
1990
1991 /*
1992  *              icnlikejoinsel          - Join selectivity of ILIKE pattern non-match.
1993  */
1994 Datum
1995 icnlikejoinsel(PG_FUNCTION_ARGS)
1996 {
1997         float8          result;
1998
1999         result = DatumGetFloat8(iclikejoinsel(fcinfo));
2000         result = 1.0 - result;
2001         PG_RETURN_FLOAT8(result);
2002 }
2003
2004 /*
2005  * mergejoinscansel                     - Scan selectivity of merge join.
2006  *
2007  * A merge join will stop as soon as it exhausts either input stream.
2008  * Therefore, if we can estimate the ranges of both input variables,
2009  * we can estimate how much of the input will actually be read.  This
2010  * can have a considerable impact on the cost when using indexscans.
2011  *
2012  * clause should be a clause already known to be mergejoinable.
2013  *
2014  * *leftscan is set to the fraction of the left-hand variable expected
2015  * to be scanned (0 to 1), and similarly *rightscan for the right-hand
2016  * variable.
2017  */
2018 void
2019 mergejoinscansel(PlannerInfo *root, Node *clause,
2020                                  Selectivity *leftscan,
2021                                  Selectivity *rightscan)
2022 {
2023         Node       *left,
2024                            *right;
2025         VariableStatData leftvar,
2026                                 rightvar;
2027         Oid                     lefttype,
2028                                 righttype;
2029         Oid                     opno,
2030                                 lsortop,
2031                                 rsortop,
2032                                 ltop,
2033                                 gtop,
2034                                 leop,
2035                                 revgtop,
2036                                 revleop;
2037         Datum           leftmax,
2038                                 rightmax;
2039         double          selec;
2040
2041         /* Set default results if we can't figure anything out. */
2042         *leftscan = *rightscan = 1.0;
2043
2044         /* Deconstruct the merge clause */
2045         if (!is_opclause(clause))
2046                 return;                                 /* shouldn't happen */
2047         opno = ((OpExpr *) clause)->opno;
2048         left = get_leftop((Expr *) clause);
2049         right = get_rightop((Expr *) clause);
2050         if (!right)
2051                 return;                                 /* shouldn't happen */
2052
2053         /* Look for stats for the inputs */
2054         examine_variable(root, left, 0, &leftvar);
2055         examine_variable(root, right, 0, &rightvar);
2056
2057         /* Get the direct input types of the operator */
2058         lefttype = exprType(left);
2059         righttype = exprType(right);
2060
2061         /* Verify mergejoinability and get left and right "<" operators */
2062         if (!op_mergejoinable(opno,
2063                                                   &lsortop,
2064                                                   &rsortop))
2065                 goto fail;                              /* shouldn't happen */
2066
2067         /* Try to get maximum values of both inputs */
2068         if (!get_variable_maximum(root, &leftvar, lsortop, &leftmax))
2069                 goto fail;                              /* no max available from stats */
2070
2071         if (!get_variable_maximum(root, &rightvar, rsortop, &rightmax))
2072                 goto fail;                              /* no max available from stats */
2073
2074         /* Look up the "left < right" and "left > right" operators */
2075         op_mergejoin_crossops(opno, &ltop, &gtop, NULL, NULL);
2076
2077         /* Look up the "left <= right" operator */
2078         leop = get_negator(gtop);
2079         if (!OidIsValid(leop))
2080                 goto fail;                              /* insufficient info in catalogs */
2081
2082         /* Look up the "right > left" operator */
2083         revgtop = get_commutator(ltop);
2084         if (!OidIsValid(revgtop))
2085                 goto fail;                              /* insufficient info in catalogs */
2086
2087         /* Look up the "right <= left" operator */
2088         revleop = get_negator(revgtop);
2089         if (!OidIsValid(revleop))
2090                 goto fail;                              /* insufficient info in catalogs */
2091
2092         /*
2093          * Now, the fraction of the left variable that will be scanned is the
2094          * fraction that's <= the right-side maximum value.  But only believe
2095          * non-default estimates, else stick with our 1.0.
2096          */
2097         selec = scalarineqsel(root, leop, false, &leftvar,
2098                                                   rightmax, righttype);
2099         if (selec != DEFAULT_INEQ_SEL)
2100                 *leftscan = selec;
2101
2102         /* And similarly for the right variable. */
2103         selec = scalarineqsel(root, revleop, false, &rightvar,
2104                                                   leftmax, lefttype);
2105         if (selec != DEFAULT_INEQ_SEL)
2106                 *rightscan = selec;
2107
2108         /*
2109          * Only one of the two fractions can really be less than 1.0; believe the
2110          * smaller estimate and reset the other one to exactly 1.0.  If we get
2111          * exactly equal estimates (as can easily happen with self-joins), believe
2112          * neither.
2113          */
2114         if (*leftscan > *rightscan)
2115                 *leftscan = 1.0;
2116         else if (*leftscan < *rightscan)
2117                 *rightscan = 1.0;
2118         else
2119                 *leftscan = *rightscan = 1.0;
2120
2121 fail:
2122         ReleaseVariableStats(leftvar);
2123         ReleaseVariableStats(rightvar);
2124 }
2125
2126
2127 /*
2128  * Helper routine for estimate_num_groups: add an item to a list of
2129  * GroupVarInfos, but only if it's not known equal to any of the existing
2130  * entries.
2131  */
2132 typedef struct
2133 {
2134         Node       *var;                        /* might be an expression, not just a Var */
2135         RelOptInfo *rel;                        /* relation it belongs to */
2136         double          ndistinct;              /* # distinct values */
2137 } GroupVarInfo;
2138
2139 static List *
2140 add_unique_group_var(PlannerInfo *root, List *varinfos,
2141                                          Node *var, VariableStatData *vardata)
2142 {
2143         GroupVarInfo *varinfo;
2144         double          ndistinct;
2145         ListCell   *lc;
2146
2147         ndistinct = get_variable_numdistinct(vardata);
2148
2149         /* cannot use foreach here because of possible list_delete */
2150         lc = list_head(varinfos);
2151         while (lc)
2152         {
2153                 varinfo = (GroupVarInfo *) lfirst(lc);
2154
2155                 /* must advance lc before list_delete possibly pfree's it */
2156                 lc = lnext(lc);
2157
2158                 /* Drop exact duplicates */
2159                 if (equal(var, varinfo->var))
2160                         return varinfos;
2161
2162                 /*
2163                  * Drop known-equal vars, but only if they belong to different
2164                  * relations (see comments for estimate_num_groups)
2165                  */
2166                 if (vardata->rel != varinfo->rel &&
2167                         exprs_known_equal(root, var, varinfo->var))
2168                 {
2169                         if (varinfo->ndistinct <= ndistinct)
2170                         {
2171                                 /* Keep older item, forget new one */
2172                                 return varinfos;
2173                         }
2174                         else
2175                         {
2176                                 /* Delete the older item */
2177                                 varinfos = list_delete_ptr(varinfos, varinfo);
2178                         }
2179                 }
2180         }
2181
2182         varinfo = (GroupVarInfo *) palloc(sizeof(GroupVarInfo));
2183
2184         varinfo->var = var;
2185         varinfo->rel = vardata->rel;
2186         varinfo->ndistinct = ndistinct;
2187         varinfos = lappend(varinfos, varinfo);
2188         return varinfos;
2189 }
2190
2191 /*
2192  * estimate_num_groups          - Estimate number of groups in a grouped query
2193  *
2194  * Given a query having a GROUP BY clause, estimate how many groups there
2195  * will be --- ie, the number of distinct combinations of the GROUP BY
2196  * expressions.
2197  *
2198  * This routine is also used to estimate the number of rows emitted by
2199  * a DISTINCT filtering step; that is an isomorphic problem.  (Note:
2200  * actually, we only use it for DISTINCT when there's no grouping or
2201  * aggregation ahead of the DISTINCT.)
2202  *
2203  * Inputs:
2204  *      root - the query
2205  *      groupExprs - list of expressions being grouped by
2206  *      input_rows - number of rows estimated to arrive at the group/unique
2207  *              filter step
2208  *
2209  * Given the lack of any cross-correlation statistics in the system, it's
2210  * impossible to do anything really trustworthy with GROUP BY conditions
2211  * involving multiple Vars.  We should however avoid assuming the worst
2212  * case (all possible cross-product terms actually appear as groups) since
2213  * very often the grouped-by Vars are highly correlated.  Our current approach
2214  * is as follows:
2215  *      1.      Reduce the given expressions to a list of unique Vars used.  For
2216  *              example, GROUP BY a, a + b is treated the same as GROUP BY a, b.
2217  *              It is clearly correct not to count the same Var more than once.
2218  *              It is also reasonable to treat f(x) the same as x: f() cannot
2219  *              increase the number of distinct values (unless it is volatile,
2220  *              which we consider unlikely for grouping), but it probably won't
2221  *              reduce the number of distinct values much either.
2222  *              As a special case, if a GROUP BY expression can be matched to an
2223  *              expressional index for which we have statistics, then we treat the
2224  *              whole expression as though it were just a Var.
2225  *      2.      If the list contains Vars of different relations that are known equal
2226  *              due to equijoin clauses, then drop all but one of the Vars from each
2227  *              known-equal set, keeping the one with smallest estimated # of values
2228  *              (since the extra values of the others can't appear in joined rows).
2229  *              Note the reason we only consider Vars of different relations is that
2230  *              if we considered ones of the same rel, we'd be double-counting the
2231  *              restriction selectivity of the equality in the next step.
2232  *      3.      For Vars within a single source rel, we multiply together the numbers
2233  *              of values, clamp to the number of rows in the rel (divided by 10 if
2234  *              more than one Var), and then multiply by the selectivity of the
2235  *              restriction clauses for that rel.  When there's more than one Var,
2236  *              the initial product is probably too high (it's the worst case) but
2237  *              clamping to a fraction of the rel's rows seems to be a helpful
2238  *              heuristic for not letting the estimate get out of hand.  (The factor
2239  *              of 10 is derived from pre-Postgres-7.4 practice.)  Multiplying
2240  *              by the restriction selectivity is effectively assuming that the
2241  *              restriction clauses are independent of the grouping, which is a crummy
2242  *              assumption, but it's hard to do better.
2243  *      4.      If there are Vars from multiple rels, we repeat step 3 for each such
2244  *              rel, and multiply the results together.
2245  * Note that rels not containing grouped Vars are ignored completely, as are
2246  * join clauses other than the equijoin clauses used in step 2.  Such rels
2247  * cannot increase the number of groups, and we assume such clauses do not
2248  * reduce the number either (somewhat bogus, but we don't have the info to
2249  * do better).
2250  */
2251 double
2252 estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows)
2253 {
2254         List       *varinfos = NIL;
2255         double          numdistinct;
2256         ListCell   *l;
2257
2258         /* We should not be called unless query has GROUP BY (or DISTINCT) */
2259         Assert(groupExprs != NIL);
2260
2261         /*
2262          * Steps 1/2: find the unique Vars used, treating an expression as a Var
2263          * if we can find stats for it.  For each one, record the statistical
2264          * estimate of number of distinct values (total in its table, without
2265          * regard for filtering).
2266          */
2267         foreach(l, groupExprs)
2268         {
2269                 Node       *groupexpr = (Node *) lfirst(l);
2270                 VariableStatData vardata;
2271                 List       *varshere;
2272                 ListCell   *l2;
2273
2274                 /*
2275                  * If examine_variable is able to deduce anything about the GROUP BY
2276                  * expression, treat it as a single variable even if it's really more
2277                  * complicated.
2278                  */
2279                 examine_variable(root, groupexpr, 0, &vardata);
2280                 if (vardata.statsTuple != NULL || vardata.isunique)
2281                 {
2282                         varinfos = add_unique_group_var(root, varinfos,
2283                                                                                         groupexpr, &vardata);
2284                         ReleaseVariableStats(vardata);
2285                         continue;
2286                 }
2287                 ReleaseVariableStats(vardata);
2288
2289                 /*
2290                  * Else pull out the component Vars
2291                  */
2292                 varshere = pull_var_clause(groupexpr, false);
2293
2294                 /*
2295                  * If we find any variable-free GROUP BY item, then either it is a
2296                  * constant (and we can ignore it) or it contains a volatile function;
2297                  * in the latter case we punt and assume that each input row will
2298                  * yield a distinct group.
2299                  */
2300                 if (varshere == NIL)
2301                 {
2302                         if (contain_volatile_functions(groupexpr))
2303                                 return input_rows;
2304                         continue;
2305                 }
2306
2307                 /*
2308                  * Else add variables to varinfos list
2309                  */
2310                 foreach(l2, varshere)
2311                 {
2312                         Node       *var = (Node *) lfirst(l2);
2313
2314                         examine_variable(root, var, 0, &vardata);
2315                         varinfos = add_unique_group_var(root, varinfos, var, &vardata);
2316                         ReleaseVariableStats(vardata);
2317                 }
2318         }
2319
2320         /* If now no Vars, we must have an all-constant GROUP BY list. */
2321         if (varinfos == NIL)
2322                 return 1.0;
2323
2324         /*
2325          * Steps 3/4: group Vars by relation and estimate total numdistinct.
2326          *
2327          * For each iteration of the outer loop, we process the frontmost Var in
2328          * varinfos, plus all other Vars in the same relation.  We remove these
2329          * Vars from the newvarinfos list for the next iteration. This is the
2330          * easiest way to group Vars of same rel together.
2331          */
2332         numdistinct = 1.0;
2333
2334         do
2335         {
2336                 GroupVarInfo *varinfo1 = (GroupVarInfo *) linitial(varinfos);
2337                 RelOptInfo *rel = varinfo1->rel;
2338                 double          reldistinct = varinfo1->ndistinct;
2339                 double          relmaxndistinct = reldistinct;
2340                 int                     relvarcount = 1;
2341                 List       *newvarinfos = NIL;
2342
2343                 /*
2344                  * Get the product of numdistinct estimates of the Vars for this rel.
2345                  * Also, construct new varinfos list of remaining Vars.
2346                  */
2347                 for_each_cell(l, lnext(list_head(varinfos)))
2348                 {
2349                         GroupVarInfo *varinfo2 = (GroupVarInfo *) lfirst(l);
2350
2351                         if (varinfo2->rel == varinfo1->rel)
2352                         {
2353                                 reldistinct *= varinfo2->ndistinct;
2354                                 if (relmaxndistinct < varinfo2->ndistinct)
2355                                         relmaxndistinct = varinfo2->ndistinct;
2356                                 relvarcount++;
2357                         }
2358                         else
2359                         {
2360                                 /* not time to process varinfo2 yet */
2361                                 newvarinfos = lcons(varinfo2, newvarinfos);
2362                         }
2363                 }
2364
2365                 /*
2366                  * Sanity check --- don't divide by zero if empty relation.
2367                  */
2368                 Assert(rel->reloptkind == RELOPT_BASEREL);
2369                 if (rel->tuples > 0)
2370                 {
2371                         /*
2372                          * Clamp to size of rel, or size of rel / 10 if multiple Vars. The
2373                          * fudge factor is because the Vars are probably correlated but we
2374                          * don't know by how much.  We should never clamp to less than the
2375                          * largest ndistinct value for any of the Vars, though, since
2376                          * there will surely be at least that many groups.
2377                          */
2378                         double          clamp = rel->tuples;
2379
2380                         if (relvarcount > 1)
2381                         {
2382                                 clamp *= 0.1;
2383                                 if (clamp < relmaxndistinct)
2384                                 {
2385                                         clamp = relmaxndistinct;
2386                                         /* for sanity in case some ndistinct is too large: */
2387                                         if (clamp > rel->tuples)
2388                                                 clamp = rel->tuples;
2389                                 }
2390                         }
2391                         if (reldistinct > clamp)
2392                                 reldistinct = clamp;
2393
2394                         /*
2395                          * Multiply by restriction selectivity.
2396                          */
2397                         reldistinct *= rel->rows / rel->tuples;
2398
2399                         /*
2400                          * Update estimate of total distinct groups.
2401                          */
2402                         numdistinct *= reldistinct;
2403                 }
2404
2405                 varinfos = newvarinfos;
2406         } while (varinfos != NIL);
2407
2408         numdistinct = ceil(numdistinct);
2409
2410         /* Guard against out-of-range answers */
2411         if (numdistinct > input_rows)
2412                 numdistinct = input_rows;
2413         if (numdistinct < 1.0)
2414                 numdistinct = 1.0;
2415
2416         return numdistinct;
2417 }
2418
2419 /*
2420  * Estimate hash bucketsize fraction (ie, number of entries in a bucket
2421  * divided by total tuples in relation) if the specified expression is used
2422  * as a hash key.
2423  *
2424  * XXX This is really pretty bogus since we're effectively assuming that the
2425  * distribution of hash keys will be the same after applying restriction
2426  * clauses as it was in the underlying relation.  However, we are not nearly
2427  * smart enough to figure out how the restrict clauses might change the
2428  * distribution, so this will have to do for now.
2429  *
2430  * We are passed the number of buckets the executor will use for the given
2431  * input relation.      If the data were perfectly distributed, with the same
2432  * number of tuples going into each available bucket, then the bucketsize
2433  * fraction would be 1/nbuckets.  But this happy state of affairs will occur
2434  * only if (a) there are at least nbuckets distinct data values, and (b)
2435  * we have a not-too-skewed data distribution.  Otherwise the buckets will
2436  * be nonuniformly occupied.  If the other relation in the join has a key
2437  * distribution similar to this one's, then the most-loaded buckets are
2438  * exactly those that will be probed most often.  Therefore, the "average"
2439  * bucket size for costing purposes should really be taken as something close
2440  * to the "worst case" bucket size.  We try to estimate this by adjusting the
2441  * fraction if there are too few distinct data values, and then scaling up
2442  * by the ratio of the most common value's frequency to the average frequency.
2443  *
2444  * If no statistics are available, use a default estimate of 0.1.  This will
2445  * discourage use of a hash rather strongly if the inner relation is large,
2446  * which is what we want.  We do not want to hash unless we know that the
2447  * inner rel is well-dispersed (or the alternatives seem much worse).
2448  */
2449 Selectivity
2450 estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey, double nbuckets)
2451 {
2452         VariableStatData vardata;
2453         double          estfract,
2454                                 ndistinct,
2455                                 stanullfrac,
2456                                 mcvfreq,
2457                                 avgfreq;
2458         float4     *numbers;
2459         int                     nnumbers;
2460
2461         examine_variable(root, hashkey, 0, &vardata);
2462
2463         /* Get number of distinct values and fraction that are null */
2464         ndistinct = get_variable_numdistinct(&vardata);
2465
2466         if (HeapTupleIsValid(vardata.statsTuple))
2467         {
2468                 Form_pg_statistic stats;
2469
2470                 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
2471                 stanullfrac = stats->stanullfrac;
2472         }
2473         else
2474         {
2475                 /*
2476                  * Believe a default ndistinct only if it came from stats. Otherwise
2477                  * punt and return 0.1, per comments above.
2478                  */
2479                 if (ndistinct == DEFAULT_NUM_DISTINCT)
2480                 {
2481                         ReleaseVariableStats(vardata);
2482                         return (Selectivity) 0.1;
2483                 }
2484
2485                 stanullfrac = 0.0;
2486         }
2487
2488         /* Compute avg freq of all distinct data values in raw relation */
2489         avgfreq = (1.0 - stanullfrac) / ndistinct;
2490
2491         /*
2492          * Adjust ndistinct to account for restriction clauses.  Observe we are
2493          * assuming that the data distribution is affected uniformly by the
2494          * restriction clauses!
2495          *
2496          * XXX Possibly better way, but much more expensive: multiply by
2497          * selectivity of rel's restriction clauses that mention the target Var.
2498          */
2499         if (vardata.rel)
2500                 ndistinct *= vardata.rel->rows / vardata.rel->tuples;
2501
2502         /*
2503          * Initial estimate of bucketsize fraction is 1/nbuckets as long as the
2504          * number of buckets is less than the expected number of distinct values;
2505          * otherwise it is 1/ndistinct.
2506          */
2507         if (ndistinct > nbuckets)
2508                 estfract = 1.0 / nbuckets;
2509         else
2510                 estfract = 1.0 / ndistinct;
2511
2512         /*
2513          * Look up the frequency of the most common value, if available.
2514          */
2515         mcvfreq = 0.0;
2516
2517         if (HeapTupleIsValid(vardata.statsTuple))
2518         {
2519                 if (get_attstatsslot(vardata.statsTuple,
2520                                                          vardata.atttype, vardata.atttypmod,
2521                                                          STATISTIC_KIND_MCV, InvalidOid,
2522                                                          NULL, NULL, &numbers, &nnumbers))
2523                 {
2524                         /*
2525                          * The first MCV stat is for the most common value.
2526                          */
2527                         if (nnumbers > 0)
2528                                 mcvfreq = numbers[0];
2529                         free_attstatsslot(vardata.atttype, NULL, 0,
2530                                                           numbers, nnumbers);
2531                 }
2532         }
2533
2534         /*
2535          * Adjust estimated bucketsize upward to account for skewed distribution.
2536          */
2537         if (avgfreq > 0.0 && mcvfreq > avgfreq)
2538                 estfract *= mcvfreq / avgfreq;
2539
2540         /*
2541          * Clamp bucketsize to sane range (the above adjustment could easily
2542          * produce an out-of-range result).  We set the lower bound a little above
2543          * zero, since zero isn't a very sane result.
2544          */
2545         if (estfract < 1.0e-6)
2546                 estfract = 1.0e-6;
2547         else if (estfract > 1.0)
2548                 estfract = 1.0;
2549
2550         ReleaseVariableStats(vardata);
2551
2552         return (Selectivity) estfract;
2553 }
2554
2555
2556 /*-------------------------------------------------------------------------
2557  *
2558  * Support routines
2559  *
2560  *-------------------------------------------------------------------------
2561  */
2562
2563 /*
2564  * convert_to_scalar
2565  *        Convert non-NULL values of the indicated types to the comparison
2566  *        scale needed by scalarineqsel().
2567  *        Returns "true" if successful.
2568  *
2569  * XXX this routine is a hack: ideally we should look up the conversion
2570  * subroutines in pg_type.
2571  *
2572  * All numeric datatypes are simply converted to their equivalent
2573  * "double" values.  (NUMERIC values that are outside the range of "double"
2574  * are clamped to +/- HUGE_VAL.)
2575  *
2576  * String datatypes are converted by convert_string_to_scalar(),
2577  * which is explained below.  The reason why this routine deals with
2578  * three values at a time, not just one, is that we need it for strings.
2579  *
2580  * The bytea datatype is just enough different from strings that it has
2581  * to be treated separately.
2582  *
2583  * The several datatypes representing absolute times are all converted
2584  * to Timestamp, which is actually a double, and then we just use that
2585  * double value.  Note this will give correct results even for the "special"
2586  * values of Timestamp, since those are chosen to compare correctly;
2587  * see timestamp_cmp.
2588  *
2589  * The several datatypes representing relative times (intervals) are all
2590  * converted to measurements expressed in seconds.
2591  */
2592 static bool
2593 convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
2594                                   Datum lobound, Datum hibound, Oid boundstypid,
2595                                   double *scaledlobound, double *scaledhibound)
2596 {
2597         /*
2598          * Both the valuetypid and the boundstypid should exactly match the
2599          * declared input type(s) of the operator we are invoked for, so we just
2600          * error out if either is not recognized.
2601          *
2602          * XXX The histogram we are interpolating between points of could belong
2603          * to a column that's only binary-compatible with the declared type. In
2604          * essence we are assuming that the semantics of binary-compatible types
2605          * are enough alike that we can use a histogram generated with one type's
2606          * operators to estimate selectivity for the other's.  This is outright
2607          * wrong in some cases --- in particular signed versus unsigned
2608          * interpretation could trip us up.  But it's useful enough in the
2609          * majority of cases that we do it anyway.      Should think about more
2610          * rigorous ways to do it.
2611          */
2612         switch (valuetypid)
2613         {
2614                         /*
2615                          * Built-in numeric types
2616                          */
2617                 case BOOLOID:
2618                 case INT2OID:
2619                 case INT4OID:
2620                 case INT8OID:
2621                 case FLOAT4OID:
2622                 case FLOAT8OID:
2623                 case NUMERICOID:
2624                 case OIDOID:
2625                 case REGPROCOID:
2626                 case REGPROCEDUREOID:
2627                 case REGOPEROID:
2628                 case REGOPERATOROID:
2629                 case REGCLASSOID:
2630                 case REGTYPEOID:
2631                         *scaledvalue = convert_numeric_to_scalar(value, valuetypid);
2632                         *scaledlobound = convert_numeric_to_scalar(lobound, boundstypid);
2633                         *scaledhibound = convert_numeric_to_scalar(hibound, boundstypid);
2634                         return true;
2635
2636                         /*
2637                          * Built-in string types
2638                          */
2639                 case CHAROID:
2640                 case BPCHAROID:
2641                 case VARCHAROID:
2642                 case TEXTOID:
2643                 case NAMEOID:
2644                         {
2645                                 char       *valstr = convert_string_datum(value, valuetypid);
2646                                 char       *lostr = convert_string_datum(lobound, boundstypid);
2647                                 char       *histr = convert_string_datum(hibound, boundstypid);
2648
2649                                 convert_string_to_scalar(valstr, scaledvalue,
2650                                                                                  lostr, scaledlobound,
2651                                                                                  histr, scaledhibound);
2652                                 pfree(valstr);
2653                                 pfree(lostr);
2654                                 pfree(histr);
2655                                 return true;
2656                         }
2657
2658                         /*
2659                          * Built-in bytea type
2660                          */
2661                 case BYTEAOID:
2662                         {
2663                                 convert_bytea_to_scalar(value, scaledvalue,
2664                                                                                 lobound, scaledlobound,
2665                                                                                 hibound, scaledhibound);
2666                                 return true;
2667                         }
2668
2669                         /*
2670                          * Built-in time types
2671                          */
2672                 case TIMESTAMPOID:
2673                 case TIMESTAMPTZOID:
2674                 case ABSTIMEOID:
2675                 case DATEOID:
2676                 case INTERVALOID:
2677                 case RELTIMEOID:
2678                 case TINTERVALOID:
2679                 case TIMEOID:
2680                 case TIMETZOID:
2681                         *scaledvalue = convert_timevalue_to_scalar(value, valuetypid);
2682                         *scaledlobound = convert_timevalue_to_scalar(lobound, boundstypid);
2683                         *scaledhibound = convert_timevalue_to_scalar(hibound, boundstypid);
2684                         return true;
2685
2686                         /*
2687                          * Built-in network types
2688                          */
2689                 case INETOID:
2690                 case CIDROID:
2691                 case MACADDROID:
2692                         *scaledvalue = convert_network_to_scalar(value, valuetypid);
2693                         *scaledlobound = convert_network_to_scalar(lobound, boundstypid);
2694                         *scaledhibound = convert_network_to_scalar(hibound, boundstypid);
2695                         return true;
2696         }
2697         /* Don't know how to convert */
2698         *scaledvalue = *scaledlobound = *scaledhibound = 0;
2699         return false;
2700 }
2701
2702 /*
2703  * Do convert_to_scalar()'s work for any numeric data type.
2704  */
2705 static double
2706 convert_numeric_to_scalar(Datum value, Oid typid)
2707 {
2708         switch (typid)
2709         {
2710                 case BOOLOID:
2711                         return (double) DatumGetBool(value);
2712                 case INT2OID:
2713                         return (double) DatumGetInt16(value);
2714                 case INT4OID:
2715                         return (double) DatumGetInt32(value);
2716                 case INT8OID:
2717                         return (double) DatumGetInt64(value);
2718                 case FLOAT4OID:
2719                         return (double) DatumGetFloat4(value);
2720                 case FLOAT8OID:
2721                         return (double) DatumGetFloat8(value);
2722                 case NUMERICOID:
2723                         /* Note: out-of-range values will be clamped to +-HUGE_VAL */
2724                         return (double)
2725                                 DatumGetFloat8(DirectFunctionCall1(numeric_float8_no_overflow,
2726                                                                                                    value));
2727                 case OIDOID:
2728                 case REGPROCOID:
2729                 case REGPROCEDUREOID:
2730                 case REGOPEROID:
2731                 case REGOPERATOROID:
2732                 case REGCLASSOID:
2733                 case REGTYPEOID:
2734                         /* we can treat OIDs as integers... */
2735                         return (double) DatumGetObjectId(value);
2736         }
2737
2738         /*
2739          * Can't get here unless someone tries to use scalarltsel/scalargtsel on
2740          * an operator with one numeric and one non-numeric operand.
2741          */
2742         elog(ERROR, "unsupported type: %u", typid);
2743         return 0;
2744 }
2745
2746 /*
2747  * Do convert_to_scalar()'s work for any character-string data type.
2748  *
2749  * String datatypes are converted to a scale that ranges from 0 to 1,
2750  * where we visualize the bytes of the string as fractional digits.
2751  *
2752  * We do not want the base to be 256, however, since that tends to
2753  * generate inflated selectivity estimates; few databases will have
2754  * occurrences of all 256 possible byte values at each position.
2755  * Instead, use the smallest and largest byte values seen in the bounds
2756  * as the estimated range for each byte, after some fudging to deal with
2757  * the fact that we probably aren't going to see the full range that way.
2758  *
2759  * An additional refinement is that we discard any common prefix of the
2760  * three strings before computing the scaled values.  This allows us to
2761  * "zoom in" when we encounter a narrow data range.  An example is a phone
2762  * number database where all the values begin with the same area code.
2763  * (Actually, the bounds will be adjacent histogram-bin-boundary values,
2764  * so this is more likely to happen than you might think.)
2765  */
2766 static void
2767 convert_string_to_scalar(char *value,
2768                                                  double *scaledvalue,
2769                                                  char *lobound,
2770                                                  double *scaledlobound,
2771                                                  char *hibound,
2772                                                  double *scaledhibound)
2773 {
2774         int                     rangelo,
2775                                 rangehi;
2776         char       *sptr;
2777
2778         rangelo = rangehi = (unsigned char) hibound[0];
2779         for (sptr = lobound; *sptr; sptr++)
2780         {
2781                 if (rangelo > (unsigned char) *sptr)
2782                         rangelo = (unsigned char) *sptr;
2783                 if (rangehi < (unsigned char) *sptr)
2784                         rangehi = (unsigned char) *sptr;
2785         }
2786         for (sptr = hibound; *sptr; sptr++)
2787         {
2788                 if (rangelo > (unsigned char) *sptr)
2789                         rangelo = (unsigned char) *sptr;
2790                 if (rangehi < (unsigned char) *sptr)
2791                         rangehi = (unsigned char) *sptr;
2792         }
2793         /* If range includes any upper-case ASCII chars, make it include all */
2794         if (rangelo <= 'Z' && rangehi >= 'A')
2795         {
2796                 if (rangelo > 'A')
2797                         rangelo = 'A';
2798                 if (rangehi < 'Z')
2799                         rangehi = 'Z';
2800         }
2801         /* Ditto lower-case */
2802         if (rangelo <= 'z' && rangehi >= 'a')
2803         {
2804                 if (rangelo > 'a')
2805                         rangelo = 'a';
2806                 if (rangehi < 'z')
2807                         rangehi = 'z';
2808         }
2809         /* Ditto digits */
2810         if (rangelo <= '9' && rangehi >= '0')
2811         {
2812                 if (rangelo > '0')
2813                         rangelo = '0';
2814                 if (rangehi < '9')
2815                         rangehi = '9';
2816         }
2817
2818         /*
2819          * If range includes less than 10 chars, assume we have not got enough
2820          * data, and make it include regular ASCII set.
2821          */
2822         if (rangehi - rangelo < 9)
2823         {
2824                 rangelo = ' ';
2825                 rangehi = 127;
2826         }
2827
2828         /*
2829          * Now strip any common prefix of the three strings.
2830          */
2831         while (*lobound)
2832         {
2833                 if (*lobound != *hibound || *lobound != *value)
2834                         break;
2835                 lobound++, hibound++, value++;
2836         }
2837
2838         /*
2839          * Now we can do the conversions.
2840          */
2841         *scaledvalue = convert_one_string_to_scalar(value, rangelo, rangehi);
2842         *scaledlobound = convert_one_string_to_scalar(lobound, rangelo, rangehi);
2843         *scaledhibound = convert_one_string_to_scalar(hibound, rangelo, rangehi);
2844 }
2845
2846 static double
2847 convert_one_string_to_scalar(char *value, int rangelo, int rangehi)
2848 {
2849         int                     slen = strlen(value);
2850         double          num,
2851                                 denom,
2852                                 base;
2853
2854         if (slen <= 0)
2855                 return 0.0;                             /* empty string has scalar value 0 */
2856
2857         /*
2858          * Since base is at least 10, need not consider more than about 20 chars
2859          */
2860         if (slen > 20)
2861                 slen = 20;
2862
2863         /* Convert initial characters to fraction */
2864         base = rangehi - rangelo + 1;
2865         num = 0.0;
2866         denom = base;
2867         while (slen-- > 0)
2868         {
2869                 int                     ch = (unsigned char) *value++;
2870
2871                 if (ch < rangelo)
2872                         ch = rangelo - 1;
2873                 else if (ch > rangehi)
2874                         ch = rangehi + 1;
2875                 num += ((double) (ch - rangelo)) / denom;
2876                 denom *= base;
2877         }
2878
2879         return num;
2880 }
2881
2882 /*
2883  * Convert a string-type Datum into a palloc'd, null-terminated string.
2884  *
2885  * When using a non-C locale, we must pass the string through strxfrm()
2886  * before continuing, so as to generate correct locale-specific results.
2887  */
2888 static char *
2889 convert_string_datum(Datum value, Oid typid)
2890 {
2891         char       *val;
2892
2893         switch (typid)
2894         {
2895                 case CHAROID:
2896                         val = (char *) palloc(2);
2897                         val[0] = DatumGetChar(value);
2898                         val[1] = '\0';
2899                         break;
2900                 case BPCHAROID:
2901                 case VARCHAROID:
2902                 case TEXTOID:
2903                         {
2904                                 char       *str = (char *) VARDATA(DatumGetPointer(value));
2905                                 int                     strlength = VARSIZE(DatumGetPointer(value)) - VARHDRSZ;
2906
2907                                 val = (char *) palloc(strlength + 1);
2908                                 memcpy(val, str, strlength);
2909                                 val[strlength] = '\0';
2910                                 break;
2911                         }
2912                 case NAMEOID:
2913                         {
2914                                 NameData   *nm = (NameData *) DatumGetPointer(value);
2915
2916                                 val = pstrdup(NameStr(*nm));
2917                                 break;
2918                         }
2919                 default:
2920
2921                         /*
2922                          * Can't get here unless someone tries to use scalarltsel on an
2923                          * operator with one string and one non-string operand.
2924                          */
2925                         elog(ERROR, "unsupported type: %u", typid);
2926                         return NULL;
2927         }
2928
2929         if (!lc_collate_is_c())
2930         {
2931                 char       *xfrmstr;
2932                 size_t          xfrmlen;
2933                 size_t          xfrmlen2;
2934
2935                 /*
2936                  * Note: originally we guessed at a suitable output buffer size, and
2937                  * only needed to call strxfrm twice if our guess was too small.
2938                  * However, it seems that some versions of Solaris have buggy strxfrm
2939                  * that can write past the specified buffer length in that scenario.
2940                  * So, do it the dumb way for portability.
2941                  *
2942                  * Yet other systems (e.g., glibc) sometimes return a smaller value
2943                  * from the second call than the first; thus the Assert must be <= not
2944                  * == as you'd expect.  Can't any of these people program their way
2945                  * out of a paper bag?
2946                  */
2947                 xfrmlen = strxfrm(NULL, val, 0);
2948                 xfrmstr = (char *) palloc(xfrmlen + 1);
2949                 xfrmlen2 = strxfrm(xfrmstr, val, xfrmlen + 1);
2950                 Assert(xfrmlen2 <= xfrmlen);
2951                 pfree(val);
2952                 val = xfrmstr;
2953         }
2954
2955         return val;
2956 }
2957
2958 /*
2959  * Do convert_to_scalar()'s work for any bytea data type.
2960  *
2961  * Very similar to convert_string_to_scalar except we can't assume
2962  * null-termination and therefore pass explicit lengths around.
2963  *
2964  * Also, assumptions about likely "normal" ranges of characters have been
2965  * removed - a data range of 0..255 is always used, for now.  (Perhaps
2966  * someday we will add information about actual byte data range to
2967  * pg_statistic.)
2968  */
2969 static void
2970 convert_bytea_to_scalar(Datum value,
2971                                                 double *scaledvalue,
2972                                                 Datum lobound,
2973                                                 double *scaledlobound,
2974                                                 Datum hibound,
2975                                                 double *scaledhibound)
2976 {
2977         int                     rangelo,
2978                                 rangehi,
2979                                 valuelen = VARSIZE(DatumGetPointer(value)) - VARHDRSZ,
2980                                 loboundlen = VARSIZE(DatumGetPointer(lobound)) - VARHDRSZ,
2981                                 hiboundlen = VARSIZE(DatumGetPointer(hibound)) - VARHDRSZ,
2982                                 i,
2983                                 minlen;
2984         unsigned char *valstr = (unsigned char *) VARDATA(DatumGetPointer(value)),
2985                            *lostr = (unsigned char *) VARDATA(DatumGetPointer(lobound)),
2986                            *histr = (unsigned char *) VARDATA(DatumGetPointer(hibound));
2987
2988         /*
2989          * Assume bytea data is uniformly distributed across all byte values.
2990          */
2991         rangelo = 0;
2992         rangehi = 255;
2993
2994         /*
2995          * Now strip any common prefix of the three strings.
2996          */
2997         minlen = Min(Min(valuelen, loboundlen), hiboundlen);
2998         for (i = 0; i < minlen; i++)
2999         {
3000                 if (*lostr != *histr || *lostr != *valstr)
3001                         break;
3002                 lostr++, histr++, valstr++;
3003                 loboundlen--, hiboundlen--, valuelen--;
3004         }
3005
3006         /*
3007          * Now we can do the conversions.
3008          */
3009         *scaledvalue = convert_one_bytea_to_scalar(valstr, valuelen, rangelo, rangehi);
3010         *scaledlobound = convert_one_bytea_to_scalar(lostr, loboundlen, rangelo, rangehi);
3011         *scaledhibound = convert_one_bytea_to_scalar(histr, hiboundlen, rangelo, rangehi);
3012 }
3013
3014 static double
3015 convert_one_bytea_to_scalar(unsigned char *value, int valuelen,
3016                                                         int rangelo, int rangehi)
3017 {
3018         double          num,
3019                                 denom,
3020                                 base;
3021
3022         if (valuelen <= 0)
3023                 return 0.0;                             /* empty string has scalar value 0 */
3024
3025         /*
3026          * Since base is 256, need not consider more than about 10 chars (even
3027          * this many seems like overkill)
3028          */
3029         if (valuelen > 10)
3030                 valuelen = 10;
3031
3032         /* Convert initial characters to fraction */
3033         base = rangehi - rangelo + 1;
3034         num = 0.0;
3035         denom = base;
3036         while (valuelen-- > 0)
3037         {
3038                 int                     ch = *value++;
3039
3040                 if (ch < rangelo)
3041                         ch = rangelo - 1;
3042                 else if (ch > rangehi)
3043                         ch = rangehi + 1;
3044                 num += ((double) (ch - rangelo)) / denom;
3045                 denom *= base;
3046         }
3047
3048         return num;
3049 }
3050
3051 /*
3052  * Do convert_to_scalar()'s work for any timevalue data type.
3053  */
3054 static double
3055 convert_timevalue_to_scalar(Datum value, Oid typid)
3056 {
3057         switch (typid)
3058         {
3059                 case TIMESTAMPOID:
3060                         return DatumGetTimestamp(value);
3061                 case TIMESTAMPTZOID:
3062                         return DatumGetTimestampTz(value);
3063                 case ABSTIMEOID:
3064                         return DatumGetTimestamp(DirectFunctionCall1(abstime_timestamp,
3065                                                                                                                  value));
3066                 case DATEOID:
3067                         return DatumGetTimestamp(DirectFunctionCall1(date_timestamp,
3068                                                                                                                  value));
3069                 case INTERVALOID:
3070                         {
3071                                 Interval   *interval = DatumGetIntervalP(value);
3072
3073                                 /*
3074                                  * Convert the month part of Interval to days using assumed
3075                                  * average month length of 365.25/12.0 days.  Not too
3076                                  * accurate, but plenty good enough for our purposes.
3077                                  */
3078 #ifdef HAVE_INT64_TIMESTAMP
3079                                 return interval->time + interval->day * (double) USECS_PER_DAY +
3080                                         interval->month * ((DAYS_PER_YEAR / (double) MONTHS_PER_YEAR) * USECS_PER_DAY);
3081 #else
3082                                 return interval->time + interval->day * SECS_PER_DAY +
3083                                         interval->month * ((DAYS_PER_YEAR / (double) MONTHS_PER_YEAR) * (double) SECS_PER_DAY);
3084 #endif
3085                         }
3086                 case RELTIMEOID:
3087 #ifdef HAVE_INT64_TIMESTAMP
3088                         return (DatumGetRelativeTime(value) * 1000000.0);
3089 #else
3090                         return DatumGetRelativeTime(value);
3091 #endif
3092                 case TINTERVALOID:
3093                         {
3094                                 TimeInterval tinterval = DatumGetTimeInterval(value);
3095
3096 #ifdef HAVE_INT64_TIMESTAMP
3097                                 if (tinterval->status != 0)
3098                                         return ((tinterval->data[1] - tinterval->data[0]) * 1000000.0);
3099 #else
3100                                 if (tinterval->status != 0)
3101                                         return tinterval->data[1] - tinterval->data[0];
3102 #endif
3103                                 return 0;               /* for lack of a better idea */
3104                         }
3105                 case TIMEOID:
3106                         return DatumGetTimeADT(value);
3107                 case TIMETZOID:
3108                         {
3109                                 TimeTzADT  *timetz = DatumGetTimeTzADTP(value);
3110
3111                                 /* use GMT-equivalent time */
3112 #ifdef HAVE_INT64_TIMESTAMP
3113                                 return (double) (timetz->time + (timetz->zone * 1000000.0));
3114 #else
3115                                 return (double) (timetz->time + timetz->zone);
3116 #endif
3117                         }
3118         }
3119
3120         /*
3121          * Can't get here unless someone tries to use scalarltsel/scalargtsel on
3122          * an operator with one timevalue and one non-timevalue operand.
3123          */
3124         elog(ERROR, "unsupported type: %u", typid);
3125         return 0;
3126 }
3127
3128
3129 /*
3130  * get_restriction_variable
3131  *              Examine the args of a restriction clause to see if it's of the
3132  *              form (variable op pseudoconstant) or (pseudoconstant op variable),
3133  *              where "variable" could be either a Var or an expression in vars of a
3134  *              single relation.  If so, extract information about the variable,
3135  *              and also indicate which side it was on and the other argument.
3136  *
3137  * Inputs:
3138  *      root: the planner info
3139  *      args: clause argument list
3140  *      varRelid: see specs for restriction selectivity functions
3141  *
3142  * Outputs: (these are valid only if TRUE is returned)
3143  *      *vardata: gets information about variable (see examine_variable)
3144  *      *other: gets other clause argument, aggressively reduced to a constant
3145  *      *varonleft: set TRUE if variable is on the left, FALSE if on the right
3146  *
3147  * Returns TRUE if a variable is identified, otherwise FALSE.
3148  *
3149  * Note: if there are Vars on both sides of the clause, we must fail, because
3150  * callers are expecting that the other side will act like a pseudoconstant.
3151  */
3152 bool
3153 get_restriction_variable(PlannerInfo *root, List *args, int varRelid,
3154                                                  VariableStatData *vardata, Node **other,
3155                                                  bool *varonleft)
3156 {
3157         Node       *left,
3158                            *right;
3159         VariableStatData rdata;
3160
3161         /* Fail if not a binary opclause (probably shouldn't happen) */
3162         if (list_length(args) != 2)
3163                 return false;
3164
3165         left = (Node *) linitial(args);
3166         right = (Node *) lsecond(args);
3167
3168         /*
3169          * Examine both sides.  Note that when varRelid is nonzero, Vars of other
3170          * relations will be treated as pseudoconstants.
3171          */
3172         examine_variable(root, left, varRelid, vardata);
3173         examine_variable(root, right, varRelid, &rdata);
3174
3175         /*
3176          * If one side is a variable and the other not, we win.
3177          */
3178         if (vardata->rel && rdata.rel == NULL)
3179         {
3180                 *varonleft = true;
3181                 *other = estimate_expression_value(rdata.var);
3182                 /* Assume we need no ReleaseVariableStats(rdata) here */
3183                 return true;
3184         }
3185
3186         if (vardata->rel == NULL && rdata.rel)
3187         {
3188                 *varonleft = false;
3189                 *other = estimate_expression_value(vardata->var);
3190                 /* Assume we need no ReleaseVariableStats(*vardata) here */
3191                 *vardata = rdata;
3192                 return true;
3193         }
3194
3195         /* Ooops, clause has wrong structure (probably var op var) */
3196         ReleaseVariableStats(*vardata);
3197         ReleaseVariableStats(rdata);
3198
3199         return false;
3200 }
3201
3202 /*
3203  * get_join_variables
3204  *              Apply examine_variable() to each side of a join clause.
3205  */
3206 static void
3207 get_join_variables(PlannerInfo *root, List *args,
3208                                    VariableStatData *vardata1, VariableStatData *vardata2)
3209 {
3210         Node       *left,
3211                            *right;
3212
3213         if (list_length(args) != 2)
3214                 elog(ERROR, "join operator should take two arguments");
3215
3216         left = (Node *) linitial(args);
3217         right = (Node *) lsecond(args);
3218
3219         examine_variable(root, left, 0, vardata1);
3220         examine_variable(root, right, 0, vardata2);
3221 }
3222
3223 /*
3224  * examine_variable
3225  *              Try to look up statistical data about an expression.
3226  *              Fill in a VariableStatData struct to describe the expression.
3227  *
3228  * Inputs:
3229  *      root: the planner info
3230  *      node: the expression tree to examine
3231  *      varRelid: see specs for restriction selectivity functions
3232  *
3233  * Outputs: *vardata is filled as follows:
3234  *      var: the input expression (with any binary relabeling stripped, if
3235  *              it is or contains a variable; but otherwise the type is preserved)
3236  *      rel: RelOptInfo for relation containing variable; NULL if expression
3237  *              contains no Vars (NOTE this could point to a RelOptInfo of a
3238  *              subquery, not one in the current query).
3239  *      statsTuple: the pg_statistic entry for the variable, if one exists;
3240  *              otherwise NULL.
3241  *      vartype: exposed type of the expression; this should always match
3242  *              the declared input type of the operator we are estimating for.
3243  *      atttype, atttypmod: type data to pass to get_attstatsslot().  This is
3244  *              commonly the same as the exposed type of the variable argument,
3245  *              but can be different in binary-compatible-type cases.
3246  *
3247  * Caller is responsible for doing ReleaseVariableStats() before exiting.
3248  */
3249 static void
3250 examine_variable(PlannerInfo *root, Node *node, int varRelid,
3251                                  VariableStatData *vardata)
3252 {
3253         Node       *basenode;
3254         Relids          varnos;
3255         RelOptInfo *onerel;
3256
3257         /* Make sure we don't return dangling pointers in vardata */
3258         MemSet(vardata, 0, sizeof(VariableStatData));
3259
3260         /* Save the exposed type of the expression */
3261         vardata->vartype = exprType(node);
3262
3263         /* Look inside any binary-compatible relabeling */
3264
3265         if (IsA(node, RelabelType))
3266                 basenode = (Node *) ((RelabelType *) node)->arg;
3267         else
3268                 basenode = node;
3269
3270         /* Fast path for a simple Var */
3271
3272         if (IsA(basenode, Var) &&
3273                 (varRelid == 0 || varRelid == ((Var *) basenode)->varno))
3274         {
3275                 Var                *var = (Var *) basenode;
3276                 Oid                     relid;
3277
3278                 vardata->var = basenode;        /* return Var without relabeling */
3279                 vardata->rel = find_base_rel(root, var->varno);
3280                 vardata->atttype = var->vartype;
3281                 vardata->atttypmod = var->vartypmod;
3282
3283                 relid = getrelid(var->varno, root->parse->rtable);
3284
3285                 if (OidIsValid(relid))
3286                 {
3287                         vardata->statsTuple = SearchSysCache(STATRELATT,
3288                                                                                                  ObjectIdGetDatum(relid),
3289                                                                                                  Int16GetDatum(var->varattno),
3290                                                                                                  0, 0);
3291                 }
3292                 else
3293                 {
3294                         /*
3295                          * XXX This means the Var comes from a JOIN or sub-SELECT. Later
3296                          * add code to dig down into the join etc and see if we can trace
3297                          * the variable to something with stats.  (But beware of
3298                          * sub-SELECTs with DISTINCT/GROUP BY/etc.      Perhaps there are no
3299                          * cases where this would really be useful, because we'd have
3300                          * flattened the subselect if it is??)
3301                          */
3302                 }
3303
3304                 return;
3305         }
3306
3307         /*
3308          * Okay, it's a more complicated expression.  Determine variable
3309          * membership.  Note that when varRelid isn't zero, only vars of that
3310          * relation are considered "real" vars.
3311          */
3312         varnos = pull_varnos(basenode);
3313
3314         onerel = NULL;
3315
3316         switch (bms_membership(varnos))
3317         {
3318                 case BMS_EMPTY_SET:
3319                         /* No Vars at all ... must be pseudo-constant clause */
3320                         break;
3321                 case BMS_SINGLETON:
3322                         if (varRelid == 0 || bms_is_member(varRelid, varnos))
3323                         {
3324                                 onerel = find_base_rel(root,
3325                                            (varRelid ? varRelid : bms_singleton_member(varnos)));
3326                                 vardata->rel = onerel;
3327                                 node = basenode;        /* strip any relabeling */
3328                         }
3329                         /* else treat it as a constant */
3330                         break;
3331                 case BMS_MULTIPLE:
3332                         if (varRelid == 0)
3333                         {
3334                                 /* treat it as a variable of a join relation */
3335                                 vardata->rel = find_join_rel(root, varnos);
3336                                 node = basenode;        /* strip any relabeling */
3337                         }
3338                         else if (bms_is_member(varRelid, varnos))
3339                         {
3340                                 /* ignore the vars belonging to other relations */
3341                                 vardata->rel = find_base_rel(root, varRelid);
3342                                 node = basenode;        /* strip any relabeling */
3343                                 /* note: no point in expressional-index search here */
3344                         }
3345                         /* else treat it as a constant */
3346                         break;
3347         }
3348
3349         bms_free(varnos);
3350
3351         vardata->var = node;
3352         vardata->atttype = exprType(node);
3353         vardata->atttypmod = exprTypmod(node);
3354
3355         if (onerel)
3356         {
3357                 /*
3358                  * We have an expression in vars of a single relation.  Try to match
3359                  * it to expressional index columns, in hopes of finding some
3360                  * statistics.
3361                  *
3362                  * XXX it's conceivable that there are multiple matches with different
3363                  * index opclasses; if so, we need to pick one that matches the
3364                  * operator we are estimating for.      FIXME later.
3365                  */
3366                 ListCell   *ilist;
3367
3368                 foreach(ilist, onerel->indexlist)
3369                 {
3370                         IndexOptInfo *index = (IndexOptInfo *) lfirst(ilist);
3371                         ListCell   *indexpr_item;
3372                         int                     pos;
3373
3374                         indexpr_item = list_head(index->indexprs);
3375                         if (indexpr_item == NULL)
3376                                 continue;               /* no expressions here... */
3377
3378                         /*
3379                          * Ignore partial indexes since they probably don't reflect
3380                          * whole-relation statistics.  Possibly reconsider this later.
3381                          */
3382                         if (index->indpred)
3383                                 continue;
3384
3385                         for (pos = 0; pos < index->ncolumns; pos++)
3386                         {
3387                                 if (index->indexkeys[pos] == 0)
3388                                 {
3389                                         Node       *indexkey;
3390
3391                                         if (indexpr_item == NULL)
3392                                                 elog(ERROR, "too few entries in indexprs list");
3393                                         indexkey = (Node *) lfirst(indexpr_item);
3394                                         if (indexkey && IsA(indexkey, RelabelType))
3395                                                 indexkey = (Node *) ((RelabelType *) indexkey)->arg;
3396                                         if (equal(node, indexkey))
3397                                         {
3398                                                 /*
3399                                                  * Found a match ... is it a unique index? Tests here
3400                                                  * should match has_unique_index().
3401                                                  */
3402                                                 if (index->unique &&
3403                                                         index->ncolumns == 1 &&
3404                                                         index->indpred == NIL)
3405                                                         vardata->isunique = true;
3406                                                 /* Has it got stats? */
3407                                                 vardata->statsTuple = SearchSysCache(STATRELATT,
3408                                                                                    ObjectIdGetDatum(index->indexoid),
3409                                                                                                           Int16GetDatum(pos + 1),
3410                                                                                                                          0, 0);
3411                                                 if (vardata->statsTuple)
3412                                                         break;
3413                                         }
3414                                         indexpr_item = lnext(indexpr_item);
3415                                 }
3416                         }
3417                         if (vardata->statsTuple)
3418                                 break;
3419                 }
3420         }
3421 }
3422
3423 /*
3424  * get_variable_numdistinct
3425  *        Estimate the number of distinct values of a variable.
3426  *
3427  * vardata: results of examine_variable
3428  *
3429  * NB: be careful to produce an integral result, since callers may compare
3430  * the result to exact integer counts.
3431  */
3432 static double
3433 get_variable_numdistinct(VariableStatData *vardata)
3434 {
3435         double          stadistinct;
3436         double          ntuples;
3437
3438         /*
3439          * Determine the stadistinct value to use.      There are cases where we can
3440          * get an estimate even without a pg_statistic entry, or can get a better
3441          * value than is in pg_statistic.
3442          */
3443         if (HeapTupleIsValid(vardata->statsTuple))
3444         {
3445                 /* Use the pg_statistic entry */
3446                 Form_pg_statistic stats;
3447
3448                 stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
3449                 stadistinct = stats->stadistinct;
3450         }
3451         else if (vardata->vartype == BOOLOID)
3452         {
3453                 /*
3454                  * Special-case boolean columns: presumably, two distinct values.
3455                  *
3456                  * Are there any other datatypes we should wire in special estimates
3457                  * for?
3458                  */
3459                 stadistinct = 2.0;
3460         }
3461         else
3462         {
3463                 /*
3464                  * We don't keep statistics for system columns, but in some cases we
3465                  * can infer distinctness anyway.
3466                  */
3467                 if (vardata->var && IsA(vardata->var, Var))
3468                 {
3469                         switch (((Var *) vardata->var)->varattno)
3470                         {
3471                                 case ObjectIdAttributeNumber:
3472                                 case SelfItemPointerAttributeNumber:
3473                                         stadistinct = -1.0; /* unique */
3474                                         break;
3475                                 case TableOidAttributeNumber:
3476                                         stadistinct = 1.0;      /* only 1 value */
3477                                         break;
3478                                 default:
3479                                         stadistinct = 0.0;      /* means "unknown" */
3480                                         break;
3481                         }
3482                 }
3483                 else
3484                         stadistinct = 0.0;      /* means "unknown" */
3485
3486                 /*
3487                  * XXX consider using estimate_num_groups on expressions?
3488                  */
3489         }
3490
3491         /*
3492          * If there is a unique index for the variable, assume it is unique no
3493          * matter what pg_statistic says (the statistics could be out of date).
3494          * Can skip search if we already think it's unique.
3495          */
3496         if (stadistinct != -1.0)
3497         {
3498                 if (vardata->isunique)
3499                         stadistinct = -1.0;
3500                 else if (vardata->var && IsA(vardata->var, Var) &&
3501                                  vardata->rel &&
3502                                  has_unique_index(vardata->rel,
3503                                                                   ((Var *) vardata->var)->varattno))
3504                         stadistinct = -1.0;
3505         }
3506
3507         /*
3508          * If we had an absolute estimate, use that.
3509          */
3510         if (stadistinct > 0.0)
3511                 return stadistinct;
3512
3513         /*
3514          * Otherwise we need to get the relation size; punt if not available.
3515          */
3516         if (vardata->rel == NULL)
3517                 return DEFAULT_NUM_DISTINCT;
3518         ntuples = vardata->rel->tuples;
3519         if (ntuples <= 0.0)
3520                 return DEFAULT_NUM_DISTINCT;
3521
3522         /*
3523          * If we had a relative estimate, use that.
3524          */
3525         if (stadistinct < 0.0)
3526                 return floor((-stadistinct * ntuples) + 0.5);
3527
3528         /*
3529          * With no data, estimate ndistinct = ntuples if the table is small, else
3530          * use default.
3531          */
3532         if (ntuples < DEFAULT_NUM_DISTINCT)
3533                 return ntuples;
3534
3535         return DEFAULT_NUM_DISTINCT;
3536 }
3537
3538 /*
3539  * get_variable_maximum
3540  *              Estimate the maximum value of the specified variable.
3541  *              If successful, store value in *max and return TRUE.
3542  *              If no data available, return FALSE.
3543  *
3544  * sortop is the "<" comparison operator to use.  (To extract the
3545  * minimum instead of the maximum, just pass the ">" operator instead.)
3546  */
3547 static bool
3548 get_variable_maximum(PlannerInfo *root, VariableStatData *vardata,
3549                                          Oid sortop, Datum *max)
3550 {
3551         Datum           tmax = 0;
3552         bool            have_max = false;
3553         Form_pg_statistic stats;
3554         int16           typLen;
3555         bool            typByVal;
3556         Datum      *values;
3557         int                     nvalues;
3558         int                     i;
3559
3560         if (!HeapTupleIsValid(vardata->statsTuple))
3561         {
3562                 /* no stats available, so default result */
3563                 return false;
3564         }
3565         stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
3566
3567         get_typlenbyval(vardata->atttype, &typLen, &typByVal);
3568
3569         /*
3570          * If there is a histogram, grab the last or first value as appropriate.
3571          *
3572          * If there is a histogram that is sorted with some other operator than
3573          * the one we want, fail --- this suggests that there is data we can't
3574          * use.
3575          */
3576         if (get_attstatsslot(vardata->statsTuple,
3577                                                  vardata->atttype, vardata->atttypmod,
3578                                                  STATISTIC_KIND_HISTOGRAM, sortop,
3579                                                  &values, &nvalues,
3580                                                  NULL, NULL))
3581         {
3582                 if (nvalues > 0)
3583                 {
3584                         tmax = datumCopy(values[nvalues - 1], typByVal, typLen);
3585                         have_max = true;
3586                 }
3587                 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
3588         }
3589         else
3590         {
3591                 Oid                     rsortop = get_commutator(sortop);
3592
3593                 if (OidIsValid(rsortop) &&
3594                         get_attstatsslot(vardata->statsTuple,
3595                                                          vardata->atttype, vardata->atttypmod,
3596                                                          STATISTIC_KIND_HISTOGRAM, rsortop,
3597                                                          &values, &nvalues,
3598                                                          NULL, NULL))
3599                 {
3600                         if (nvalues > 0)
3601                         {
3602                                 tmax = datumCopy(values[0], typByVal, typLen);
3603                                 have_max = true;
3604                         }
3605                         free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
3606                 }
3607                 else if (get_attstatsslot(vardata->statsTuple,
3608                                                                   vardata->atttype, vardata->atttypmod,
3609                                                                   STATISTIC_KIND_HISTOGRAM, InvalidOid,
3610                                                                   &values, &nvalues,
3611                                                                   NULL, NULL))
3612                 {
3613                         free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
3614                         return false;
3615                 }
3616         }
3617
3618         /*
3619          * If we have most-common-values info, look for a large MCV.  This is
3620          * needed even if we also have a histogram, since the histogram excludes
3621          * the MCVs.  However, usually the MCVs will not be the extreme values, so
3622          * avoid unnecessary data copying.
3623          */
3624         if (get_attstatsslot(vardata->statsTuple,
3625                                                  vardata->atttype, vardata->atttypmod,
3626                                                  STATISTIC_KIND_MCV, InvalidOid,
3627                                                  &values, &nvalues,
3628                                                  NULL, NULL))
3629         {
3630                 bool            large_mcv = false;
3631                 FmgrInfo        opproc;
3632
3633                 fmgr_info(get_opcode(sortop), &opproc);
3634
3635                 for (i = 0; i < nvalues; i++)
3636                 {
3637                         if (!have_max)
3638                         {
3639                                 tmax = values[i];
3640                                 large_mcv = have_max = true;
3641                         }
3642                         else if (DatumGetBool(FunctionCall2(&opproc, tmax, values[i])))
3643                         {
3644                                 tmax = values[i];
3645                                 large_mcv = true;
3646                         }
3647                 }
3648                 if (large_mcv)
3649                         tmax = datumCopy(tmax, typByVal, typLen);
3650                 free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
3651         }
3652
3653         *max = tmax;
3654         return have_max;
3655 }
3656
3657
3658 /*-------------------------------------------------------------------------
3659  *
3660  * Pattern analysis functions
3661  *
3662  * These routines support analysis of LIKE and regular-expression patterns
3663  * by the planner/optimizer.  It's important that they agree with the
3664  * regular-expression code in backend/regex/ and the LIKE code in
3665  * backend/utils/adt/like.c.
3666  *
3667  * Note that the prefix-analysis functions are called from
3668  * backend/optimizer/path/indxpath.c as well as from routines in this file.
3669  *
3670  *-------------------------------------------------------------------------
3671  */
3672
3673 /*
3674  * Extract the fixed prefix, if any, for a pattern.
3675  *
3676  * *prefix is set to a palloc'd prefix string (in the form of a Const node),
3677  *      or to NULL if no fixed prefix exists for the pattern.
3678  * *rest is set to a palloc'd Const representing the remainder of the pattern
3679  *      after the portion describing the fixed prefix.
3680  * Each of these has the same type (TEXT or BYTEA) as the given pattern Const.
3681  *
3682  * The return value distinguishes no fixed prefix, a partial prefix,
3683  * or an exact-match-only pattern.
3684  */
3685
3686 static Pattern_Prefix_Status
3687 like_fixed_prefix(Const *patt_const, bool case_insensitive,
3688                                   Const **prefix_const, Const **rest_const)
3689 {
3690         char       *match;
3691         char       *patt;
3692         int                     pattlen;
3693         char       *rest;
3694         Oid                     typeid = patt_const->consttype;
3695         int                     pos,
3696                                 match_pos;
3697
3698         /* the right-hand const is type text or bytea */
3699         Assert(typeid == BYTEAOID || typeid == TEXTOID);
3700
3701         if (typeid == BYTEAOID && case_insensitive)
3702                 ereport(ERROR,
3703                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
3704                    errmsg("case insensitive matching not supported on type bytea")));
3705
3706         if (typeid != BYTEAOID)
3707         {
3708                 patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
3709                 pattlen = strlen(patt);
3710         }
3711         else
3712         {
3713                 bytea      *bstr = DatumGetByteaP(patt_const->constvalue);
3714
3715                 pattlen = VARSIZE(bstr) - VARHDRSZ;
3716                 patt = (char *) palloc(pattlen);
3717                 memcpy(patt, VARDATA(bstr), pattlen);
3718                 if ((Pointer) bstr != DatumGetPointer(patt_const->constvalue))
3719                         pfree(bstr);
3720         }
3721
3722         match = palloc(pattlen + 1);
3723         match_pos = 0;
3724         for (pos = 0; pos < pattlen; pos++)
3725         {
3726                 /* % and _ are wildcard characters in LIKE */
3727                 if (patt[pos] == '%' ||
3728                         patt[pos] == '_')
3729                         break;
3730
3731                 /* Backslash escapes the next character */
3732                 if (patt[pos] == '\\')
3733                 {
3734                         pos++;
3735                         if (pos >= pattlen)
3736                                 break;
3737                 }
3738
3739                 /*
3740                  * XXX I suspect isalpha() is not an adequately locale-sensitive test
3741                  * for characters that can vary under case folding?
3742                  */
3743                 if (case_insensitive && isalpha((unsigned char) patt[pos]))
3744                         break;
3745
3746                 /*
3747                  * NOTE: this code used to think that %% meant a literal %, but
3748                  * textlike() itself does not think that, and the SQL92 spec doesn't
3749                  * say any such thing either.
3750                  */
3751                 match[match_pos++] = patt[pos];
3752         }
3753
3754         match[match_pos] = '\0';
3755         rest = &patt[pos];
3756
3757         if (typeid != BYTEAOID)
3758         {
3759                 *prefix_const = string_to_const(match, typeid);
3760                 *rest_const = string_to_const(rest, typeid);
3761         }
3762         else
3763         {
3764                 *prefix_const = string_to_bytea_const(match, match_pos);
3765                 *rest_const = string_to_bytea_const(rest, pattlen - pos);
3766         }
3767
3768         pfree(patt);
3769         pfree(match);
3770
3771         /* in LIKE, an empty pattern is an exact match! */
3772         if (pos == pattlen)
3773                 return Pattern_Prefix_Exact;    /* reached end of pattern, so exact */
3774
3775         if (match_pos > 0)
3776                 return Pattern_Prefix_Partial;
3777
3778         return Pattern_Prefix_None;
3779 }
3780
3781 static Pattern_Prefix_Status
3782 regex_fixed_prefix(Const *patt_const, bool case_insensitive,
3783                                    Const **prefix_const, Const **rest_const)
3784 {
3785         char       *match;
3786         int                     pos,
3787                                 match_pos,
3788                                 prev_pos,
3789                                 prev_match_pos,
3790                                 paren_depth;
3791         char       *patt;
3792         char       *rest;
3793         Oid                     typeid = patt_const->consttype;
3794
3795         /*
3796          * Should be unnecessary, there are no bytea regex operators defined. As
3797          * such, it should be noted that the rest of this function has *not* been
3798          * made safe for binary (possibly NULL containing) strings.
3799          */
3800         if (typeid == BYTEAOID)
3801                 ereport(ERROR,
3802                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
3803                  errmsg("regular-expression matching not supported on type bytea")));
3804
3805         /* the right-hand const is type text for all of these */
3806         patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
3807
3808         /* Pattern must be anchored left */
3809         if (patt[0] != '^')
3810         {
3811                 rest = patt;
3812
3813                 *prefix_const = NULL;
3814                 *rest_const = string_to_const(rest, typeid);
3815
3816                 return Pattern_Prefix_None;
3817         }
3818
3819         /*
3820          * If unquoted | is present at paren level 0 in pattern, then there are
3821          * multiple alternatives for the start of the string.
3822          */
3823         paren_depth = 0;
3824         for (pos = 1; patt[pos]; pos++)
3825         {
3826                 if (patt[pos] == '|' && paren_depth == 0)
3827                 {
3828                         rest = patt;
3829
3830                         *prefix_const = NULL;
3831                         *rest_const = string_to_const(rest, typeid);
3832
3833                         return Pattern_Prefix_None;
3834                 }
3835                 else if (patt[pos] == '(')
3836                         paren_depth++;
3837                 else if (patt[pos] == ')' && paren_depth > 0)
3838                         paren_depth--;
3839                 else if (patt[pos] == '\\')
3840                 {
3841                         /* backslash quotes the next character */
3842                         pos++;
3843                         if (patt[pos] == '\0')
3844                                 break;
3845                 }
3846         }
3847
3848         /* OK, allocate space for pattern */
3849         match = palloc(strlen(patt) + 1);
3850         prev_match_pos = match_pos = 0;
3851
3852         /* note start at pos 1 to skip leading ^ */
3853         for (prev_pos = pos = 1; patt[pos];)
3854         {
3855                 int                     len;
3856
3857                 /*
3858                  * Check for characters that indicate multiple possible matches here.
3859                  * XXX I suspect isalpha() is not an adequately locale-sensitive test
3860                  * for characters that can vary under case folding?
3861                  */
3862                 if (patt[pos] == '.' ||
3863                         patt[pos] == '(' ||
3864                         patt[pos] == '[' ||
3865                         patt[pos] == '$' ||
3866                         (case_insensitive && isalpha((unsigned char) patt[pos])))
3867                         break;
3868
3869                 /*
3870                  * In AREs, backslash followed by alphanumeric is an escape, not a
3871                  * quoted character.  Must treat it as having multiple possible
3872                  * matches.
3873                  */
3874                 if (patt[pos] == '\\' && isalnum((unsigned char) patt[pos + 1]))
3875                         break;
3876
3877                 /*
3878                  * Check for quantifiers.  Except for +, this means the preceding
3879                  * character is optional, so we must remove it from the prefix too!
3880                  */
3881                 if (patt[pos] == '*' ||
3882                         patt[pos] == '?' ||
3883                         patt[pos] == '{')
3884                 {
3885                         match_pos = prev_match_pos;
3886                         pos = prev_pos;
3887                         break;
3888                 }
3889                 if (patt[pos] == '+')
3890                 {
3891                         pos = prev_pos;
3892                         break;
3893                 }
3894                 if (patt[pos] == '\\')
3895                 {
3896                         /* backslash quotes the next character */
3897                         pos++;
3898                         if (patt[pos] == '\0')
3899                                 break;
3900                 }
3901                 /* save position in case we need to back up on next loop cycle */
3902                 prev_match_pos = match_pos;
3903                 prev_pos = pos;
3904                 /* must use encoding-aware processing here */
3905                 len = pg_mblen(&patt[pos]);
3906                 memcpy(&match[match_pos], &patt[pos], len);
3907                 match_pos += len;
3908                 pos += len;
3909         }
3910
3911         match[match_pos] = '\0';
3912         rest = &patt[pos];
3913
3914         if (patt[pos] == '$' && patt[pos + 1] == '\0')
3915         {
3916                 rest = &patt[pos + 1];
3917
3918                 *prefix_const = string_to_const(match, typeid);
3919                 *rest_const = string_to_const(rest, typeid);
3920
3921                 pfree(patt);
3922                 pfree(match);
3923
3924                 return Pattern_Prefix_Exact;    /* pattern specifies exact match */
3925         }
3926
3927         *prefix_const = string_to_const(match, typeid);
3928         *rest_const = string_to_const(rest, typeid);
3929
3930         pfree(patt);
3931         pfree(match);
3932
3933         if (match_pos > 0)
3934                 return Pattern_Prefix_Partial;
3935
3936         return Pattern_Prefix_None;
3937 }
3938
3939 Pattern_Prefix_Status
3940 pattern_fixed_prefix(Const *patt, Pattern_Type ptype,
3941                                          Const **prefix, Const **rest)
3942 {
3943         Pattern_Prefix_Status result;
3944
3945         switch (ptype)
3946         {
3947                 case Pattern_Type_Like:
3948                         result = like_fixed_prefix(patt, false, prefix, rest);
3949                         break;
3950                 case Pattern_Type_Like_IC:
3951                         result = like_fixed_prefix(patt, true, prefix, rest);
3952                         break;
3953                 case Pattern_Type_Regex:
3954                         result = regex_fixed_prefix(patt, false, prefix, rest);
3955                         break;
3956                 case Pattern_Type_Regex_IC:
3957                         result = regex_fixed_prefix(patt, true, prefix, rest);
3958                         break;
3959                 default:
3960                         elog(ERROR, "unrecognized ptype: %d", (int) ptype);
3961                         result = Pattern_Prefix_None;           /* keep compiler quiet */
3962                         break;
3963         }
3964         return result;
3965 }
3966
3967 /*
3968  * Estimate the selectivity of a fixed prefix for a pattern match.
3969  *
3970  * A fixed prefix "foo" is estimated as the selectivity of the expression
3971  * "variable >= 'foo' AND variable < 'fop'" (see also indxpath.c).
3972  *
3973  * The selectivity estimate is with respect to the portion of the column
3974  * population represented by the histogram --- the caller must fold this
3975  * together with info about MCVs and NULLs.
3976  *
3977  * We use the >= and < operators from the specified btree opclass to do the
3978  * estimation.  The given variable and Const must be of the associated
3979  * datatype.
3980  *
3981  * XXX Note: we make use of the upper bound to estimate operator selectivity
3982  * even if the locale is such that we cannot rely on the upper-bound string.
3983  * The selectivity only needs to be approximately right anyway, so it seems
3984  * more useful to use the upper-bound code than not.
3985  */
3986 static Selectivity
3987 prefix_selectivity(VariableStatData *vardata, Oid opclass, Const *prefixcon)
3988 {
3989         Selectivity prefixsel;
3990         Oid                     cmpopr;
3991         FmgrInfo        opproc;
3992         Const      *greaterstrcon;
3993
3994         cmpopr = get_opclass_member(opclass, InvalidOid,
3995                                                                 BTGreaterEqualStrategyNumber);
3996         if (cmpopr == InvalidOid)
3997                 elog(ERROR, "no >= operator for opclass %u", opclass);
3998         fmgr_info(get_opcode(cmpopr), &opproc);
3999
4000         prefixsel = ineq_histogram_selectivity(vardata, &opproc, true,
4001                                                                                    prefixcon->constvalue,
4002                                                                                    prefixcon->consttype);
4003
4004         if (prefixsel <= 0.0)
4005         {
4006                 /* No histogram is present ... return a suitable default estimate */
4007                 return 0.005;
4008         }
4009
4010         /*-------
4011          * If we can create a string larger than the prefix, say
4012          *      "x < greaterstr".
4013          *-------
4014          */
4015         greaterstrcon = make_greater_string(prefixcon);
4016         if (greaterstrcon)
4017         {
4018                 Selectivity topsel;
4019
4020                 cmpopr = get_opclass_member(opclass, InvalidOid,
4021                                                                         BTLessStrategyNumber);
4022                 if (cmpopr == InvalidOid)
4023                         elog(ERROR, "no < operator for opclass %u", opclass);
4024                 fmgr_info(get_opcode(cmpopr), &opproc);
4025
4026                 topsel = ineq_histogram_selectivity(vardata, &opproc, false,
4027                                                                                         greaterstrcon->constvalue,
4028                                                                                         greaterstrcon->consttype);
4029
4030                 /* ineq_histogram_selectivity worked before, it shouldn't fail now */
4031                 Assert(topsel > 0.0);
4032
4033                 /*
4034                  * Merge the two selectivities in the same way as for a range query
4035                  * (see clauselist_selectivity()).  Note that we don't need to worry
4036                  * about double-exclusion of nulls, since ineq_histogram_selectivity
4037                  * doesn't count those anyway.
4038                  */
4039                 prefixsel = topsel + prefixsel - 1.0;
4040
4041                 /*
4042                  * A zero or negative prefixsel should be converted into a small
4043                  * positive value; we probably are dealing with a very tight range
4044                  * and got a bogus result due to roundoff errors.
4045                  */
4046                 if (prefixsel <= 0.0)
4047                         prefixsel = 1.0e-10;
4048         }
4049
4050         return prefixsel;
4051 }
4052
4053
4054 /*
4055  * Estimate the selectivity of a pattern of the specified type.
4056  * Note that any fixed prefix of the pattern will have been removed already.
4057  *
4058  * For now, we use a very simplistic approach: fixed characters reduce the
4059  * selectivity a good deal, character ranges reduce it a little,
4060  * wildcards (such as % for LIKE or .* for regex) increase it.
4061  */
4062
4063 #define FIXED_CHAR_SEL  0.20    /* about 1/5 */
4064 #define CHAR_RANGE_SEL  0.25
4065 #define ANY_CHAR_SEL    0.9             /* not 1, since it won't match end-of-string */
4066 #define FULL_WILDCARD_SEL 5.0
4067 #define PARTIAL_WILDCARD_SEL 2.0
4068
4069 static Selectivity
4070 like_selectivity(Const *patt_const, bool case_insensitive)
4071 {
4072         Selectivity sel = 1.0;
4073         int                     pos;
4074         Oid                     typeid = patt_const->consttype;
4075         char       *patt;
4076         int                     pattlen;
4077
4078         /* the right-hand const is type text or bytea */
4079         Assert(typeid == BYTEAOID || typeid == TEXTOID);
4080
4081         if (typeid == BYTEAOID && case_insensitive)
4082                 ereport(ERROR,
4083                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
4084                    errmsg("case insensitive matching not supported on type bytea")));
4085
4086         if (typeid != BYTEAOID)
4087         {
4088                 patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
4089                 pattlen = strlen(patt);
4090         }
4091         else
4092         {
4093                 bytea      *bstr = DatumGetByteaP(patt_const->constvalue);
4094
4095                 pattlen = VARSIZE(bstr) - VARHDRSZ;
4096                 patt = (char *) palloc(pattlen);
4097                 memcpy(patt, VARDATA(bstr), pattlen);
4098                 if ((Pointer) bstr != DatumGetPointer(patt_const->constvalue))
4099                         pfree(bstr);
4100         }
4101
4102         /* Skip any leading wildcard; it's already factored into initial sel */
4103         for (pos = 0; pos < pattlen; pos++)
4104         {
4105                 if (patt[pos] != '%' && patt[pos] != '_')
4106                         break;
4107         }
4108
4109         for (; pos < pattlen; pos++)
4110         {
4111                 /* % and _ are wildcard characters in LIKE */
4112                 if (patt[pos] == '%')
4113                         sel *= FULL_WILDCARD_SEL;
4114                 else if (patt[pos] == '_')
4115                         sel *= ANY_CHAR_SEL;
4116                 else if (patt[pos] == '\\')
4117                 {
4118                         /* Backslash quotes the next character */
4119                         pos++;
4120                         if (pos >= pattlen)
4121                                 break;
4122                         sel *= FIXED_CHAR_SEL;
4123                 }
4124                 else
4125                         sel *= FIXED_CHAR_SEL;
4126         }
4127         /* Could get sel > 1 if multiple wildcards */
4128         if (sel > 1.0)
4129                 sel = 1.0;
4130
4131         pfree(patt);
4132         return sel;
4133 }
4134
4135 static Selectivity
4136 regex_selectivity_sub(char *patt, int pattlen, bool case_insensitive)
4137 {
4138         Selectivity sel = 1.0;
4139         int                     paren_depth = 0;
4140         int                     paren_pos = 0;  /* dummy init to keep compiler quiet */
4141         int                     pos;
4142
4143         for (pos = 0; pos < pattlen; pos++)
4144         {
4145                 if (patt[pos] == '(')
4146                 {
4147                         if (paren_depth == 0)
4148                                 paren_pos = pos;        /* remember start of parenthesized item */
4149                         paren_depth++;
4150                 }
4151                 else if (patt[pos] == ')' && paren_depth > 0)
4152                 {
4153                         paren_depth--;
4154                         if (paren_depth == 0)
4155                                 sel *= regex_selectivity_sub(patt + (paren_pos + 1),
4156                                                                                          pos - (paren_pos + 1),
4157                                                                                          case_insensitive);
4158                 }
4159                 else if (patt[pos] == '|' && paren_depth == 0)
4160                 {
4161                         /*
4162                          * If unquoted | is present at paren level 0 in pattern, we have
4163                          * multiple alternatives; sum their probabilities.
4164                          */
4165                         sel += regex_selectivity_sub(patt + (pos + 1),
4166                                                                                  pattlen - (pos + 1),
4167                                                                                  case_insensitive);
4168                         break;                          /* rest of pattern is now processed */
4169                 }
4170                 else if (patt[pos] == '[')
4171                 {
4172                         bool            negclass = false;
4173
4174                         if (patt[++pos] == '^')
4175                         {
4176                                 negclass = true;
4177                                 pos++;
4178                         }
4179                         if (patt[pos] == ']')           /* ']' at start of class is not
4180                                                                                  * special */
4181                                 pos++;
4182                         while (pos < pattlen && patt[pos] != ']')
4183                                 pos++;
4184                         if (paren_depth == 0)
4185                                 sel *= (negclass ? (1.0 - CHAR_RANGE_SEL) : CHAR_RANGE_SEL);
4186                 }
4187                 else if (patt[pos] == '.')
4188                 {
4189                         if (paren_depth == 0)
4190                                 sel *= ANY_CHAR_SEL;
4191                 }
4192                 else if (patt[pos] == '*' ||
4193                                  patt[pos] == '?' ||
4194                                  patt[pos] == '+')
4195                 {
4196                         /* Ought to be smarter about quantifiers... */
4197                         if (paren_depth == 0)
4198                                 sel *= PARTIAL_WILDCARD_SEL;
4199                 }
4200                 else if (patt[pos] == '{')
4201                 {
4202                         while (pos < pattlen && patt[pos] != '}')
4203                                 pos++;
4204                         if (paren_depth == 0)
4205                                 sel *= PARTIAL_WILDCARD_SEL;
4206                 }
4207                 else if (patt[pos] == '\\')
4208                 {
4209                         /* backslash quotes the next character */
4210                         pos++;
4211                         if (pos >= pattlen)
4212                                 break;
4213                         if (paren_depth == 0)
4214                                 sel *= FIXED_CHAR_SEL;
4215                 }
4216                 else
4217                 {
4218                         if (paren_depth == 0)
4219                                 sel *= FIXED_CHAR_SEL;
4220                 }
4221         }
4222         /* Could get sel > 1 if multiple wildcards */
4223         if (sel > 1.0)
4224                 sel = 1.0;
4225         return sel;
4226 }
4227
4228 static Selectivity
4229 regex_selectivity(Const *patt_const, bool case_insensitive)
4230 {
4231         Selectivity sel;
4232         char       *patt;
4233         int                     pattlen;
4234         Oid                     typeid = patt_const->consttype;
4235
4236         /*
4237          * Should be unnecessary, there are no bytea regex operators defined. As
4238          * such, it should be noted that the rest of this function has *not* been
4239          * made safe for binary (possibly NULL containing) strings.
4240          */
4241         if (typeid == BYTEAOID)
4242                 ereport(ERROR,
4243                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
4244                  errmsg("regular-expression matching not supported on type bytea")));
4245
4246         /* the right-hand const is type text for all of these */
4247         patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
4248         pattlen = strlen(patt);
4249
4250         /* If patt doesn't end with $, consider it to have a trailing wildcard */
4251         if (pattlen > 0 && patt[pattlen - 1] == '$' &&
4252                 (pattlen == 1 || patt[pattlen - 2] != '\\'))
4253         {
4254                 /* has trailing $ */
4255                 sel = regex_selectivity_sub(patt, pattlen - 1, case_insensitive);
4256         }
4257         else
4258         {
4259                 /* no trailing $ */
4260                 sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
4261                 sel *= FULL_WILDCARD_SEL;
4262                 if (sel > 1.0)
4263                         sel = 1.0;
4264         }
4265         return sel;
4266 }
4267
4268 static Selectivity
4269 pattern_selectivity(Const *patt, Pattern_Type ptype)
4270 {
4271         Selectivity result;
4272
4273         switch (ptype)
4274         {
4275                 case Pattern_Type_Like:
4276                         result = like_selectivity(patt, false);
4277                         break;
4278                 case Pattern_Type_Like_IC:
4279                         result = like_selectivity(patt, true);
4280                         break;
4281                 case Pattern_Type_Regex:
4282                         result = regex_selectivity(patt, false);
4283                         break;
4284                 case Pattern_Type_Regex_IC:
4285                         result = regex_selectivity(patt, true);
4286                         break;
4287                 default:
4288                         elog(ERROR, "unrecognized ptype: %d", (int) ptype);
4289                         result = 1.0;           /* keep compiler quiet */
4290                         break;
4291         }
4292         return result;
4293 }
4294
4295
4296 /*
4297  * Try to generate a string greater than the given string or any
4298  * string it is a prefix of.  If successful, return a palloc'd string
4299  * in the form of a Const pointer; else return NULL.
4300  *
4301  * The key requirement here is that given a prefix string, say "foo",
4302  * we must be able to generate another string "fop" that is greater
4303  * than all strings "foobar" starting with "foo".
4304  *
4305  * If we max out the righthand byte, truncate off the last character
4306  * and start incrementing the next.  For example, if "z" were the last
4307  * character in the sort order, then we could produce "foo" as a
4308  * string greater than "fonz".
4309  *
4310  * This could be rather slow in the worst case, but in most cases we
4311  * won't have to try more than one or two strings before succeeding.
4312  *
4313  * NOTE: at present this assumes we are in the C locale, so that simple
4314  * bytewise comparison applies.  However, we might be in a multibyte
4315  * encoding such as UTF8, so we do have to watch out for generating
4316  * invalid encoding sequences.
4317  */
4318 Const *
4319 make_greater_string(const Const *str_const)
4320 {
4321         Oid                     datatype = str_const->consttype;
4322         char       *workstr;
4323         int                     len;
4324
4325         /* Get the string and a modifiable copy */
4326         if (datatype == NAMEOID)
4327         {
4328                 workstr = DatumGetCString(DirectFunctionCall1(nameout,
4329                                                                                                           str_const->constvalue));
4330                 len = strlen(workstr);
4331         }
4332         else if (datatype == BYTEAOID)
4333         {
4334                 bytea      *bstr = DatumGetByteaP(str_const->constvalue);
4335
4336                 len = VARSIZE(bstr) - VARHDRSZ;
4337                 workstr = (char *) palloc(len);
4338                 memcpy(workstr, VARDATA(bstr), len);
4339                 if ((Pointer) bstr != DatumGetPointer(str_const->constvalue))
4340                         pfree(bstr);
4341         }
4342         else
4343         {
4344                 workstr = DatumGetCString(DirectFunctionCall1(textout,
4345                                                                                                           str_const->constvalue));
4346                 len = strlen(workstr);
4347         }
4348
4349         while (len > 0)
4350         {
4351                 unsigned char *lastchar = (unsigned char *) (workstr + len - 1);
4352                 unsigned char savelastchar = *lastchar;
4353
4354                 /*
4355                  * Try to generate a larger string by incrementing the last byte.
4356                  */
4357                 while (*lastchar < (unsigned char) 255)
4358                 {
4359                         Const      *workstr_const;
4360
4361                         (*lastchar)++;
4362
4363                         if (datatype != BYTEAOID)
4364                         {
4365                                 /* do not generate invalid encoding sequences */
4366                                 if (!pg_verifymbstr(workstr, len, true))
4367                                         continue;
4368                                 workstr_const = string_to_const(workstr, datatype);
4369                         }
4370                         else
4371                                 workstr_const = string_to_bytea_const(workstr, len);
4372
4373                         pfree(workstr);
4374                         return workstr_const;
4375                 }
4376
4377                 /* restore last byte so we don't confuse pg_mbcliplen */
4378                 *lastchar = savelastchar;
4379
4380                 /*
4381                  * Truncate off the last character, which might be more than 1 byte,
4382                  * depending on the character encoding.
4383                  */
4384                 if (datatype != BYTEAOID && pg_database_encoding_max_length() > 1)
4385                         len = pg_mbcliplen(workstr, len, len - 1);
4386                 else
4387                         len -= 1;
4388
4389                 if (datatype != BYTEAOID)
4390                         workstr[len] = '\0';
4391         }
4392
4393         /* Failed... */
4394         pfree(workstr);
4395
4396         return NULL;
4397 }
4398
4399 /*
4400  * Generate a Datum of the appropriate type from a C string.
4401  * Note that all of the supported types are pass-by-ref, so the
4402  * returned value should be pfree'd if no longer needed.
4403  */
4404 static Datum
4405 string_to_datum(const char *str, Oid datatype)
4406 {
4407         Assert(str != NULL);
4408
4409         /*
4410          * We cheat a little by assuming that textin() will do for bpchar and
4411          * varchar constants too...
4412          */
4413         if (datatype == NAMEOID)
4414                 return DirectFunctionCall1(namein, CStringGetDatum(str));
4415         else if (datatype == BYTEAOID)
4416                 return DirectFunctionCall1(byteain, CStringGetDatum(str));
4417         else
4418                 return DirectFunctionCall1(textin, CStringGetDatum(str));
4419 }
4420
4421 /*
4422  * Generate a Const node of the appropriate type from a C string.
4423  */
4424 static Const *
4425 string_to_const(const char *str, Oid datatype)
4426 {
4427         Datum           conval = string_to_datum(str, datatype);
4428
4429         return makeConst(datatype, ((datatype == NAMEOID) ? NAMEDATALEN : -1),
4430                                          conval, false, false);
4431 }
4432
4433 /*
4434  * Generate a Const node of bytea type from a binary C string and a length.
4435  */
4436 static Const *
4437 string_to_bytea_const(const char *str, size_t str_len)
4438 {
4439         bytea      *bstr = palloc(VARHDRSZ + str_len);
4440         Datum           conval;
4441
4442         memcpy(VARDATA(bstr), str, str_len);
4443         VARATT_SIZEP(bstr) = VARHDRSZ + str_len;
4444         conval = PointerGetDatum(bstr);
4445
4446         return makeConst(BYTEAOID, -1, conval, false, false);
4447 }
4448
4449 /*-------------------------------------------------------------------------
4450  *
4451  * Index cost estimation functions
4452  *
4453  * genericcostestimate is a general-purpose estimator for use when we
4454  * don't have any better idea about how to estimate.  Index-type-specific
4455  * knowledge can be incorporated in the type-specific routines.
4456  *
4457  * One bit of index-type-specific knowledge we can relatively easily use
4458  * in genericcostestimate is the estimate of the number of index tuples
4459  * visited.  If numIndexTuples is not 0 then it is used as the estimate,
4460  * otherwise we compute a generic estimate.
4461  *
4462  *-------------------------------------------------------------------------
4463  */
4464
4465 static void
4466 genericcostestimate(PlannerInfo *root,
4467                                         IndexOptInfo *index, List *indexQuals,
4468                                         double numIndexTuples,
4469                                         Cost *indexStartupCost,
4470                                         Cost *indexTotalCost,
4471                                         Selectivity *indexSelectivity,
4472                                         double *indexCorrelation)
4473 {
4474         double          numIndexPages;
4475         QualCost        index_qual_cost;
4476         double          qual_op_cost;
4477         double          qual_arg_cost;
4478         List       *selectivityQuals;
4479
4480         /*
4481          * If the index is partial, AND the index predicate with the explicitly
4482          * given indexquals to produce a more accurate idea of the index
4483          * selectivity.  This may produce redundant clauses.  We get rid of exact
4484          * duplicates in the code below.  We expect that most cases of partial
4485          * redundancy (such as "x < 4" from the qual and "x < 5" from the
4486          * predicate) will be recognized and handled correctly by
4487          * clauselist_selectivity().  This assumption is somewhat fragile, since
4488          * it depends on predicate_implied_by() and clauselist_selectivity()
4489          * having similar capabilities, and there are certainly many cases where
4490          * we will end up with a too-low selectivity estimate.  This will bias the
4491          * system in favor of using partial indexes where possible, which is not
4492          * necessarily a bad thing. But it'd be nice to do better someday.
4493          *
4494          * Note that index->indpred and indexQuals are both in implicit-AND form,
4495          * so ANDing them together just takes merging the lists.  However,
4496          * eliminating duplicates is a bit trickier because indexQuals contains
4497          * RestrictInfo nodes and the indpred does not.  It is okay to pass a
4498          * mixed list to clauselist_selectivity, but we have to work a bit to
4499          * generate a list without logical duplicates.  (We could just list_union
4500          * indpred and strippedQuals, but then we'd not get caching of per-qual
4501          * selectivity estimates.)
4502          */
4503         if (index->indpred != NIL)
4504         {
4505                 List       *strippedQuals;
4506                 List       *predExtraQuals;
4507
4508                 strippedQuals = get_actual_clauses(indexQuals);
4509                 predExtraQuals = list_difference(index->indpred, strippedQuals);
4510                 selectivityQuals = list_concat(predExtraQuals, indexQuals);
4511         }
4512         else
4513                 selectivityQuals = indexQuals;
4514
4515         /* Estimate the fraction of main-table tuples that will be visited */
4516         *indexSelectivity = clauselist_selectivity(root, selectivityQuals,
4517                                                                                            index->rel->relid,
4518                                                                                            JOIN_INNER);
4519
4520         /*
4521          * If caller didn't give us an estimate, estimate the number of index
4522          * tuples that will be visited.  We do it in this rather peculiar-looking
4523          * way in order to get the right answer for partial indexes.
4524          */
4525         if (numIndexTuples <= 0.0)
4526                 numIndexTuples = *indexSelectivity * index->rel->tuples;
4527
4528         /*
4529          * We can bound the number of tuples by the index size in any case. Also,
4530          * always estimate at least one tuple is touched, even when
4531          * indexSelectivity estimate is tiny.
4532          */
4533         if (numIndexTuples > index->tuples)
4534                 numIndexTuples = index->tuples;
4535         if (numIndexTuples < 1.0)
4536                 numIndexTuples = 1.0;
4537
4538         /*
4539          * Estimate the number of index pages that will be retrieved.
4540          *
4541          * For all currently-supported index types, the first page of the index is
4542          * a metadata page, and we should figure on fetching that plus a pro-rated
4543          * fraction of the remaining pages.
4544          */
4545         if (index->pages > 1 && index->tuples > 0)
4546         {
4547                 numIndexPages = (numIndexTuples / index->tuples) * (index->pages - 1);
4548                 numIndexPages += 1;             /* count the metapage too */
4549                 numIndexPages = ceil(numIndexPages);
4550         }
4551         else
4552                 numIndexPages = 1.0;
4553
4554         /*
4555          * Compute the index access cost.
4556          *
4557          * Disk cost: our generic assumption is that the index pages will be read
4558          * sequentially, so they have cost 1.0 each, not random_page_cost.
4559          */
4560         *indexTotalCost = numIndexPages;
4561
4562         /*
4563          * CPU cost: any complex expressions in the indexquals will need to be
4564          * evaluated once at the start of the scan to reduce them to runtime keys
4565          * to pass to the index AM (see nodeIndexscan.c).  We model the per-tuple
4566          * CPU costs as cpu_index_tuple_cost plus one cpu_operator_cost per
4567          * indexqual operator.
4568          *
4569          * Note: this neglects the possible costs of rechecking lossy operators
4570          * and OR-clause expressions.  Detecting that that might be needed seems
4571          * more expensive than it's worth, though, considering all the other
4572          * inaccuracies here ...
4573          */
4574         cost_qual_eval(&index_qual_cost, indexQuals);
4575         qual_op_cost = cpu_operator_cost * list_length(indexQuals);
4576         qual_arg_cost = index_qual_cost.startup +
4577                 index_qual_cost.per_tuple - qual_op_cost;
4578         if (qual_arg_cost < 0)          /* just in case... */
4579                 qual_arg_cost = 0;
4580         *indexStartupCost = qual_arg_cost;
4581         *indexTotalCost += qual_arg_cost;
4582         *indexTotalCost += numIndexTuples * (cpu_index_tuple_cost + qual_op_cost);
4583
4584         /*
4585          * Generic assumption about index correlation: there isn't any.
4586          */
4587         *indexCorrelation = 0.0;
4588 }
4589
4590
4591 Datum
4592 btcostestimate(PG_FUNCTION_ARGS)
4593 {
4594         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
4595         IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
4596         List       *indexQuals = (List *) PG_GETARG_POINTER(2);
4597         Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
4598         Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
4599         Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
4600         double     *indexCorrelation = (double *) PG_GETARG_POINTER(6);
4601         Oid                     relid;
4602         AttrNumber      colnum;
4603         HeapTuple       tuple;
4604         double          numIndexTuples;
4605         List       *indexBoundQuals;
4606         int                     indexcol;
4607         bool            eqQualHere;
4608         bool            found_saop;
4609         ListCell   *l;
4610
4611         /*
4612          * For a btree scan, only leading '=' quals plus inequality quals for the
4613          * immediately next attribute contribute to index selectivity (these are
4614          * the "boundary quals" that determine the starting and stopping points of
4615          * the index scan).  Additional quals can suppress visits to the heap, so
4616          * it's OK to count them in indexSelectivity, but they should not count
4617          * for estimating numIndexTuples.  So we must examine the given indexQuals
4618          * to find out which ones count as boundary quals.      We rely on the
4619          * knowledge that they are given in index column order.
4620          *
4621          * For a RowCompareExpr, we consider only the first column, just as
4622          * rowcomparesel() does.
4623          *
4624          * If there's a ScalarArrayOpExpr in the quals, we'll actually perform
4625          * N index scans not one, but the ScalarArrayOpExpr's operator can be
4626          * considered to act the same as it normally does.
4627          */
4628         indexBoundQuals = NIL;
4629         indexcol = 0;
4630         eqQualHere = false;
4631         found_saop = false;
4632         foreach(l, indexQuals)
4633         {
4634                 RestrictInfo *rinfo = (RestrictInfo *) lfirst(l);
4635                 Expr       *clause;
4636                 Node       *leftop,
4637                                    *rightop;
4638                 Oid                     clause_op;
4639                 int                     op_strategy;
4640
4641                 Assert(IsA(rinfo, RestrictInfo));
4642                 clause = rinfo->clause;
4643                 if (IsA(clause, OpExpr))
4644                 {
4645                         leftop = get_leftop(clause);
4646                         rightop = get_rightop(clause);
4647                         clause_op = ((OpExpr *) clause)->opno;
4648                 }
4649                 else if (IsA(clause, RowCompareExpr))
4650                 {
4651                         RowCompareExpr *rc = (RowCompareExpr *) clause;
4652
4653                         leftop = (Node *) linitial(rc->largs);
4654                         rightop = (Node *) linitial(rc->rargs);
4655                         clause_op = linitial_oid(rc->opnos);
4656                 }
4657                 else if (IsA(clause, ScalarArrayOpExpr))
4658                 {
4659                         ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause;
4660
4661                         leftop = (Node *) linitial(saop->args);
4662                         rightop = (Node *) lsecond(saop->args);
4663                         clause_op = saop->opno;
4664                         found_saop = true;
4665                 }
4666                 else
4667                 {
4668                         elog(ERROR, "unsupported indexqual type: %d",
4669                                  (int) nodeTag(clause));
4670                         continue;                       /* keep compiler quiet */
4671                 }
4672                 if (match_index_to_operand(leftop, indexcol, index))
4673                 {
4674                         /* clause_op is correct */
4675                 }
4676                 else if (match_index_to_operand(rightop, indexcol, index))
4677                 {
4678                         /* Must flip operator to get the opclass member */
4679                         clause_op = get_commutator(clause_op);
4680                 }
4681                 else
4682                 {
4683                         /* Must be past the end of quals for indexcol, try next */
4684                         if (!eqQualHere)
4685                                 break;                  /* done if no '=' qual for indexcol */
4686                         indexcol++;
4687                         eqQualHere = false;
4688                         if (match_index_to_operand(leftop, indexcol, index))
4689                         {
4690                                 /* clause_op is correct */
4691                         }
4692                         else if (match_index_to_operand(rightop, indexcol, index))
4693                         {
4694                                 /* Must flip operator to get the opclass member */
4695                                 clause_op = get_commutator(clause_op);
4696                         }
4697                         else
4698                         {
4699                                 /* No quals for new indexcol, so we are done */
4700                                 break;
4701                         }
4702                 }
4703                 op_strategy = get_op_opclass_strategy(clause_op,
4704                                                                                           index->classlist[indexcol]);
4705                 Assert(op_strategy != 0);               /* not a member of opclass?? */
4706                 if (op_strategy == BTEqualStrategyNumber)
4707                         eqQualHere = true;
4708                 indexBoundQuals = lappend(indexBoundQuals, rinfo);
4709         }
4710
4711         /*
4712          * If index is unique and we found an '=' clause for each column, we can
4713          * just assume numIndexTuples = 1 and skip the expensive
4714          * clauselist_selectivity calculations.
4715          */
4716         if (index->unique &&
4717                 indexcol == index->ncolumns - 1 &&
4718                 eqQualHere &&
4719                 !found_saop)
4720                 numIndexTuples = 1.0;
4721         else
4722         {
4723                 Selectivity btreeSelectivity;
4724
4725                 btreeSelectivity = clauselist_selectivity(root, indexBoundQuals,
4726                                                                                                   index->rel->relid,
4727                                                                                                   JOIN_INNER);
4728                 numIndexTuples = btreeSelectivity * index->rel->tuples;
4729         }
4730
4731         genericcostestimate(root, index, indexQuals, numIndexTuples,
4732                                                 indexStartupCost, indexTotalCost,
4733                                                 indexSelectivity, indexCorrelation);
4734
4735         /*
4736          * If we can get an estimate of the first column's ordering correlation C
4737          * from pg_statistic, estimate the index correlation as C for a
4738          * single-column index, or C * 0.75 for multiple columns. (The idea here
4739          * is that multiple columns dilute the importance of the first column's
4740          * ordering, but don't negate it entirely.  Before 8.0 we divided the
4741          * correlation by the number of columns, but that seems too strong.)
4742          *
4743          * We can skip all this if we found a ScalarArrayOpExpr, because then
4744          * the call must be for a bitmap index scan, and the caller isn't going
4745          * to care what the index correlation is.
4746          */
4747         if (found_saop)
4748                 PG_RETURN_VOID();
4749
4750         if (index->indexkeys[0] != 0)
4751         {
4752                 /* Simple variable --- look to stats for the underlying table */
4753                 relid = getrelid(index->rel->relid, root->parse->rtable);
4754                 Assert(relid != InvalidOid);
4755                 colnum = index->indexkeys[0];
4756         }
4757         else
4758         {
4759                 /* Expression --- maybe there are stats for the index itself */
4760                 relid = index->indexoid;
4761                 colnum = 1;
4762         }
4763
4764         tuple = SearchSysCache(STATRELATT,
4765                                                    ObjectIdGetDatum(relid),
4766                                                    Int16GetDatum(colnum),
4767                                                    0, 0);
4768
4769         if (HeapTupleIsValid(tuple))
4770         {
4771                 float4     *numbers;
4772                 int                     nnumbers;
4773
4774                 if (get_attstatsslot(tuple, InvalidOid, 0,
4775                                                          STATISTIC_KIND_CORRELATION,
4776                                                          index->ordering[0],
4777                                                          NULL, NULL, &numbers, &nnumbers))
4778                 {
4779                         double          varCorrelation;
4780
4781                         Assert(nnumbers == 1);
4782                         varCorrelation = numbers[0];
4783
4784                         if (index->ncolumns > 1)
4785                                 *indexCorrelation = varCorrelation * 0.75;
4786                         else
4787                                 *indexCorrelation = varCorrelation;
4788
4789                         free_attstatsslot(InvalidOid, NULL, 0, numbers, nnumbers);
4790                 }
4791                 ReleaseSysCache(tuple);
4792         }
4793
4794         PG_RETURN_VOID();
4795 }
4796
4797 Datum
4798 hashcostestimate(PG_FUNCTION_ARGS)
4799 {
4800         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
4801         IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
4802         List       *indexQuals = (List *) PG_GETARG_POINTER(2);
4803         Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
4804         Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
4805         Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
4806         double     *indexCorrelation = (double *) PG_GETARG_POINTER(6);
4807
4808         genericcostestimate(root, index, indexQuals, 0.0,
4809                                                 indexStartupCost, indexTotalCost,
4810                                                 indexSelectivity, indexCorrelation);
4811
4812         PG_RETURN_VOID();
4813 }
4814
4815 Datum
4816 gistcostestimate(PG_FUNCTION_ARGS)
4817 {
4818         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
4819         IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
4820         List       *indexQuals = (List *) PG_GETARG_POINTER(2);
4821         Cost       *indexStartupCost = (Cost *) PG_GETARG_POINTER(3);
4822         Cost       *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
4823         Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
4824         double     *indexCorrelation = (double *) PG_GETARG_POINTER(6);
4825
4826         genericcostestimate(root, index, indexQuals, 0.0,
4827                                                 indexStartupCost, indexTotalCost,
4828                                                 indexSelectivity, indexCorrelation);
4829
4830         PG_RETURN_VOID();
4831 }