granicus.if.org Git - postgresql/blob - src/backend/optimizer/path/costsize.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * costsize.c
   4  *        Routines to compute (and set) relation sizes and path costs
   5  *
   6  * Path costs are measured in units of disk accesses: one sequential page
   7  * fetch has cost 1.  All else is scaled relative to a page fetch, using
   8  * the scaling parameters
   9  *
  10  *      random_page_cost        Cost of a non-sequential page fetch
  11  *      cpu_tuple_cost          Cost of typical CPU time to process a tuple
  12  *      cpu_index_tuple_cost  Cost of typical CPU time to process an index tuple
  13  *      cpu_operator_cost       Cost of CPU time to process a typical WHERE operator
  14  *
  15  * We also use a rough estimate "effective_cache_size" of the number of
  16  * disk pages in Postgres + OS-level disk cache.  (We can't simply use
  17  * NBuffers for this purpose because that would ignore the effects of
  18  * the kernel's disk cache.)
  19  *
  20  * Obviously, taking constants for these values is an oversimplification,
  21  * but it's tough enough to get any useful estimates even at this level of
  22  * detail.      Note that all of these parameters are user-settable, in case
  23  * the default values are drastically off for a particular platform.
  24  *
  25  * We compute two separate costs for each path:
  26  *              total_cost: total estimated cost to fetch all tuples
  27  *              startup_cost: cost that is expended before first tuple is fetched
  28  * In some scenarios, such as when there is a LIMIT or we are implementing
  29  * an EXISTS(...) sub-select, it is not necessary to fetch all tuples of the
  30  * path's result.  A caller can estimate the cost of fetching a partial
  31  * result by interpolating between startup_cost and total_cost.  In detail:
  32  *              actual_cost = startup_cost +
  33  *                      (total_cost - startup_cost) * tuples_to_fetch / path->parent->rows;
  34  * Note that a base relation's rows count (and, by extension, plan_rows for
  35  * plan nodes below the LIMIT node) are set without regard to any LIMIT, so
  36  * that this equation works properly.  (Also, these routines guarantee not to
  37  * set the rows count to zero, so there will be no zero divide.)  The LIMIT is
  38  * applied as a top-level plan node.
  39  *
  40  *
  41  * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
  42  * Portions Copyright (c) 1994, Regents of the University of California
  43  *
  44  * IDENTIFICATION
  45  *        $Header: /cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v 1.93 2002/11/30 05:21:02 tgl Exp $
  46  *
  47  *-------------------------------------------------------------------------
  48  */
  49
  50 #include "postgres.h"
  51
  52 #include <math.h>
  53
  54 #include "catalog/pg_statistic.h"
  55 #include "executor/nodeHash.h"
  56 #include "miscadmin.h"
  57 #include "optimizer/clauses.h"
  58 #include "optimizer/cost.h"
  59 #include "optimizer/pathnode.h"
  60 #include "parser/parsetree.h"
  61 #include "utils/selfuncs.h"
  62 #include "utils/lsyscache.h"
  63 #include "utils/syscache.h"
  64
  65
  66 #define LOG2(x)  (log(x) / 0.693147180559945)
  67 #define LOG6(x)  (log(x) / 1.79175946922805)
  68
  69
  70 double          effective_cache_size = DEFAULT_EFFECTIVE_CACHE_SIZE;
  71 double          random_page_cost = DEFAULT_RANDOM_PAGE_COST;
  72 double          cpu_tuple_cost = DEFAULT_CPU_TUPLE_COST;
  73 double          cpu_index_tuple_cost = DEFAULT_CPU_INDEX_TUPLE_COST;
  74 double          cpu_operator_cost = DEFAULT_CPU_OPERATOR_COST;
  75
  76 Cost            disable_cost = 100000000.0;
  77
  78 bool            enable_seqscan = true;
  79 bool            enable_indexscan = true;
  80 bool            enable_tidscan = true;
  81 bool            enable_sort = true;
  82 bool            enable_hashagg = true;
  83 bool            enable_nestloop = true;
  84 bool            enable_mergejoin = true;
  85 bool            enable_hashjoin = true;
  86
  87
  88 static Selectivity estimate_hash_bucketsize(Query *root, Var *var);
  89 static bool cost_qual_eval_walker(Node *node, Cost *total);
  90 static Selectivity approx_selectivity(Query *root, List *quals);
  91 static void set_rel_width(Query *root, RelOptInfo *rel);
  92 static double relation_byte_size(double tuples, int width);
  93 static double page_size(double tuples, int width);
  94
  95
  96 /*
  97  * cost_seqscan
  98  *        Determines and returns the cost of scanning a relation sequentially.
  99  *
 100  * Note: for historical reasons, this routine and the others in this module
 101  * use the passed result Path only to store their startup_cost and total_cost
 102  * results into.  All the input data they need is passed as separate
 103  * parameters, even though much of it could be extracted from the Path.
 104  */
 105 void
 106 cost_seqscan(Path *path, Query *root,
 107                          RelOptInfo *baserel)
 108 {
 109         Cost            startup_cost = 0;
 110         Cost            run_cost = 0;
 111         Cost            cpu_per_tuple;
 112
 113         /* Should only be applied to base relations */
 114         Assert(length(baserel->relids) == 1);
 115         Assert(baserel->rtekind == RTE_RELATION);
 116
 117         if (!enable_seqscan)
 118                 startup_cost += disable_cost;
 119
 120         /*
 121          * disk costs
 122          *
 123          * The cost of reading a page sequentially is 1.0, by definition. Note
 124          * that the Unix kernel will typically do some amount of read-ahead
 125          * optimization, so that this cost is less than the true cost of
 126          * reading a page from disk.  We ignore that issue here, but must take
 127          * it into account when estimating the cost of non-sequential
 128          * accesses!
 129          */
 130         run_cost += baserel->pages; /* sequential fetches with cost 1.0 */
 131
 132         /* CPU costs */
 133         cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost;
 134         run_cost += cpu_per_tuple * baserel->tuples;
 135
 136         path->startup_cost = startup_cost;
 137         path->total_cost = startup_cost + run_cost;
 138 }
 139
 140 /*
 141  * cost_nonsequential_access
 142  *        Estimate the cost of accessing one page at random from a relation
 143  *        (or sort temp file) of the given size in pages.
 144  *
 145  * The simplistic model that the cost is random_page_cost is what we want
 146  * to use for large relations; but for small ones that is a serious
 147  * overestimate because of the effects of caching.      This routine tries to
 148  * account for that.
 149  *
 150  * Unfortunately we don't have any good way of estimating the effective cache
 151  * size we are working with --- we know that Postgres itself has NBuffers
 152  * internal buffers, but the size of the kernel's disk cache is uncertain,
 153  * and how much of it we get to use is even less certain.  We punt the problem
 154  * for now by assuming we are given an effective_cache_size parameter.
 155  *
 156  * Given a guesstimated cache size, we estimate the actual I/O cost per page
 157  * with the entirely ad-hoc equations:
 158  *      if relpages >= effective_cache_size:
 159  *              random_page_cost * (1 - (effective_cache_size/relpages)/2)
 160  *      if relpages < effective_cache_size:
 161  *              1 + (random_page_cost/2-1) * (relpages/effective_cache_size) ** 2
 162  * These give the right asymptotic behavior (=> 1.0 as relpages becomes
 163  * small, => random_page_cost as it becomes large) and meet in the middle
 164  * with the estimate that the cache is about 50% effective for a relation
 165  * of the same size as effective_cache_size.  (XXX this is probably all
 166  * wrong, but I haven't been able to find any theory about how effective
 167  * a disk cache should be presumed to be.)
 168  */
 169 static Cost
 170 cost_nonsequential_access(double relpages)
 171 {
 172         double          relsize;
 173
 174         /* don't crash on bad input data */
 175         if (relpages <= 0.0 || effective_cache_size <= 0.0)
 176                 return random_page_cost;
 177
 178         relsize = relpages / effective_cache_size;
 179
 180         if (relsize >= 1.0)
 181                 return random_page_cost * (1.0 - 0.5 / relsize);
 182         else
 183                 return 1.0 + (random_page_cost * 0.5 - 1.0) * relsize * relsize;
 184 }
 185
 186 /*
 187  * cost_index
 188  *        Determines and returns the cost of scanning a relation using an index.
 189  *
 190  *        NOTE: an indexscan plan node can actually represent several passes,
 191  *        but here we consider the cost of just one pass.
 192  *
 193  * 'root' is the query root
 194  * 'baserel' is the base relation the index is for
 195  * 'index' is the index to be used
 196  * 'indexQuals' is the list of applicable qual clauses (implicit AND semantics)
 197  * 'is_injoin' is T if we are considering using the index scan as the inside
 198  *              of a nestloop join (hence, some of the indexQuals are join clauses)
 199  *
 200  * NOTE: 'indexQuals' must contain only clauses usable as index restrictions.
 201  * Any additional quals evaluated as qpquals may reduce the number of returned
 202  * tuples, but they won't reduce the number of tuples we have to fetch from
 203  * the table, so they don't reduce the scan cost.
 204  */
 205 void
 206 cost_index(Path *path, Query *root,
 207                    RelOptInfo *baserel,
 208                    IndexOptInfo *index,
 209                    List *indexQuals,
 210                    bool is_injoin)
 211 {
 212         Cost            startup_cost = 0;
 213         Cost            run_cost = 0;
 214         Cost            indexStartupCost;
 215         Cost            indexTotalCost;
 216         Selectivity indexSelectivity;
 217         double          indexCorrelation,
 218                                 csquared;
 219         Cost            min_IO_cost,
 220                                 max_IO_cost;
 221         Cost            cpu_per_tuple;
 222         double          tuples_fetched;
 223         double          pages_fetched;
 224         double          T,
 225                                 b;
 226
 227         /* Should only be applied to base relations */
 228         Assert(IsA(baserel, RelOptInfo) &&
 229                    IsA(index, IndexOptInfo));
 230         Assert(length(baserel->relids) == 1);
 231         Assert(baserel->rtekind == RTE_RELATION);
 232
 233         if (!enable_indexscan)
 234                 startup_cost += disable_cost;
 235
 236         /*
 237          * Call index-access-method-specific code to estimate the processing
 238          * cost for scanning the index, as well as the selectivity of the
 239          * index (ie, the fraction of main-table tuples we will have to
 240          * retrieve) and its correlation to the main-table tuple order.
 241          */
 242         OidFunctionCall8(index->amcostestimate,
 243                                          PointerGetDatum(root),
 244                                          PointerGetDatum(baserel),
 245                                          PointerGetDatum(index),
 246                                          PointerGetDatum(indexQuals),
 247                                          PointerGetDatum(&indexStartupCost),
 248                                          PointerGetDatum(&indexTotalCost),
 249                                          PointerGetDatum(&indexSelectivity),
 250                                          PointerGetDatum(&indexCorrelation));
 251
 252         /* all costs for touching index itself included here */
 253         startup_cost += indexStartupCost;
 254         run_cost += indexTotalCost - indexStartupCost;
 255
 256         /*----------
 257          * Estimate number of main-table tuples and pages fetched.
 258          *
 259          * When the index ordering is uncorrelated with the table ordering,
 260          * we use an approximation proposed by Mackert and Lohman, "Index Scans
 261          * Using a Finite LRU Buffer: A Validated I/O Model", ACM Transactions
 262          * on Database Systems, Vol. 14, No. 3, September 1989, Pages 401-424.
 263          * The Mackert and Lohman approximation is that the number of pages
 264          * fetched is
 265          *      PF =
 266          *              min(2TNs/(2T+Ns), T)                    when T <= b
 267          *              2TNs/(2T+Ns)                                    when T > b and Ns <= 2Tb/(2T-b)
 268          *              b + (Ns - 2Tb/(2T-b))*(T-b)/T   when T > b and Ns > 2Tb/(2T-b)
 269          * where
 270          *              T = # pages in table
 271          *              N = # tuples in table
 272          *              s = selectivity = fraction of table to be scanned
 273          *              b = # buffer pages available (we include kernel space here)
 274          *
 275          * When the index ordering is exactly correlated with the table ordering
 276          * (just after a CLUSTER, for example), the number of pages fetched should
 277          * be just sT.  What's more, these will be sequential fetches, not the
 278          * random fetches that occur in the uncorrelated case.  So, depending on
 279          * the extent of correlation, we should estimate the actual I/O cost
 280          * somewhere between s * T * 1.0 and PF * random_cost.  We currently
 281          * interpolate linearly between these two endpoints based on the
 282          * correlation squared (XXX is that appropriate?).
 283          *
 284          * In any case the number of tuples fetched is Ns.
 285          *----------
 286          */
 287
 288         tuples_fetched = indexSelectivity * baserel->tuples;
 289         /* Don't believe estimates less than 1... */
 290         if (tuples_fetched < 1.0)
 291                 tuples_fetched = 1.0;
 292
 293         /* This part is the Mackert and Lohman formula */
 294
 295         T = (baserel->pages > 1) ? (double) baserel->pages : 1.0;
 296         b = (effective_cache_size > 1) ? effective_cache_size : 1.0;
 297
 298         if (T <= b)
 299         {
 300                 pages_fetched =
 301                         (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
 302                 if (pages_fetched > T)
 303                         pages_fetched = T;
 304         }
 305         else
 306         {
 307                 double          lim;
 308
 309                 lim = (2.0 * T * b) / (2.0 * T - b);
 310                 if (tuples_fetched <= lim)
 311                 {
 312                         pages_fetched =
 313                                 (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
 314                 }
 315                 else
 316                 {
 317                         pages_fetched =
 318                                 b + (tuples_fetched - lim) * (T - b) / T;
 319                 }
 320         }
 321
 322         /*
 323          * min_IO_cost corresponds to the perfectly correlated case
 324          * (csquared=1), max_IO_cost to the perfectly uncorrelated case
 325          * (csquared=0).  Note that we just charge random_page_cost per page
 326          * in the uncorrelated case, rather than using
 327          * cost_nonsequential_access, since we've already accounted for
 328          * caching effects by using the Mackert model.
 329          */
 330         min_IO_cost = ceil(indexSelectivity * T);
 331         max_IO_cost = pages_fetched * random_page_cost;
 332
 333         /*
 334          * Now interpolate based on estimated index order correlation to get
 335          * total disk I/O cost for main table accesses.
 336          */
 337         csquared = indexCorrelation * indexCorrelation;
 338
 339         run_cost += max_IO_cost + csquared * (min_IO_cost - max_IO_cost);
 340
 341         /*
 342          * Estimate CPU costs per tuple.
 343          *
 344          * Normally the indexquals will be removed from the list of restriction
 345          * clauses that we have to evaluate as qpquals, so we should subtract
 346          * their costs from baserestrictcost.  XXX For a lossy index, not all
 347          * the quals will be removed and so we really shouldn't subtract their
 348          * costs; but detecting that seems more expensive than it's worth.
 349          * Also, if we are doing a join then some of the indexquals are join
 350          * clauses and shouldn't be subtracted.  Rather than work out exactly
 351          * how much to subtract, we don't subtract anything.
 352          */
 353         cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost;
 354
 355         if (!is_injoin)
 356                 cpu_per_tuple -= cost_qual_eval(indexQuals);
 357
 358         run_cost += cpu_per_tuple * tuples_fetched;
 359
 360         path->startup_cost = startup_cost;
 361         path->total_cost = startup_cost + run_cost;
 362 }
 363
 364 /*
 365  * cost_tidscan
 366  *        Determines and returns the cost of scanning a relation using TIDs.
 367  */
 368 void
 369 cost_tidscan(Path *path, Query *root,
 370                          RelOptInfo *baserel, List *tideval)
 371 {
 372         Cost            startup_cost = 0;
 373         Cost            run_cost = 0;
 374         Cost            cpu_per_tuple;
 375         int                     ntuples = length(tideval);
 376
 377         /* Should only be applied to base relations */
 378         Assert(length(baserel->relids) == 1);
 379         Assert(baserel->rtekind == RTE_RELATION);
 380
 381         if (!enable_tidscan)
 382                 startup_cost += disable_cost;
 383
 384         /* disk costs --- assume each tuple on a different page */
 385         run_cost += random_page_cost * ntuples;
 386
 387         /* CPU costs */
 388         cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost;
 389         run_cost += cpu_per_tuple * ntuples;
 390
 391         path->startup_cost = startup_cost;
 392         path->total_cost = startup_cost + run_cost;
 393 }
 394
 395 /*
 396  * cost_functionscan
 397  *        Determines and returns the cost of scanning a function RTE.
 398  */
 399 void
 400 cost_functionscan(Path *path, Query *root, RelOptInfo *baserel)
 401 {
 402         Cost            startup_cost = 0;
 403         Cost            run_cost = 0;
 404         Cost            cpu_per_tuple;
 405
 406         /* Should only be applied to base relations that are functions */
 407         Assert(length(baserel->relids) == 1);
 408         Assert(baserel->rtekind == RTE_FUNCTION);
 409
 410         /*
 411          * For now, estimate function's cost at one operator eval per function
 412          * call.  Someday we should revive the function cost estimate columns
 413          * in pg_proc...
 414          */
 415         cpu_per_tuple = cpu_operator_cost;
 416
 417         /* Add scanning CPU costs */
 418         cpu_per_tuple += cpu_tuple_cost + baserel->baserestrictcost;
 419         run_cost += cpu_per_tuple * baserel->tuples;
 420
 421         path->startup_cost = startup_cost;
 422         path->total_cost = startup_cost + run_cost;
 423 }
 424
 425 /*
 426  * cost_sort
 427  *        Determines and returns the cost of sorting a relation, including
 428  *        the cost of reading the input data.
 429  *
 430  * If the total volume of data to sort is less than SortMem, we will do
 431  * an in-memory sort, which requires no I/O and about t*log2(t) tuple
 432  * comparisons for t tuples.
 433  *
 434  * If the total volume exceeds SortMem, we switch to a tape-style merge
 435  * algorithm.  There will still be about t*log2(t) tuple comparisons in
 436  * total, but we will also need to write and read each tuple once per
 437  * merge pass.  We expect about ceil(log6(r)) merge passes where r is the
 438  * number of initial runs formed (log6 because tuplesort.c uses six-tape
 439  * merging).  Since the average initial run should be about twice SortMem,
 440  * we have
 441  *              disk traffic = 2 * relsize * ceil(log6(p / (2*SortMem)))
 442  *              cpu = comparison_cost * t * log2(t)
 443  *
 444  * The disk traffic is assumed to be half sequential and half random
 445  * accesses (XXX can't we refine that guess?)
 446  *
 447  * We charge two operator evals per tuple comparison, which should be in
 448  * the right ballpark in most cases.
 449  *
 450  * 'pathkeys' is a list of sort keys
 451  * 'input_cost' is the total cost for reading the input data
 452  * 'tuples' is the number of tuples in the relation
 453  * 'width' is the average tuple width in bytes
 454  *
 455  * NOTE: some callers currently pass NIL for pathkeys because they
 456  * can't conveniently supply the sort keys.  Since this routine doesn't
 457  * currently do anything with pathkeys anyway, that doesn't matter...
 458  * but if it ever does, it should react gracefully to lack of key data.
 459  * (Actually, the thing we'd most likely be interested in is just the number
 460  * of sort keys, which all callers *could* supply.)
 461  */
 462 void
 463 cost_sort(Path *path, Query *root,
 464                   List *pathkeys, Cost input_cost, double tuples, int width)
 465 {
 466         Cost            startup_cost = input_cost;
 467         Cost            run_cost = 0;
 468         double          nbytes = relation_byte_size(tuples, width);
 469         long            sortmembytes = SortMem * 1024L;
 470
 471         if (!enable_sort)
 472                 startup_cost += disable_cost;
 473
 474         /*
 475          * We want to be sure the cost of a sort is never estimated as zero,
 476          * even if passed-in tuple count is zero.  Besides, mustn't do
 477          * log(0)...
 478          */
 479         if (tuples < 2.0)
 480                 tuples = 2.0;
 481
 482         /*
 483          * CPU costs
 484          *
 485          * Assume about two operator evals per tuple comparison and N log2 N
 486          * comparisons
 487          */
 488         startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples);
 489
 490         /* disk costs */
 491         if (nbytes > sortmembytes)
 492         {
 493                 double          npages = ceil(nbytes / BLCKSZ);
 494                 double          nruns = nbytes / (sortmembytes * 2);
 495                 double          log_runs = ceil(LOG6(nruns));
 496                 double          npageaccesses;
 497
 498                 if (log_runs < 1.0)
 499                         log_runs = 1.0;
 500                 npageaccesses = 2.0 * npages * log_runs;
 501                 /* Assume half are sequential (cost 1), half are not */
 502                 startup_cost += npageaccesses *
 503                         (1.0 + cost_nonsequential_access(npages)) * 0.5;
 504         }
 505
 506         /*
 507          * Also charge a small amount (arbitrarily set equal to operator cost)
 508          * per extracted tuple.
 509          */
 510         run_cost += cpu_operator_cost * tuples;
 511
 512         path->startup_cost = startup_cost;
 513         path->total_cost = startup_cost + run_cost;
 514 }
 515
 516 /*
 517  * cost_material
 518  *        Determines and returns the cost of materializing a relation, including
 519  *        the cost of reading the input data.
 520  *
 521  * If the total volume of data to materialize exceeds SortMem, we will need
 522  * to write it to disk, so the cost is much higher in that case.
 523  */
 524 void
 525 cost_material(Path *path,
 526                           Cost input_cost, double tuples, int width)
 527 {
 528         Cost            startup_cost = input_cost;
 529         Cost            run_cost = 0;
 530         double          nbytes = relation_byte_size(tuples, width);
 531         long            sortmembytes = SortMem * 1024L;
 532
 533         /* disk costs */
 534         if (nbytes > sortmembytes)
 535         {
 536                 double          npages = ceil(nbytes / BLCKSZ);
 537
 538                 /* We'll write during startup and read during retrieval */
 539                 startup_cost += npages;
 540                 run_cost += npages;
 541         }
 542
 543         /*
 544          * Also charge a small amount per extracted tuple.  We use cpu_tuple_cost
 545          * so that it doesn't appear worthwhile to materialize a bare seqscan.
 546          */
 547         run_cost += cpu_tuple_cost * tuples;
 548
 549         path->startup_cost = startup_cost;
 550         path->total_cost = startup_cost + run_cost;
 551 }
 552
 553 /*
 554  * cost_agg
 555  *              Determines and returns the cost of performing an Agg plan node,
 556  *              including the cost of its input.
 557  *
 558  * Note: when aggstrategy == AGG_SORTED, caller must ensure that input costs
 559  * are for appropriately-sorted input.
 560  */
 561 void
 562 cost_agg(Path *path, Query *root,
 563                  AggStrategy aggstrategy, int numAggs,
 564                  int numGroupCols, double numGroups,
 565                  Cost input_startup_cost, Cost input_total_cost,
 566                  double input_tuples)
 567 {
 568         Cost            startup_cost;
 569         Cost            total_cost;
 570
 571         /*
 572          * We charge one cpu_operator_cost per aggregate function per input
 573          * tuple, and another one per output tuple (corresponding to transfn
 574          * and finalfn calls respectively).  If we are grouping, we charge an
 575          * additional cpu_operator_cost per grouping column per input tuple
 576          * for grouping comparisons.
 577          *
 578          * We will produce a single output tuple if not grouping,
 579          * and a tuple per group otherwise.
 580          */
 581         if (aggstrategy == AGG_PLAIN)
 582         {
 583                 startup_cost = input_total_cost;
 584                 startup_cost += cpu_operator_cost * (input_tuples + 1) * numAggs;
 585                 /* we aren't grouping */
 586                 total_cost = startup_cost;
 587         }
 588         else if (aggstrategy == AGG_SORTED)
 589         {
 590                 /* Here we are able to deliver output on-the-fly */
 591                 startup_cost = input_startup_cost;
 592                 total_cost = input_total_cost;
 593                 total_cost += cpu_operator_cost * (input_tuples + numGroups) * numAggs;
 594                 total_cost += cpu_operator_cost * input_tuples * numGroupCols;
 595         }
 596         else
 597         {
 598                 /* must be AGG_HASHED */
 599                 startup_cost = input_total_cost;
 600                 startup_cost += cpu_operator_cost * input_tuples * numAggs;
 601                 startup_cost += cpu_operator_cost * input_tuples * numGroupCols;
 602                 total_cost = startup_cost;
 603                 total_cost += cpu_operator_cost * numGroups * numAggs;
 604         }
 605
 606         path->startup_cost = startup_cost;
 607         path->total_cost = total_cost;
 608 }
 609
 610 /*
 611  * cost_group
 612  *              Determines and returns the cost of performing a Group plan node,
 613  *              including the cost of its input.
 614  *
 615  * Note: caller must ensure that input costs are for appropriately-sorted
 616  * input.
 617  */
 618 void
 619 cost_group(Path *path, Query *root,
 620                    int numGroupCols, double numGroups,
 621                    Cost input_startup_cost, Cost input_total_cost,
 622                    double input_tuples)
 623 {
 624         Cost            startup_cost;
 625         Cost            total_cost;
 626
 627         startup_cost = input_startup_cost;
 628         total_cost = input_total_cost;
 629
 630         /*
 631          * Charge one cpu_operator_cost per comparison per input tuple. We
 632          * assume all columns get compared at most of the tuples.
 633          */
 634         total_cost += cpu_operator_cost * input_tuples * numGroupCols;
 635
 636         path->startup_cost = startup_cost;
 637         path->total_cost = total_cost;
 638 }
 639
 640 /*
 641  * cost_nestloop
 642  *        Determines and returns the cost of joining two relations using the
 643  *        nested loop algorithm.
 644  *
 645  * 'outer_path' is the path for the outer relation
 646  * 'inner_path' is the path for the inner relation
 647  * 'restrictlist' are the RestrictInfo nodes to be applied at the join
 648  */
 649 void
 650 cost_nestloop(Path *path, Query *root,
 651                           Path *outer_path,
 652                           Path *inner_path,
 653                           List *restrictlist)
 654 {
 655         Cost            startup_cost = 0;
 656         Cost            run_cost = 0;
 657         Cost            cpu_per_tuple;
 658         double          ntuples;
 659
 660         if (!enable_nestloop)
 661                 startup_cost += disable_cost;
 662
 663         /* cost of source data */
 664
 665         /*
 666          * NOTE: clearly, we must pay both outer and inner paths' startup_cost
 667          * before we can start returning tuples, so the join's startup cost is
 668          * their sum.  What's not so clear is whether the inner path's
 669          * startup_cost must be paid again on each rescan of the inner path.
 670          * This is not true if the inner path is materialized or is a hashjoin,
 671          * but probably is true otherwise.
 672          */
 673         startup_cost += outer_path->startup_cost + inner_path->startup_cost;
 674         run_cost += outer_path->total_cost - outer_path->startup_cost;
 675         run_cost += outer_path->parent->rows *
 676                 (inner_path->total_cost - inner_path->startup_cost);
 677         if (!(IsA(inner_path, MaterialPath) ||
 678                   IsA(inner_path, HashPath)) &&
 679                 outer_path->parent->rows > 1)
 680                 run_cost += (outer_path->parent->rows - 1) * inner_path->startup_cost;
 681
 682         /*
 683          * Number of tuples processed (not number emitted!).  If inner path is
 684          * an indexscan, be sure to use its estimated output row count, which
 685          * may be lower than the restriction-clause-only row count of its
 686          * parent.
 687          */
 688         if (IsA(inner_path, IndexPath))
 689                 ntuples = ((IndexPath *) inner_path)->rows;
 690         else
 691                 ntuples = inner_path->parent->rows;
 692         ntuples *= outer_path->parent->rows;
 693
 694         /* CPU costs */
 695         cpu_per_tuple = cpu_tuple_cost + cost_qual_eval(restrictlist);
 696         run_cost += cpu_per_tuple * ntuples;
 697
 698         path->startup_cost = startup_cost;
 699         path->total_cost = startup_cost + run_cost;
 700 }
 701
 702 /*
 703  * cost_mergejoin
 704  *        Determines and returns the cost of joining two relations using the
 705  *        merge join algorithm.
 706  *
 707  * 'outer_path' is the path for the outer relation
 708  * 'inner_path' is the path for the inner relation
 709  * 'restrictlist' are the RestrictInfo nodes to be applied at the join
 710  * 'mergeclauses' are the RestrictInfo nodes to use as merge clauses
 711  *              (this should be a subset of the restrictlist)
 712  * 'outersortkeys' and 'innersortkeys' are lists of the keys to be used
 713  *                              to sort the outer and inner relations, or NIL if no explicit
 714  *                              sort is needed because the source path is already ordered
 715  */
 716 void
 717 cost_mergejoin(Path *path, Query *root,
 718                            Path *outer_path,
 719                            Path *inner_path,
 720                            List *restrictlist,
 721                            List *mergeclauses,
 722                            List *outersortkeys,
 723                            List *innersortkeys)
 724 {
 725         Cost            startup_cost = 0;
 726         Cost            run_cost = 0;
 727         Cost            cpu_per_tuple;
 728         RestrictInfo *firstclause;
 729         Var                *leftvar;
 730         double          outer_rows,
 731                                 inner_rows;
 732         double          ntuples;
 733         Selectivity outerscansel,
 734                                 innerscansel;
 735         Path            sort_path;              /* dummy for result of cost_sort */
 736
 737         if (!enable_mergejoin)
 738                 startup_cost += disable_cost;
 739
 740         /*
 741          * A merge join will stop as soon as it exhausts either input stream.
 742          * Estimate fraction of the left and right inputs that will actually
 743          * need to be scanned.  We use only the first (most significant) merge
 744          * clause for this purpose.
 745          *
 746          * Since this calculation is somewhat expensive, and will be the same for
 747          * all mergejoin paths associated with the merge clause, we cache the
 748          * results in the RestrictInfo node.
 749          */
 750         firstclause = (RestrictInfo *) lfirst(mergeclauses);
 751         if (firstclause->left_mergescansel < 0)         /* not computed yet? */
 752                 mergejoinscansel(root, (Node *) firstclause->clause,
 753                                                  &firstclause->left_mergescansel,
 754                                                  &firstclause->right_mergescansel);
 755
 756         leftvar = get_leftop(firstclause->clause);
 757         Assert(IsA(leftvar, Var));
 758         if (VARISRELMEMBER(leftvar->varno, outer_path->parent))
 759         {
 760                 /* left side of clause is outer */
 761                 outerscansel = firstclause->left_mergescansel;
 762                 innerscansel = firstclause->right_mergescansel;
 763         }
 764         else
 765         {
 766                 /* left side of clause is inner */
 767                 outerscansel = firstclause->right_mergescansel;
 768                 innerscansel = firstclause->left_mergescansel;
 769         }
 770
 771         outer_rows = outer_path->parent->rows * outerscansel;
 772         inner_rows = inner_path->parent->rows * innerscansel;
 773
 774         /* cost of source data */
 775
 776         /*
 777          * Note we are assuming that each source tuple is fetched just once,
 778          * which is not right in the presence of equal keys.  If we had a way
 779          * of estimating the proportion of equal keys, we could apply a
 780          * correction factor...
 781          */
 782         if (outersortkeys)                      /* do we need to sort outer? */
 783         {
 784                 cost_sort(&sort_path,
 785                                   root,
 786                                   outersortkeys,
 787                                   outer_path->total_cost,
 788                                   outer_path->parent->rows,
 789                                   outer_path->parent->width);
 790                 startup_cost += sort_path.startup_cost;
 791                 run_cost += (sort_path.total_cost - sort_path.startup_cost)
 792                         * outerscansel;
 793         }
 794         else
 795         {
 796                 startup_cost += outer_path->startup_cost;
 797                 run_cost += (outer_path->total_cost - outer_path->startup_cost)
 798                         * outerscansel;
 799         }
 800
 801         if (innersortkeys)                      /* do we need to sort inner? */
 802         {
 803                 cost_sort(&sort_path,
 804                                   root,
 805                                   innersortkeys,
 806                                   inner_path->total_cost,
 807                                   inner_path->parent->rows,
 808                                   inner_path->parent->width);
 809                 startup_cost += sort_path.startup_cost;
 810                 run_cost += (sort_path.total_cost - sort_path.startup_cost)
 811                         * innerscansel;
 812         }
 813         else
 814         {
 815                 startup_cost += inner_path->startup_cost;
 816                 run_cost += (inner_path->total_cost - inner_path->startup_cost)
 817                         * innerscansel;
 818         }
 819
 820         /*
 821          * The number of tuple comparisons needed depends drastically on the
 822          * number of equal keys in the two source relations, which we have no
 823          * good way of estimating.      (XXX could the MCV statistics help?)
 824          * Somewhat arbitrarily, we charge one tuple comparison (one
 825          * cpu_operator_cost) for each tuple in the two source relations.
 826          * This is probably a lower bound.
 827          */
 828         run_cost += cpu_operator_cost * (outer_rows + inner_rows);
 829
 830         /*
 831          * For each tuple that gets through the mergejoin proper, we charge
 832          * cpu_tuple_cost plus the cost of evaluating additional restriction
 833          * clauses that are to be applied at the join.  It's OK to use an
 834          * approximate selectivity here, since in most cases this is a minor
 835          * component of the cost.  NOTE: it's correct to use the unscaled rows
 836          * counts here, not the scaled-down counts we obtained above.
 837          */
 838         ntuples = approx_selectivity(root, mergeclauses) *
 839                 outer_path->parent->rows * inner_path->parent->rows;
 840
 841         /* CPU costs */
 842         cpu_per_tuple = cpu_tuple_cost + cost_qual_eval(restrictlist);
 843         run_cost += cpu_per_tuple * ntuples;
 844
 845         path->startup_cost = startup_cost;
 846         path->total_cost = startup_cost + run_cost;
 847 }
 848
 849 /*
 850  * cost_hashjoin
 851  *        Determines and returns the cost of joining two relations using the
 852  *        hash join algorithm.
 853  *
 854  * 'outer_path' is the path for the outer relation
 855  * 'inner_path' is the path for the inner relation
 856  * 'restrictlist' are the RestrictInfo nodes to be applied at the join
 857  * 'hashclauses' are the RestrictInfo nodes to use as hash clauses
 858  *              (this should be a subset of the restrictlist)
 859  */
 860 void
 861 cost_hashjoin(Path *path, Query *root,
 862                           Path *outer_path,
 863                           Path *inner_path,
 864                           List *restrictlist,
 865                           List *hashclauses)
 866 {
 867         Cost            startup_cost = 0;
 868         Cost            run_cost = 0;
 869         Cost            cpu_per_tuple;
 870         double          ntuples;
 871         double          outerbytes = relation_byte_size(outer_path->parent->rows,
 872                                                                                           outer_path->parent->width);
 873         double          innerbytes = relation_byte_size(inner_path->parent->rows,
 874                                                                                           inner_path->parent->width);
 875         long            hashtablebytes = SortMem * 1024L;
 876         Selectivity innerbucketsize;
 877         List       *hcl;
 878
 879         if (!enable_hashjoin)
 880                 startup_cost += disable_cost;
 881
 882         /* cost of source data */
 883         startup_cost += outer_path->startup_cost;
 884         run_cost += outer_path->total_cost - outer_path->startup_cost;
 885         startup_cost += inner_path->total_cost;
 886
 887         /* cost of computing hash function: must do it once per input tuple */
 888         startup_cost += cpu_operator_cost * inner_path->parent->rows;
 889         run_cost += cpu_operator_cost * outer_path->parent->rows;
 890
 891         /*
 892          * Determine bucketsize fraction for inner relation.  We use the
 893          * smallest bucketsize estimated for any individual hashclause;
 894          * this is undoubtedly conservative.
 895          */
 896         innerbucketsize = 1.0;
 897         foreach(hcl, hashclauses)
 898         {
 899                 RestrictInfo *restrictinfo = (RestrictInfo *) lfirst(hcl);
 900                 Var                *left,
 901                                    *right;
 902                 Selectivity thisbucketsize;
 903
 904                 Assert(IsA(restrictinfo, RestrictInfo));
 905                 /* these must be OK, since check_hashjoinable accepted the clause */
 906                 left = get_leftop(restrictinfo->clause);
 907                 right = get_rightop(restrictinfo->clause);
 908
 909                 /*
 910                  * First we have to figure out which side of the hashjoin clause is the
 911                  * inner side.
 912                  *
 913                  * Since we tend to visit the same clauses over and over when planning
 914                  * a large query, we cache the bucketsize estimate in the RestrictInfo
 915                  * node to avoid repeated lookups of statistics.
 916                  */
 917                 if (VARISRELMEMBER(right->varno, inner_path->parent))
 918                 {
 919                         /* righthand side is inner */
 920                         thisbucketsize = restrictinfo->right_bucketsize;
 921                         if (thisbucketsize < 0)
 922                         {
 923                                 /* not cached yet */
 924                                 thisbucketsize = estimate_hash_bucketsize(root, right);
 925                                 restrictinfo->right_bucketsize = thisbucketsize;
 926                         }
 927                 }
 928                 else
 929                 {
 930                         Assert(VARISRELMEMBER(left->varno, inner_path->parent));
 931                         /* lefthand side is inner */
 932                         thisbucketsize = restrictinfo->left_bucketsize;
 933                         if (thisbucketsize < 0)
 934                         {
 935                                 /* not cached yet */
 936                                 thisbucketsize = estimate_hash_bucketsize(root, left);
 937                                 restrictinfo->left_bucketsize = thisbucketsize;
 938                         }
 939                 }
 940
 941                 if (innerbucketsize > thisbucketsize)
 942                         innerbucketsize = thisbucketsize;
 943         }
 944
 945         /*
 946          * The number of tuple comparisons needed is the number of outer
 947          * tuples times the typical number of tuples in a hash bucket, which
 948          * is the inner relation size times its bucketsize fraction. We charge
 949          * one cpu_operator_cost per tuple comparison.
 950          */
 951         run_cost += cpu_operator_cost * outer_path->parent->rows *
 952                 ceil(inner_path->parent->rows * innerbucketsize);
 953
 954         /*
 955          * For each tuple that gets through the hashjoin proper, we charge
 956          * cpu_tuple_cost plus the cost of evaluating additional restriction
 957          * clauses that are to be applied at the join.  It's OK to use an
 958          * approximate selectivity here, since in most cases this is a minor
 959          * component of the cost.
 960          */
 961         ntuples = approx_selectivity(root, hashclauses) *
 962                 outer_path->parent->rows * inner_path->parent->rows;
 963
 964         /* CPU costs */
 965         cpu_per_tuple = cpu_tuple_cost + cost_qual_eval(restrictlist);
 966         run_cost += cpu_per_tuple * ntuples;
 967
 968         /*
 969          * if inner relation is too big then we will need to "batch" the join,
 970          * which implies writing and reading most of the tuples to disk an
 971          * extra time.  Charge one cost unit per page of I/O (correct since it
 972          * should be nice and sequential...).  Writing the inner rel counts as
 973          * startup cost, all the rest as run cost.
 974          */
 975         if (innerbytes > hashtablebytes)
 976         {
 977                 double          outerpages = page_size(outer_path->parent->rows,
 978                                                                                    outer_path->parent->width);
 979                 double          innerpages = page_size(inner_path->parent->rows,
 980                                                                                    inner_path->parent->width);
 981
 982                 startup_cost += innerpages;
 983                 run_cost += innerpages + 2 * outerpages;
 984         }
 985
 986         /*
 987          * Bias against putting larger relation on inside.      We don't want an
 988          * absolute prohibition, though, since larger relation might have
 989          * better bucketsize --- and we can't trust the size estimates
 990          * unreservedly, anyway.  Instead, inflate the startup cost by the
 991          * square root of the size ratio.  (Why square root?  No real good
 992          * reason, but it seems reasonable...)
 993          */
 994         if (innerbytes > outerbytes && outerbytes > 0)
 995                 startup_cost *= sqrt(innerbytes / outerbytes);
 996
 997         path->startup_cost = startup_cost;
 998         path->total_cost = startup_cost + run_cost;
 999 }
1000
1001 /*
1002  * Estimate hash bucketsize fraction (ie, number of entries in a bucket
1003  * divided by total tuples in relation) if the specified Var is used
1004  * as a hash key.
1005  *
1006  * XXX This is really pretty bogus since we're effectively assuming that the
1007  * distribution of hash keys will be the same after applying restriction
1008  * clauses as it was in the underlying relation.  However, we are not nearly
1009  * smart enough to figure out how the restrict clauses might change the
1010  * distribution, so this will have to do for now.
1011  *
1012  * We can get the number of buckets the executor will use for the given
1013  * input relation.      If the data were perfectly distributed, with the same
1014  * number of tuples going into each available bucket, then the bucketsize
1015  * fraction would be 1/nbuckets.  But this happy state of affairs will occur
1016  * only if (a) there are at least nbuckets distinct data values, and (b)
1017  * we have a not-too-skewed data distribution.  Otherwise the buckets will
1018  * be nonuniformly occupied.  If the other relation in the join has a key
1019  * distribution similar to this one's, then the most-loaded buckets are
1020  * exactly those that will be probed most often.  Therefore, the "average"
1021  * bucket size for costing purposes should really be taken as something close
1022  * to the "worst case" bucket size.  We try to estimate this by adjusting the
1023  * fraction if there are too few distinct data values, and then scaling up
1024  * by the ratio of the most common value's frequency to the average frequency.
1025  *
1026  * If no statistics are available, use a default estimate of 0.1.  This will
1027  * discourage use of a hash rather strongly if the inner relation is large,
1028  * which is what we want.  We do not want to hash unless we know that the
1029  * inner rel is well-dispersed (or the alternatives seem much worse).
1030  */
1031 static Selectivity
1032 estimate_hash_bucketsize(Query *root, Var *var)
1033 {
1034         Oid                     relid;
1035         RelOptInfo *rel;
1036         int                     virtualbuckets;
1037         int                     physicalbuckets;
1038         int                     numbatches;
1039         HeapTuple       tuple;
1040         Form_pg_statistic stats;
1041         double          estfract,
1042                                 ndistinct,
1043                                 mcvfreq,
1044                                 avgfreq;
1045         float4     *numbers;
1046         int                     nnumbers;
1047
1048         /*
1049          * Lookup info about var's relation and attribute; if none available,
1050          * return default estimate.
1051          */
1052         if (!IsA(var, Var))
1053                 return 0.1;
1054
1055         relid = getrelid(var->varno, root->rtable);
1056         if (relid == InvalidOid)
1057                 return 0.1;
1058
1059         rel = find_base_rel(root, var->varno);
1060
1061         if (rel->tuples <= 0.0 || rel->rows <= 0.0)
1062                 return 0.1;                             /* ensure we can divide below */
1063
1064         /* Get hash table size that executor would use for this relation */
1065         ExecChooseHashTableSize(rel->rows, rel->width,
1066                                                         &virtualbuckets,
1067                                                         &physicalbuckets,
1068                                                         &numbatches);
1069
1070         tuple = SearchSysCache(STATRELATT,
1071                                                    ObjectIdGetDatum(relid),
1072                                                    Int16GetDatum(var->varattno),
1073                                                    0, 0);
1074         if (!HeapTupleIsValid(tuple))
1075         {
1076                 /*
1077                  * Perhaps the Var is a system attribute; if so, it will have no
1078                  * entry in pg_statistic, but we may be able to guess something
1079                  * about its distribution anyway.
1080                  */
1081                 switch (var->varattno)
1082                 {
1083                         case ObjectIdAttributeNumber:
1084                         case SelfItemPointerAttributeNumber:
1085                                 /* these are unique, so buckets should be well-distributed */
1086                                 return 1.0 / (double) virtualbuckets;
1087                         case TableOidAttributeNumber:
1088                                 /* hashing this is a terrible idea... */
1089                                 return 1.0;
1090                 }
1091                 return 0.1;
1092         }
1093         stats = (Form_pg_statistic) GETSTRUCT(tuple);
1094
1095         /*
1096          * Obtain number of distinct data values in raw relation.
1097          */
1098         ndistinct = stats->stadistinct;
1099         if (ndistinct < 0.0)
1100                 ndistinct = -ndistinct * rel->tuples;
1101
1102         if (ndistinct <= 0.0)           /* ensure we can divide */
1103         {
1104                 ReleaseSysCache(tuple);
1105                 return 0.1;
1106         }
1107
1108         /* Also compute avg freq of all distinct data values in raw relation */
1109         avgfreq = (1.0 - stats->stanullfrac) / ndistinct;
1110
1111         /*
1112          * Adjust ndistinct to account for restriction clauses.  Observe we
1113          * are assuming that the data distribution is affected uniformly by
1114          * the restriction clauses!
1115          *
1116          * XXX Possibly better way, but much more expensive: multiply by
1117          * selectivity of rel's restriction clauses that mention the target
1118          * Var.
1119          */
1120         ndistinct *= rel->rows / rel->tuples;
1121
1122         /*
1123          * Initial estimate of bucketsize fraction is 1/nbuckets as long as
1124          * the number of buckets is less than the expected number of distinct
1125          * values; otherwise it is 1/ndistinct.
1126          */
1127         if (ndistinct > (double) virtualbuckets)
1128                 estfract = 1.0 / (double) virtualbuckets;
1129         else
1130                 estfract = 1.0 / ndistinct;
1131
1132         /*
1133          * Look up the frequency of the most common value, if available.
1134          */
1135         mcvfreq = 0.0;
1136
1137         if (get_attstatsslot(tuple, var->vartype, var->vartypmod,
1138                                                  STATISTIC_KIND_MCV, InvalidOid,
1139                                                  NULL, NULL, &numbers, &nnumbers))
1140         {
1141                 /*
1142                  * The first MCV stat is for the most common value.
1143                  */
1144                 if (nnumbers > 0)
1145                         mcvfreq = numbers[0];
1146                 free_attstatsslot(var->vartype, NULL, 0,
1147                                                   numbers, nnumbers);
1148         }
1149
1150         /*
1151          * Adjust estimated bucketsize upward to account for skewed
1152          * distribution.
1153          */
1154         if (avgfreq > 0.0 && mcvfreq > avgfreq)
1155                 estfract *= mcvfreq / avgfreq;
1156
1157         ReleaseSysCache(tuple);
1158
1159         return (Selectivity) estfract;
1160 }
1161
1162
1163 /*
1164  * cost_qual_eval
1165  *              Estimate the CPU cost of evaluating a WHERE clause (once).
1166  *              The input can be either an implicitly-ANDed list of boolean
1167  *              expressions, or a list of RestrictInfo nodes.
1168  */
1169 Cost
1170 cost_qual_eval(List *quals)
1171 {
1172         Cost            total = 0;
1173         List       *l;
1174
1175         /* We don't charge any cost for the implicit ANDing at top level ... */
1176
1177         foreach(l, quals)
1178         {
1179                 Node       *qual = (Node *) lfirst(l);
1180
1181                 /*
1182                  * RestrictInfo nodes contain an eval_cost field reserved for this
1183                  * routine's use, so that it's not necessary to evaluate the qual
1184                  * clause's cost more than once.  If the clause's cost hasn't been
1185                  * computed yet, the field will contain -1.
1186                  */
1187                 if (qual && IsA(qual, RestrictInfo))
1188                 {
1189                         RestrictInfo *restrictinfo = (RestrictInfo *) qual;
1190
1191                         if (restrictinfo->eval_cost < 0)
1192                         {
1193                                 restrictinfo->eval_cost = 0;
1194                                 cost_qual_eval_walker((Node *) restrictinfo->clause,
1195                                                                           &restrictinfo->eval_cost);
1196                         }
1197                         total += restrictinfo->eval_cost;
1198                 }
1199                 else
1200                 {
1201                         /* If it's a bare expression, must always do it the hard way */
1202                         cost_qual_eval_walker(qual, &total);
1203                 }
1204         }
1205         return total;
1206 }
1207
1208 static bool
1209 cost_qual_eval_walker(Node *node, Cost *total)
1210 {
1211         if (node == NULL)
1212                 return false;
1213
1214         /*
1215          * Our basic strategy is to charge one cpu_operator_cost for each
1216          * operator or function node in the given tree.  Vars and Consts are
1217          * charged zero, and so are boolean operators (AND, OR, NOT).
1218          * Simplistic, but a lot better than no model at all.
1219          *
1220          * Should we try to account for the possibility of short-circuit
1221          * evaluation of AND/OR?
1222          */
1223         if (IsA(node, Expr))
1224         {
1225                 Expr       *expr = (Expr *) node;
1226
1227                 switch (expr->opType)
1228                 {
1229                         case OP_EXPR:
1230                         case DISTINCT_EXPR:
1231                         case FUNC_EXPR:
1232                                 *total += cpu_operator_cost;
1233                                 break;
1234                         case OR_EXPR:
1235                         case AND_EXPR:
1236                         case NOT_EXPR:
1237                                 break;
1238                         case SUBPLAN_EXPR:
1239
1240                                 /*
1241                                  * A subplan node in an expression indicates that the
1242                                  * subplan will be executed on each evaluation, so charge
1243                                  * accordingly. (We assume that sub-selects that can be
1244                                  * executed as InitPlans have already been removed from
1245                                  * the expression.)
1246                                  *
1247                                  * NOTE: this logic should agree with the estimates used by
1248                                  * make_subplan() in plan/subselect.c.
1249                                  */
1250                                 {
1251                                         SubPlan    *subplan = (SubPlan *) expr->oper;
1252                                         Plan       *plan = subplan->plan;
1253                                         Cost            subcost;
1254
1255                                         if (subplan->sublink->subLinkType == EXISTS_SUBLINK)
1256                                         {
1257                                                 /* we only need to fetch 1 tuple */
1258                                                 subcost = plan->startup_cost +
1259                                                         (plan->total_cost - plan->startup_cost) / plan->plan_rows;
1260                                         }
1261                                         else if (subplan->sublink->subLinkType == ALL_SUBLINK ||
1262                                                          subplan->sublink->subLinkType == ANY_SUBLINK)
1263                                         {
1264                                                 /* assume we need 50% of the tuples */
1265                                                 subcost = plan->startup_cost +
1266                                                         0.50 * (plan->total_cost - plan->startup_cost);
1267                                                 /* XXX what if subplan has been materialized? */
1268                                         }
1269                                         else
1270                                         {
1271                                                 /* assume we need all tuples */
1272                                                 subcost = plan->total_cost;
1273                                         }
1274                                         *total += subcost;
1275                                 }
1276                                 break;
1277                 }
1278                 /* fall through to examine args of Expr node */
1279         }
1280         return expression_tree_walker(node, cost_qual_eval_walker,
1281                                                                   (void *) total);
1282 }
1283
1284
1285 /*
1286  * approx_selectivity
1287  *              Quick-and-dirty estimation of clause selectivities.
1288  *              The input can be either an implicitly-ANDed list of boolean
1289  *              expressions, or a list of RestrictInfo nodes (typically the latter).
1290  *
1291  * The "quick" part comes from caching the selectivity estimates so we can
1292  * avoid recomputing them later.  (Since the same clauses are typically
1293  * examined over and over in different possible join trees, this makes a
1294  * big difference.)
1295  *
1296  * The "dirty" part comes from the fact that the selectivities of multiple
1297  * clauses are estimated independently and multiplied together.  Now
1298  * clauselist_selectivity often can't do any better than that anyhow, but
1299  * for some situations (such as range constraints) it is smarter.
1300  *
1301  * Since we are only using the results to estimate how many potential
1302  * output tuples are generated and passed through qpqual checking, it
1303  * seems OK to live with the approximation.
1304  */
1305 static Selectivity
1306 approx_selectivity(Query *root, List *quals)
1307 {
1308         Selectivity total = 1.0;
1309         List       *l;
1310
1311         foreach(l, quals)
1312         {
1313                 Node       *qual = (Node *) lfirst(l);
1314                 Selectivity selec;
1315
1316                 /*
1317                  * RestrictInfo nodes contain a this_selec field reserved for this
1318                  * routine's use, so that it's not necessary to evaluate the qual
1319                  * clause's selectivity more than once.  If the clause's
1320                  * selectivity hasn't been computed yet, the field will contain
1321                  * -1.
1322                  */
1323                 if (qual && IsA(qual, RestrictInfo))
1324                 {
1325                         RestrictInfo *restrictinfo = (RestrictInfo *) qual;
1326
1327                         if (restrictinfo->this_selec < 0)
1328                                 restrictinfo->this_selec =
1329                                         clause_selectivity(root,
1330                                                                            (Node *) restrictinfo->clause,
1331                                                                            0);
1332                         selec = restrictinfo->this_selec;
1333                 }
1334                 else
1335                 {
1336                         /* If it's a bare expression, must always do it the hard way */
1337                         selec = clause_selectivity(root, qual, 0);
1338                 }
1339                 total *= selec;
1340         }
1341         return total;
1342 }
1343
1344
1345 /*
1346  * set_baserel_size_estimates
1347  *              Set the size estimates for the given base relation.
1348  *
1349  * The rel's targetlist and restrictinfo list must have been constructed
1350  * already.
1351  *
1352  * We set the following fields of the rel node:
1353  *      rows: the estimated number of output tuples (after applying
1354  *                restriction clauses).
1355  *      width: the estimated average output tuple width in bytes.
1356  *      baserestrictcost: estimated cost of evaluating baserestrictinfo clauses.
1357  */
1358 void
1359 set_baserel_size_estimates(Query *root, RelOptInfo *rel)
1360 {
1361         /* Should only be applied to base relations */
1362         Assert(length(rel->relids) == 1);
1363
1364         rel->rows = rel->tuples *
1365                 restrictlist_selectivity(root,
1366                                                                  rel->baserestrictinfo,
1367                                                                  lfirsti(rel->relids));
1368
1369         /*
1370          * Force estimate to be at least one row, to make explain output look
1371          * better and to avoid possible divide-by-zero when interpolating
1372          * cost.
1373          */
1374         if (rel->rows < 1.0)
1375                 rel->rows = 1.0;
1376
1377         rel->baserestrictcost = cost_qual_eval(rel->baserestrictinfo);
1378
1379         set_rel_width(root, rel);
1380 }
1381
1382 /*
1383  * set_joinrel_size_estimates
1384  *              Set the size estimates for the given join relation.
1385  *
1386  * The rel's targetlist must have been constructed already, and a
1387  * restriction clause list that matches the given component rels must
1388  * be provided.
1389  *
1390  * Since there is more than one way to make a joinrel for more than two
1391  * base relations, the results we get here could depend on which component
1392  * rel pair is provided.  In theory we should get the same answers no matter
1393  * which pair is provided; in practice, since the selectivity estimation
1394  * routines don't handle all cases equally well, we might not.  But there's
1395  * not much to be done about it.  (Would it make sense to repeat the
1396  * calculations for each pair of input rels that's encountered, and somehow
1397  * average the results?  Probably way more trouble than it's worth.)
1398  *
1399  * We set the same relnode fields as set_baserel_size_estimates() does.
1400  */
1401 void
1402 set_joinrel_size_estimates(Query *root, RelOptInfo *rel,
1403                                                    RelOptInfo *outer_rel,
1404                                                    RelOptInfo *inner_rel,
1405                                                    JoinType jointype,
1406                                                    List *restrictlist)
1407 {
1408         double          temp;
1409
1410         /* Start with the Cartesian product */
1411         temp = outer_rel->rows * inner_rel->rows;
1412
1413         /*
1414          * Apply join restrictivity.  Note that we are only considering
1415          * clauses that become restriction clauses at this join level; we are
1416          * not double-counting them because they were not considered in
1417          * estimating the sizes of the component rels.
1418          */
1419         temp *= restrictlist_selectivity(root,
1420                                                                          restrictlist,
1421                                                                          0);
1422
1423         /*
1424          * If we are doing an outer join, take that into account: the output
1425          * must be at least as large as the non-nullable input.  (Is there any
1426          * chance of being even smarter?)
1427          */
1428         switch (jointype)
1429         {
1430                 case JOIN_INNER:
1431                         break;
1432                 case JOIN_LEFT:
1433                         if (temp < outer_rel->rows)
1434                                 temp = outer_rel->rows;
1435                         break;
1436                 case JOIN_RIGHT:
1437                         if (temp < inner_rel->rows)
1438                                 temp = inner_rel->rows;
1439                         break;
1440                 case JOIN_FULL:
1441                         if (temp < outer_rel->rows)
1442                                 temp = outer_rel->rows;
1443                         if (temp < inner_rel->rows)
1444                                 temp = inner_rel->rows;
1445                         break;
1446                 default:
1447                         elog(ERROR, "set_joinrel_size_estimates: unsupported join type %d",
1448                                  (int) jointype);
1449                         break;
1450         }
1451
1452         /*
1453          * Force estimate to be at least one row, to make explain output look
1454          * better and to avoid possible divide-by-zero when interpolating
1455          * cost.
1456          */
1457         if (temp < 1.0)
1458                 temp = 1.0;
1459
1460         rel->rows = temp;
1461
1462         /*
1463          * We could apply set_rel_width() to compute the output tuple width
1464          * from scratch, but at present it's always just the sum of the input
1465          * widths, so why work harder than necessary?  If relnode.c is ever
1466          * taught to remove unneeded columns from join targetlists, go back to
1467          * using set_rel_width here.
1468          */
1469         rel->width = outer_rel->width + inner_rel->width;
1470 }
1471
1472 /*
1473  * set_function_size_estimates
1474  *              Set the size estimates for a base relation that is a function call.
1475  *
1476  * The rel's targetlist and restrictinfo list must have been constructed
1477  * already.
1478  *
1479  * We set the following fields of the rel node:
1480  *      rows: the estimated number of output tuples (after applying
1481  *                restriction clauses).
1482  *      width: the estimated average output tuple width in bytes.
1483  *      baserestrictcost: estimated cost of evaluating baserestrictinfo clauses.
1484  */
1485 void
1486 set_function_size_estimates(Query *root, RelOptInfo *rel)
1487 {
1488         /* Should only be applied to base relations that are functions */
1489         Assert(length(rel->relids) == 1);
1490         Assert(rel->rtekind == RTE_FUNCTION);
1491
1492         /*
1493          * Estimate number of rows the function itself will return.
1494          *
1495          * XXX no idea how to do this yet; but should at least check whether
1496          * function returns set or not...
1497          */
1498         rel->tuples = 1000;
1499
1500         /* Now estimate number of output rows */
1501         rel->rows = rel->tuples *
1502                 restrictlist_selectivity(root,
1503                                                                  rel->baserestrictinfo,
1504                                                                  lfirsti(rel->relids));
1505
1506         /*
1507          * Force estimate to be at least one row, to make explain output look
1508          * better and to avoid possible divide-by-zero when interpolating
1509          * cost.
1510          */
1511         if (rel->rows < 1.0)
1512                 rel->rows = 1.0;
1513
1514         rel->baserestrictcost = cost_qual_eval(rel->baserestrictinfo);
1515
1516         set_rel_width(root, rel);
1517 }
1518
1519
1520 /*
1521  * set_rel_width
1522  *              Set the estimated output width of the relation.
1523  *
1524  * NB: this works best on base relations because it prefers to look at
1525  * real Vars.  It will fail to make use of pg_statistic info when applied
1526  * to a subquery relation, even if the subquery outputs are simple vars
1527  * that we could have gotten info for.  Is it worth trying to be smarter
1528  * about subqueries?
1529  */
1530 static void
1531 set_rel_width(Query *root, RelOptInfo *rel)
1532 {
1533         int32           tuple_width = 0;
1534         List       *tllist;
1535
1536         foreach(tllist, rel->targetlist)
1537         {
1538                 TargetEntry *tle = (TargetEntry *) lfirst(tllist);
1539                 int32           item_width;
1540
1541                 /*
1542                  * If it's a Var, try to get statistical info from pg_statistic.
1543                  */
1544                 if (tle->expr && IsA(tle->expr, Var))
1545                 {
1546                         Var                *var = (Var *) tle->expr;
1547                         Oid                     relid;
1548
1549                         relid = getrelid(var->varno, root->rtable);
1550                         if (relid != InvalidOid)
1551                         {
1552                                 item_width = get_attavgwidth(relid, var->varattno);
1553                                 if (item_width > 0)
1554                                 {
1555                                         tuple_width += item_width;
1556                                         continue;
1557                                 }
1558                         }
1559                 }
1560
1561                 /*
1562                  * Not a Var, or can't find statistics for it.  Estimate using
1563                  * just the type info.
1564                  */
1565                 item_width = get_typavgwidth(tle->resdom->restype,
1566                                                                          tle->resdom->restypmod);
1567                 Assert(item_width > 0);
1568                 tuple_width += item_width;
1569         }
1570         Assert(tuple_width >= 0);
1571         rel->width = tuple_width;
1572 }
1573
1574 /*
1575  * relation_byte_size
1576  *        Estimate the storage space in bytes for a given number of tuples
1577  *        of a given width (size in bytes).
1578  */
1579 static double
1580 relation_byte_size(double tuples, int width)
1581 {
1582         return tuples * (MAXALIGN(width) + MAXALIGN(sizeof(HeapTupleData)));
1583 }
1584
1585 /*
1586  * page_size
1587  *        Returns an estimate of the number of pages covered by a given
1588  *        number of tuples of a given width (size in bytes).
1589  */
1590 static double
1591 page_size(double tuples, int width)
1592 {
1593         return ceil(relation_byte_size(tuples, width) / BLCKSZ);
1594 }