granicus.if.org Git - postgresql/blob - src/backend/optimizer/path/costsize.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * costsize.c
   4  *        Routines to compute (and set) relation sizes and path costs
   5  *
   6  * Path costs are measured in units of disk accesses: one sequential page
   7  * fetch has cost 1.  All else is scaled relative to a page fetch, using
   8  * the scaling parameters
   9  *
  10  *      random_page_cost        Cost of a non-sequential page fetch
  11  *      cpu_tuple_cost          Cost of typical CPU time to process a tuple
  12  *      cpu_index_tuple_cost  Cost of typical CPU time to process an index tuple
  13  *      cpu_operator_cost       Cost of CPU time to process a typical WHERE operator
  14  *
  15  * We also use a rough estimate "effective_cache_size" of the number of
  16  * disk pages in Postgres + OS-level disk cache.  (We can't simply use
  17  * NBuffers for this purpose because that would ignore the effects of
  18  * the kernel's disk cache.)
  19  *
  20  * Obviously, taking constants for these values is an oversimplification,
  21  * but it's tough enough to get any useful estimates even at this level of
  22  * detail.      Note that all of these parameters are user-settable, in case
  23  * the default values are drastically off for a particular platform.
  24  *
  25  * We compute two separate costs for each path:
  26  *              total_cost: total estimated cost to fetch all tuples
  27  *              startup_cost: cost that is expended before first tuple is fetched
  28  * In some scenarios, such as when there is a LIMIT or we are implementing
  29  * an EXISTS(...) sub-select, it is not necessary to fetch all tuples of the
  30  * path's result.  A caller can estimate the cost of fetching a partial
  31  * result by interpolating between startup_cost and total_cost.  In detail:
  32  *              actual_cost = startup_cost +
  33  *                      (total_cost - startup_cost) * tuples_to_fetch / path->parent->rows;
  34  * Note that a base relation's rows count (and, by extension, plan_rows for
  35  * plan nodes below the LIMIT node) are set without regard to any LIMIT, so
  36  * that this equation works properly.  (Also, these routines guarantee not to
  37  * set the rows count to zero, so there will be no zero divide.)  The LIMIT is
  38  * applied as a top-level plan node.
  39  *
  40  *
  41  * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
  42  * Portions Copyright (c) 1994, Regents of the University of California
  43  *
  44  * IDENTIFICATION
  45  *        $Header: /cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v 1.74 2001/05/20 20:28:18 tgl Exp $
  46  *
  47  *-------------------------------------------------------------------------
  48  */
  49
  50 #include "postgres.h"
  51
  52 #include <math.h>
  53
  54 #include "catalog/pg_statistic.h"
  55 #include "executor/nodeHash.h"
  56 #include "miscadmin.h"
  57 #include "optimizer/clauses.h"
  58 #include "optimizer/cost.h"
  59 #include "optimizer/pathnode.h"
  60 #include "parser/parsetree.h"
  61 #include "utils/lsyscache.h"
  62 #include "utils/syscache.h"
  63
  64
  65 #define LOG2(x)  (log(x) / 0.693147180559945)
  66 #define LOG6(x)  (log(x) / 1.79175946922805)
  67
  68
  69 double          effective_cache_size = DEFAULT_EFFECTIVE_CACHE_SIZE;
  70 double          random_page_cost = DEFAULT_RANDOM_PAGE_COST;
  71 double          cpu_tuple_cost = DEFAULT_CPU_TUPLE_COST;
  72 double          cpu_index_tuple_cost = DEFAULT_CPU_INDEX_TUPLE_COST;
  73 double          cpu_operator_cost = DEFAULT_CPU_OPERATOR_COST;
  74
  75 Cost            disable_cost = 100000000.0;
  76
  77 bool            enable_seqscan = true;
  78 bool            enable_indexscan = true;
  79 bool            enable_tidscan = true;
  80 bool            enable_sort = true;
  81 bool            enable_nestloop = true;
  82 bool            enable_mergejoin = true;
  83 bool            enable_hashjoin = true;
  84
  85
  86 static bool cost_qual_eval_walker(Node *node, Cost *total);
  87 static void set_rel_width(Query *root, RelOptInfo *rel);
  88 static double relation_byte_size(double tuples, int width);
  89 static double page_size(double tuples, int width);
  90
  91
  92 /*
  93  * cost_seqscan
  94  *        Determines and returns the cost of scanning a relation sequentially.
  95  *
  96  * Note: for historical reasons, this routine and the others in this module
  97  * use the passed result Path only to store their startup_cost and total_cost
  98  * results into.  All the input data they need is passed as separate
  99  * parameters, even though much of it could be extracted from the Path.
 100  */
 101 void
 102 cost_seqscan(Path *path, RelOptInfo *baserel)
 103 {
 104         Cost            startup_cost = 0;
 105         Cost            run_cost = 0;
 106         Cost            cpu_per_tuple;
 107
 108         /* Should only be applied to base relations */
 109         Assert(length(baserel->relids) == 1);
 110         Assert(!baserel->issubquery);
 111
 112         if (!enable_seqscan)
 113                 startup_cost += disable_cost;
 114
 115         /*
 116          * disk costs
 117          *
 118          * The cost of reading a page sequentially is 1.0, by definition. Note
 119          * that the Unix kernel will typically do some amount of read-ahead
 120          * optimization, so that this cost is less than the true cost of
 121          * reading a page from disk.  We ignore that issue here, but must take
 122          * it into account when estimating the cost of non-sequential
 123          * accesses!
 124          */
 125         run_cost += baserel->pages; /* sequential fetches with cost 1.0 */
 126
 127         /* CPU costs */
 128         cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost;
 129         run_cost += cpu_per_tuple * baserel->tuples;
 130
 131         path->startup_cost = startup_cost;
 132         path->total_cost = startup_cost + run_cost;
 133 }
 134
 135 /*
 136  * cost_nonsequential_access
 137  *        Estimate the cost of accessing one page at random from a relation
 138  *        (or sort temp file) of the given size in pages.
 139  *
 140  * The simplistic model that the cost is random_page_cost is what we want
 141  * to use for large relations; but for small ones that is a serious
 142  * overestimate because of the effects of caching.      This routine tries to
 143  * account for that.
 144  *
 145  * Unfortunately we don't have any good way of estimating the effective cache
 146  * size we are working with --- we know that Postgres itself has NBuffers
 147  * internal buffers, but the size of the kernel's disk cache is uncertain,
 148  * and how much of it we get to use is even less certain.  We punt the problem
 149  * for now by assuming we are given an effective_cache_size parameter.
 150  *
 151  * Given a guesstimated cache size, we estimate the actual I/O cost per page
 152  * with the entirely ad-hoc equations:
 153  *      for rel_size <= effective_cache_size:
 154  *              1 + (random_page_cost/2-1) * (rel_size/effective_cache_size) ** 2
 155  *      for rel_size >= effective_cache_size:
 156  *              random_page_cost * (1 - (effective_cache_size/rel_size)/2)
 157  * These give the right asymptotic behavior (=> 1.0 as rel_size becomes
 158  * small, => random_page_cost as it becomes large) and meet in the middle
 159  * with the estimate that the cache is about 50% effective for a relation
 160  * of the same size as effective_cache_size.  (XXX this is probably all
 161  * wrong, but I haven't been able to find any theory about how effective
 162  * a disk cache should be presumed to be.)
 163  */
 164 static Cost
 165 cost_nonsequential_access(double relpages)
 166 {
 167         double          relsize;
 168
 169         /* don't crash on bad input data */
 170         if (relpages <= 0.0 || effective_cache_size <= 0.0)
 171                 return random_page_cost;
 172
 173         relsize = relpages / effective_cache_size;
 174
 175         if (relsize >= 1.0)
 176                 return random_page_cost * (1.0 - 0.5 / relsize);
 177         else
 178                 return 1.0 + (random_page_cost * 0.5 - 1.0) * relsize * relsize;
 179 }
 180
 181 /*
 182  * cost_index
 183  *        Determines and returns the cost of scanning a relation using an index.
 184  *
 185  *        NOTE: an indexscan plan node can actually represent several passes,
 186  *        but here we consider the cost of just one pass.
 187  *
 188  * 'root' is the query root
 189  * 'baserel' is the base relation the index is for
 190  * 'index' is the index to be used
 191  * 'indexQuals' is the list of applicable qual clauses (implicit AND semantics)
 192  * 'is_injoin' is T if we are considering using the index scan as the inside
 193  *              of a nestloop join (hence, some of the indexQuals are join clauses)
 194  *
 195  * NOTE: 'indexQuals' must contain only clauses usable as index restrictions.
 196  * Any additional quals evaluated as qpquals may reduce the number of returned
 197  * tuples, but they won't reduce the number of tuples we have to fetch from
 198  * the table, so they don't reduce the scan cost.
 199  */
 200 void
 201 cost_index(Path *path, Query *root,
 202                    RelOptInfo *baserel,
 203                    IndexOptInfo *index,
 204                    List *indexQuals,
 205                    bool is_injoin)
 206 {
 207         Cost            startup_cost = 0;
 208         Cost            run_cost = 0;
 209         Cost            indexStartupCost;
 210         Cost            indexTotalCost;
 211         Selectivity indexSelectivity;
 212         double          indexCorrelation,
 213                                 csquared;
 214         Cost            min_IO_cost,
 215                                 max_IO_cost;
 216         Cost            cpu_per_tuple;
 217         double          tuples_fetched;
 218         double          pages_fetched;
 219         double          T,
 220                                 b;
 221
 222         /* Should only be applied to base relations */
 223         Assert(IsA(baserel, RelOptInfo) &&IsA(index, IndexOptInfo));
 224         Assert(length(baserel->relids) == 1);
 225         Assert(!baserel->issubquery);
 226
 227         if (!enable_indexscan && !is_injoin)
 228                 startup_cost += disable_cost;
 229
 230         /*
 231          * Call index-access-method-specific code to estimate the processing
 232          * cost for scanning the index, as well as the selectivity of the
 233          * index (ie, the fraction of main-table tuples we will have to
 234          * retrieve) and its correlation to the main-table tuple order.
 235          */
 236         OidFunctionCall8(index->amcostestimate,
 237                                          PointerGetDatum(root),
 238                                          PointerGetDatum(baserel),
 239                                          PointerGetDatum(index),
 240                                          PointerGetDatum(indexQuals),
 241                                          PointerGetDatum(&indexStartupCost),
 242                                          PointerGetDatum(&indexTotalCost),
 243                                          PointerGetDatum(&indexSelectivity),
 244                                          PointerGetDatum(&indexCorrelation));
 245
 246         /* all costs for touching index itself included here */
 247         startup_cost += indexStartupCost;
 248         run_cost += indexTotalCost - indexStartupCost;
 249
 250         /*----------
 251          * Estimate number of main-table tuples and pages fetched.
 252          *
 253          * When the index ordering is uncorrelated with the table ordering,
 254          * we use an approximation proposed by Mackert and Lohman, "Index Scans
 255          * Using a Finite LRU Buffer: A Validated I/O Model", ACM Transactions
 256          * on Database Systems, Vol. 14, No. 3, September 1989, Pages 401-424.
 257          * The Mackert and Lohman approximation is that the number of pages
 258          * fetched is
 259          *      PF =
 260          *              min(2TNs/(2T+Ns), T)                    when T <= b
 261          *              2TNs/(2T+Ns)                                    when T > b and Ns <= 2Tb/(2T-b)
 262          *              b + (Ns - 2Tb/(2T-b))*(T-b)/T   when T > b and Ns > 2Tb/(2T-b)
 263          * where
 264          *              T = # pages in table
 265          *              N = # tuples in table
 266          *              s = selectivity = fraction of table to be scanned
 267          *              b = # buffer pages available (we include kernel space here)
 268          *
 269          * When the index ordering is exactly correlated with the table ordering
 270          * (just after a CLUSTER, for example), the number of pages fetched should
 271          * be just sT.  What's more, these will be sequential fetches, not the
 272          * random fetches that occur in the uncorrelated case.  So, depending on
 273          * the extent of correlation, we should estimate the actual I/O cost
 274          * somewhere between s * T * 1.0 and PF * random_cost.  We currently
 275          * interpolate linearly between these two endpoints based on the
 276          * correlation squared (XXX is that appropriate?).
 277          *
 278          * In any case the number of tuples fetched is Ns.
 279          *----------
 280          */
 281
 282         tuples_fetched = indexSelectivity * baserel->tuples;
 283         /* Don't believe estimates less than 1... */
 284         if (tuples_fetched < 1.0)
 285                 tuples_fetched = 1.0;
 286
 287         /* This part is the Mackert and Lohman formula */
 288
 289         T = (baserel->pages > 1) ? (double) baserel->pages : 1.0;
 290         b = (effective_cache_size > 1) ? effective_cache_size : 1.0;
 291
 292         if (T <= b)
 293         {
 294                 pages_fetched =
 295                         (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
 296                 if (pages_fetched > T)
 297                         pages_fetched = T;
 298         }
 299         else
 300         {
 301                 double  lim;
 302
 303                 lim = (2.0 * T * b) / (2.0 * T - b);
 304                 if (tuples_fetched <= lim)
 305                 {
 306                         pages_fetched =
 307                                 (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
 308                 }
 309                 else
 310                 {
 311                         pages_fetched =
 312                                 b + (tuples_fetched - lim) * (T - b) / T;
 313                 }
 314         }
 315
 316         /*
 317          * min_IO_cost corresponds to the perfectly correlated case (csquared=1),
 318          * max_IO_cost to the perfectly uncorrelated case (csquared=0).  Note
 319          * that we just charge random_page_cost per page in the uncorrelated
 320          * case, rather than using cost_nonsequential_access, since we've already
 321          * accounted for caching effects by using the Mackert model.
 322          */
 323         min_IO_cost = ceil(indexSelectivity * T);
 324         max_IO_cost = pages_fetched * random_page_cost;
 325
 326         /*
 327          * Now interpolate based on estimated index order correlation
 328          * to get total disk I/O cost for main table accesses.
 329          */
 330         csquared = indexCorrelation * indexCorrelation;
 331
 332         run_cost += max_IO_cost + csquared * (min_IO_cost - max_IO_cost);
 333
 334         /*
 335          * Estimate CPU costs per tuple.
 336          *
 337          * Normally the indexquals will be removed from the list of
 338          * restriction clauses that we have to evaluate as qpquals, so we
 339          * should subtract their costs from baserestrictcost.  For a lossy
 340          * index, however, we will have to recheck all the quals and so
 341          * mustn't subtract anything. Also, if we are doing a join then some
 342          * of the indexquals are join clauses and shouldn't be subtracted.
 343          * Rather than work out exactly how much to subtract, we don't
 344          * subtract anything in that case either.
 345          */
 346         cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost;
 347
 348         if (!index->lossy && !is_injoin)
 349                 cpu_per_tuple -= cost_qual_eval(indexQuals);
 350
 351         run_cost += cpu_per_tuple * tuples_fetched;
 352
 353         path->startup_cost = startup_cost;
 354         path->total_cost = startup_cost + run_cost;
 355 }
 356
 357 /*
 358  * cost_tidscan
 359  *        Determines and returns the cost of scanning a relation using tid-s.
 360  */
 361 void
 362 cost_tidscan(Path *path, RelOptInfo *baserel, List *tideval)
 363 {
 364         Cost            startup_cost = 0;
 365         Cost            run_cost = 0;
 366         Cost            cpu_per_tuple;
 367         int                     ntuples = length(tideval);
 368
 369         if (!enable_tidscan)
 370                 startup_cost += disable_cost;
 371
 372         /* disk costs --- assume each tuple on a different page */
 373         run_cost += random_page_cost * ntuples;
 374
 375         /* CPU costs */
 376         cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost;
 377         run_cost += cpu_per_tuple * ntuples;
 378
 379         path->startup_cost = startup_cost;
 380         path->total_cost = startup_cost + run_cost;
 381 }
 382
 383 /*
 384  * cost_sort
 385  *        Determines and returns the cost of sorting a relation.
 386  *
 387  * The cost of supplying the input data is NOT included; the caller should
 388  * add that cost to both startup and total costs returned from this routine!
 389  *
 390  * If the total volume of data to sort is less than SortMem, we will do
 391  * an in-memory sort, which requires no I/O and about t*log2(t) tuple
 392  * comparisons for t tuples.
 393  *
 394  * If the total volume exceeds SortMem, we switch to a tape-style merge
 395  * algorithm.  There will still be about t*log2(t) tuple comparisons in
 396  * total, but we will also need to write and read each tuple once per
 397  * merge pass.  We expect about ceil(log6(r)) merge passes where r is the
 398  * number of initial runs formed (log6 because tuplesort.c uses six-tape
 399  * merging).  Since the average initial run should be about twice SortMem,
 400  * we have
 401  *              disk traffic = 2 * relsize * ceil(log6(p / (2*SortMem)))
 402  *              cpu = comparison_cost * t * log2(t)
 403  *
 404  * The disk traffic is assumed to be half sequential and half random
 405  * accesses (XXX can't we refine that guess?)
 406  *
 407  * We charge two operator evals per tuple comparison, which should be in
 408  * the right ballpark in most cases.
 409  *
 410  * 'pathkeys' is a list of sort keys
 411  * 'tuples' is the number of tuples in the relation
 412  * 'width' is the average tuple width in bytes
 413  *
 414  * NOTE: some callers currently pass NIL for pathkeys because they
 415  * can't conveniently supply the sort keys.  Since this routine doesn't
 416  * currently do anything with pathkeys anyway, that doesn't matter...
 417  * but if it ever does, it should react gracefully to lack of key data.
 418  */
 419 void
 420 cost_sort(Path *path, List *pathkeys, double tuples, int width)
 421 {
 422         Cost            startup_cost = 0;
 423         Cost            run_cost = 0;
 424         double          nbytes = relation_byte_size(tuples, width);
 425         long            sortmembytes = SortMem * 1024L;
 426
 427         if (!enable_sort)
 428                 startup_cost += disable_cost;
 429
 430         /*
 431          * We want to be sure the cost of a sort is never estimated as zero,
 432          * even if passed-in tuple count is zero.  Besides, mustn't do
 433          * log(0)...
 434          */
 435         if (tuples < 2.0)
 436                 tuples = 2.0;
 437
 438         /*
 439          * CPU costs
 440          *
 441          * Assume about two operator evals per tuple comparison and N log2 N
 442          * comparisons
 443          */
 444         startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples);
 445
 446         /* disk costs */
 447         if (nbytes > sortmembytes)
 448         {
 449                 double          npages = ceil(nbytes / BLCKSZ);
 450                 double          nruns = nbytes / (sortmembytes * 2);
 451                 double          log_runs = ceil(LOG6(nruns));
 452                 double          npageaccesses;
 453
 454                 if (log_runs < 1.0)
 455                         log_runs = 1.0;
 456                 npageaccesses = 2.0 * npages * log_runs;
 457                 /* Assume half are sequential (cost 1), half are not */
 458                 startup_cost += npageaccesses *
 459                         (1.0 + cost_nonsequential_access(npages)) * 0.5;
 460         }
 461
 462         /*
 463          * Note: should we bother to assign a nonzero run_cost to reflect the
 464          * overhead of extracting tuples from the sort result?  Probably not
 465          * worth worrying about.
 466          */
 467         path->startup_cost = startup_cost;
 468         path->total_cost = startup_cost + run_cost;
 469 }
 470
 471
 472 /*
 473  * cost_nestloop
 474  *        Determines and returns the cost of joining two relations using the
 475  *        nested loop algorithm.
 476  *
 477  * 'outer_path' is the path for the outer relation
 478  * 'inner_path' is the path for the inner relation
 479  * 'restrictlist' are the RestrictInfo nodes to be applied at the join
 480  */
 481 void
 482 cost_nestloop(Path *path,
 483                           Path *outer_path,
 484                           Path *inner_path,
 485                           List *restrictlist)
 486 {
 487         Cost            startup_cost = 0;
 488         Cost            run_cost = 0;
 489         Cost            cpu_per_tuple;
 490         double          ntuples;
 491
 492         if (!enable_nestloop)
 493                 startup_cost += disable_cost;
 494
 495         /* cost of source data */
 496
 497         /*
 498          * NOTE: clearly, we must pay both outer and inner paths' startup_cost
 499          * before we can start returning tuples, so the join's startup cost
 500          * is their sum.  What's not so clear is whether the inner path's
 501          * startup_cost must be paid again on each rescan of the inner path.
 502          * This is not true if the inner path is materialized, but probably
 503          * is true otherwise.  Since we don't yet have clean handling of the
 504          * decision whether to materialize a path, we can't tell here which
 505          * will happen.  As a compromise, charge 50% of the inner startup cost
 506          * for each restart.
 507          */
 508         startup_cost += outer_path->startup_cost + inner_path->startup_cost;
 509         run_cost += outer_path->total_cost - outer_path->startup_cost;
 510         run_cost += outer_path->parent->rows *
 511                 (inner_path->total_cost - inner_path->startup_cost);
 512         if (outer_path->parent->rows > 1)
 513                 run_cost += (outer_path->parent->rows - 1) * inner_path->startup_cost;
 514
 515         /*
 516          * Number of tuples processed (not number emitted!).  If inner path is
 517          * an indexscan, be sure to use its estimated output row count, which
 518          * may be lower than the restriction-clause-only row count of its
 519          * parent.
 520          */
 521         if (IsA(inner_path, IndexPath))
 522                 ntuples = ((IndexPath *) inner_path)->rows;
 523         else
 524                 ntuples = inner_path->parent->rows;
 525         ntuples *= outer_path->parent->rows;
 526
 527         /* CPU costs */
 528         cpu_per_tuple = cpu_tuple_cost + cost_qual_eval(restrictlist);
 529         run_cost += cpu_per_tuple * ntuples;
 530
 531         path->startup_cost = startup_cost;
 532         path->total_cost = startup_cost + run_cost;
 533 }
 534
 535 /*
 536  * cost_mergejoin
 537  *        Determines and returns the cost of joining two relations using the
 538  *        merge join algorithm.
 539  *
 540  * 'outer_path' is the path for the outer relation
 541  * 'inner_path' is the path for the inner relation
 542  * 'restrictlist' are the RestrictInfo nodes to be applied at the join
 543  * 'outersortkeys' and 'innersortkeys' are lists of the keys to be used
 544  *                              to sort the outer and inner relations, or NIL if no explicit
 545  *                              sort is needed because the source path is already ordered
 546  */
 547 void
 548 cost_mergejoin(Path *path,
 549                            Path *outer_path,
 550                            Path *inner_path,
 551                            List *restrictlist,
 552                            List *outersortkeys,
 553                            List *innersortkeys)
 554 {
 555         Cost            startup_cost = 0;
 556         Cost            run_cost = 0;
 557         Cost            cpu_per_tuple;
 558         double          ntuples;
 559         Path            sort_path;              /* dummy for result of cost_sort */
 560
 561         if (!enable_mergejoin)
 562                 startup_cost += disable_cost;
 563
 564         /* cost of source data */
 565
 566         /*
 567          * Note we are assuming that each source tuple is fetched just once,
 568          * which is not right in the presence of equal keys.  If we had a way
 569          * of estimating the proportion of equal keys, we could apply a
 570          * correction factor...
 571          */
 572         if (outersortkeys)                      /* do we need to sort outer? */
 573         {
 574                 startup_cost += outer_path->total_cost;
 575                 cost_sort(&sort_path,
 576                                   outersortkeys,
 577                                   outer_path->parent->rows,
 578                                   outer_path->parent->width);
 579                 startup_cost += sort_path.startup_cost;
 580                 run_cost += sort_path.total_cost - sort_path.startup_cost;
 581         }
 582         else
 583         {
 584                 startup_cost += outer_path->startup_cost;
 585                 run_cost += outer_path->total_cost - outer_path->startup_cost;
 586         }
 587
 588         if (innersortkeys)                      /* do we need to sort inner? */
 589         {
 590                 startup_cost += inner_path->total_cost;
 591                 cost_sort(&sort_path,
 592                                   innersortkeys,
 593                                   inner_path->parent->rows,
 594                                   inner_path->parent->width);
 595                 startup_cost += sort_path.startup_cost;
 596                 run_cost += sort_path.total_cost - sort_path.startup_cost;
 597         }
 598         else
 599         {
 600                 startup_cost += inner_path->startup_cost;
 601                 run_cost += inner_path->total_cost - inner_path->startup_cost;
 602         }
 603
 604         /*
 605          * Estimate the number of tuples to be processed in the mergejoin
 606          * itself as one per tuple in the two source relations.  This could be
 607          * a drastic underestimate if there are many equal-keyed tuples in
 608          * either relation, but we have no good way of estimating that...
 609          */
 610         ntuples = outer_path->parent->rows + inner_path->parent->rows;
 611
 612         /* CPU costs */
 613         cpu_per_tuple = cpu_tuple_cost + cost_qual_eval(restrictlist);
 614         run_cost += cpu_per_tuple * ntuples;
 615
 616         path->startup_cost = startup_cost;
 617         path->total_cost = startup_cost + run_cost;
 618 }
 619
 620 /*
 621  * cost_hashjoin
 622  *        Determines and returns the cost of joining two relations using the
 623  *        hash join algorithm.
 624  *
 625  * 'outer_path' is the path for the outer relation
 626  * 'inner_path' is the path for the inner relation
 627  * 'restrictlist' are the RestrictInfo nodes to be applied at the join
 628  * 'innerbucketsize' is an estimate of the bucketsize statistic
 629  *                              for the inner hash key.
 630  */
 631 void
 632 cost_hashjoin(Path *path,
 633                           Path *outer_path,
 634                           Path *inner_path,
 635                           List *restrictlist,
 636                           Selectivity innerbucketsize)
 637 {
 638         Cost            startup_cost = 0;
 639         Cost            run_cost = 0;
 640         Cost            cpu_per_tuple;
 641         double          ntuples;
 642         double          outerbytes = relation_byte_size(outer_path->parent->rows,
 643                                                                                           outer_path->parent->width);
 644         double          innerbytes = relation_byte_size(inner_path->parent->rows,
 645                                                                                           inner_path->parent->width);
 646         long            hashtablebytes = SortMem * 1024L;
 647
 648         if (!enable_hashjoin)
 649                 startup_cost += disable_cost;
 650
 651         /* cost of source data */
 652         startup_cost += outer_path->startup_cost;
 653         run_cost += outer_path->total_cost - outer_path->startup_cost;
 654         startup_cost += inner_path->total_cost;
 655
 656         /* cost of computing hash function: must do it once per input tuple */
 657         startup_cost += cpu_operator_cost * inner_path->parent->rows;
 658         run_cost += cpu_operator_cost * outer_path->parent->rows;
 659
 660         /*
 661          * The number of tuple comparisons needed is the number of outer
 662          * tuples times the typical number of tuples in a hash bucket,
 663          * which is the inner relation size times its bucketsize fraction.
 664          * We charge one cpu_operator_cost per tuple comparison.
 665          */
 666         run_cost += cpu_operator_cost * outer_path->parent->rows *
 667                 ceil(inner_path->parent->rows * innerbucketsize);
 668
 669         /*
 670          * Estimate the number of tuples that get through the hashing filter
 671          * as one per tuple in the two source relations.  This could be a
 672          * drastic underestimate if there are many equal-keyed tuples in
 673          * either relation, but we have no simple way of estimating that;
 674          * and since this is only a second-order parameter, it's probably
 675          * not worth expending a lot of effort on the estimate.
 676          */
 677         ntuples = outer_path->parent->rows + inner_path->parent->rows;
 678
 679         /* CPU costs */
 680         cpu_per_tuple = cpu_tuple_cost + cost_qual_eval(restrictlist);
 681         run_cost += cpu_per_tuple * ntuples;
 682
 683         /*
 684          * if inner relation is too big then we will need to "batch" the join,
 685          * which implies writing and reading most of the tuples to disk an
 686          * extra time.  Charge one cost unit per page of I/O (correct since it
 687          * should be nice and sequential...).  Writing the inner rel counts as
 688          * startup cost, all the rest as run cost.
 689          */
 690         if (innerbytes > hashtablebytes)
 691         {
 692                 double          outerpages = page_size(outer_path->parent->rows,
 693                                                                                    outer_path->parent->width);
 694                 double          innerpages = page_size(inner_path->parent->rows,
 695                                                                                    inner_path->parent->width);
 696
 697                 startup_cost += innerpages;
 698                 run_cost += innerpages + 2 * outerpages;
 699         }
 700
 701         /*
 702          * Bias against putting larger relation on inside.      We don't want an
 703          * absolute prohibition, though, since larger relation might have
 704          * better bucketsize --- and we can't trust the size estimates
 705          * unreservedly, anyway.  Instead, inflate the startup cost by the
 706          * square root of the size ratio.  (Why square root?  No real good
 707          * reason, but it seems reasonable...)
 708          */
 709         if (innerbytes > outerbytes && outerbytes > 0)
 710                 startup_cost *= sqrt(innerbytes / outerbytes);
 711
 712         path->startup_cost = startup_cost;
 713         path->total_cost = startup_cost + run_cost;
 714 }
 715
 716 /*
 717  * Estimate hash bucketsize fraction (ie, number of entries in a bucket
 718  * divided by total tuples in relation) if the specified Var is used
 719  * as a hash key.
 720  *
 721  * This statistic is used by cost_hashjoin.  We split out the calculation
 722  * because it's useful to cache the result for re-use across multiple path
 723  * cost calculations.
 724  *
 725  * XXX This is really pretty bogus since we're effectively assuming that the
 726  * distribution of hash keys will be the same after applying restriction
 727  * clauses as it was in the underlying relation.  However, we are not nearly
 728  * smart enough to figure out how the restrict clauses might change the
 729  * distribution, so this will have to do for now.
 730  *
 731  * The executor tries for average bucket loading of NTUP_PER_BUCKET by setting
 732  * number of buckets equal to ntuples / NTUP_PER_BUCKET, which would yield
 733  * a bucketsize fraction of NTUP_PER_BUCKET / ntuples.  But that goal will
 734  * be reached only if the data values are uniformly distributed among the
 735  * buckets, which requires (a) at least ntuples / NTUP_PER_BUCKET distinct
 736  * data values, and (b) a not-too-skewed data distribution.  Otherwise the
 737  * buckets will be nonuniformly occupied.  If the other relation in the join
 738  * has a similar distribution, the most-loaded buckets are exactly those
 739  * that will be probed most often.  Therefore, the "average" bucket size for
 740  * costing purposes should really be taken as something close to the "worst
 741  * case" bucket size.  We try to estimate this by first scaling up if there
 742  * are too few distinct data values, and then scaling up again by the
 743  * ratio of the most common value's frequency to the average frequency.
 744  *
 745  * If no statistics are available, use a default estimate of 0.1.  This will
 746  * discourage use of a hash rather strongly if the inner relation is large,
 747  * which is what we want.  We do not want to hash unless we know that the
 748  * inner rel is well-dispersed (or the alternatives seem much worse).
 749  */
 750 Selectivity
 751 estimate_hash_bucketsize(Query *root, Var *var)
 752 {
 753         Oid                     relid;
 754         RelOptInfo *rel;
 755         HeapTuple       tuple;
 756         Form_pg_statistic stats;
 757         double          estfract,
 758                                 ndistinct,
 759                                 needdistinct,
 760                                 mcvfreq,
 761                                 avgfreq;
 762         float4     *numbers;
 763         int                     nnumbers;
 764
 765         /*
 766          * Lookup info about var's relation and attribute;
 767          * if none available, return default estimate.
 768          */
 769         if (!IsA(var, Var))
 770                 return 0.1;
 771
 772         relid = getrelid(var->varno, root->rtable);
 773         if (relid == InvalidOid)
 774                 return 0.1;
 775
 776         rel = find_base_rel(root, var->varno);
 777
 778         if (rel->tuples <= 0.0 || rel->rows <= 0.0)
 779                 return 0.1;                             /* ensure we can divide below */
 780
 781         tuple = SearchSysCache(STATRELATT,
 782                                                    ObjectIdGetDatum(relid),
 783                                                    Int16GetDatum(var->varattno),
 784                                                    0, 0);
 785         if (!HeapTupleIsValid(tuple))
 786         {
 787                 /*
 788                  * Perhaps the Var is a system attribute; if so, it will have no
 789                  * entry in pg_statistic, but we may be able to guess something
 790                  * about its distribution anyway.
 791                  */
 792                 switch (var->varattno)
 793                 {
 794                         case ObjectIdAttributeNumber:
 795                         case SelfItemPointerAttributeNumber:
 796                                 /* these are unique, so buckets should be well-distributed */
 797                                 return (double) NTUP_PER_BUCKET / rel->rows;
 798                         case TableOidAttributeNumber:
 799                                 /* hashing this is a terrible idea... */
 800                                 return 1.0;
 801                 }
 802                 return 0.1;
 803         }
 804         stats = (Form_pg_statistic) GETSTRUCT(tuple);
 805
 806         /*
 807          * Obtain number of distinct data values in raw relation.
 808          */
 809         ndistinct = stats->stadistinct;
 810         if (ndistinct < 0.0)
 811                 ndistinct = -ndistinct * rel->tuples;
 812
 813         /*
 814          * Adjust ndistinct to account for restriction clauses.  Observe we are
 815          * assuming that the data distribution is affected uniformly by the
 816          * restriction clauses!
 817          *
 818          * XXX Possibly better way, but much more expensive: multiply by
 819          * selectivity of rel's restriction clauses that mention the target Var.
 820          */
 821         ndistinct *= rel->rows / rel->tuples;
 822
 823         /*
 824          * Discourage use of hash join if there seem not to be very many distinct
 825          * data values.  The threshold here is somewhat arbitrary, as is the
 826          * fraction used to "discourage" the choice.
 827          */
 828         if (ndistinct < 50.0)
 829         {
 830                 ReleaseSysCache(tuple);
 831                 return 0.5;
 832         }
 833
 834         /*
 835          * Form initial estimate of bucketsize fraction.  Here we use rel->rows,
 836          * ie the number of rows after applying restriction clauses, because
 837          * that's what the fraction will eventually be multiplied by in
 838          * cost_heapjoin.
 839          */
 840         estfract = (double) NTUP_PER_BUCKET / rel->rows;
 841
 842         /*
 843          * Adjust estimated bucketsize if too few distinct values to fill
 844          * all the buckets.
 845          */
 846         needdistinct = rel->rows / (double) NTUP_PER_BUCKET;
 847         if (ndistinct < needdistinct)
 848                 estfract *= needdistinct / ndistinct;
 849
 850         /*
 851          * Look up the frequency of the most common value, if available.
 852          */
 853         mcvfreq = 0.0;
 854
 855         if (get_attstatsslot(tuple, var->vartype, var->vartypmod,
 856                                                  STATISTIC_KIND_MCV, InvalidOid,
 857                                                  NULL, NULL, &numbers, &nnumbers))
 858         {
 859                 /*
 860                  * The first MCV stat is for the most common value.
 861                  */
 862                 if (nnumbers > 0)
 863                         mcvfreq = numbers[0];
 864                 free_attstatsslot(var->vartype, NULL, 0,
 865                                                   numbers, nnumbers);
 866         }
 867
 868         /*
 869          * Adjust estimated bucketsize upward to account for skewed distribution.
 870          */
 871         avgfreq = (1.0 - stats->stanullfrac) / ndistinct;
 872
 873         if (avgfreq > 0.0 && mcvfreq > avgfreq)
 874                 estfract *= mcvfreq / avgfreq;
 875
 876         ReleaseSysCache(tuple);
 877
 878         return (Selectivity) estfract;
 879 }
 880
 881
 882 /*
 883  * cost_qual_eval
 884  *              Estimate the CPU cost of evaluating a WHERE clause (once).
 885  *              The input can be either an implicitly-ANDed list of boolean
 886  *              expressions, or a list of RestrictInfo nodes.
 887  */
 888 Cost
 889 cost_qual_eval(List *quals)
 890 {
 891         Cost            total = 0;
 892         List       *l;
 893
 894         /* We don't charge any cost for the implicit ANDing at top level ... */
 895
 896         foreach(l, quals)
 897         {
 898                 Node       *qual = (Node *) lfirst(l);
 899
 900                 /*
 901                  * RestrictInfo nodes contain an eval_cost field reserved for this
 902                  * routine's use, so that it's not necessary to evaluate the qual
 903                  * clause's cost more than once.  If the clause's cost hasn't been
 904                  * computed yet, the field will contain -1.
 905                  */
 906                 if (qual && IsA(qual, RestrictInfo))
 907                 {
 908                         RestrictInfo *restrictinfo = (RestrictInfo *) qual;
 909
 910                         if (restrictinfo->eval_cost < 0)
 911                         {
 912                                 restrictinfo->eval_cost = 0;
 913                                 cost_qual_eval_walker((Node *) restrictinfo->clause,
 914                                                                           &restrictinfo->eval_cost);
 915                         }
 916                         total += restrictinfo->eval_cost;
 917                 }
 918                 else
 919                 {
 920                         /* If it's a bare expression, must always do it the hard way */
 921                         cost_qual_eval_walker(qual, &total);
 922                 }
 923         }
 924         return total;
 925 }
 926
 927 static bool
 928 cost_qual_eval_walker(Node *node, Cost *total)
 929 {
 930         if (node == NULL)
 931                 return false;
 932
 933         /*
 934          * Our basic strategy is to charge one cpu_operator_cost for each
 935          * operator or function node in the given tree.  Vars and Consts are
 936          * charged zero, and so are boolean operators (AND, OR, NOT).
 937          * Simplistic, but a lot better than no model at all.
 938          *
 939          * Should we try to account for the possibility of short-circuit
 940          * evaluation of AND/OR?
 941          */
 942         if (IsA(node, Expr))
 943         {
 944                 Expr       *expr = (Expr *) node;
 945
 946                 switch (expr->opType)
 947                 {
 948                         case OP_EXPR:
 949                         case FUNC_EXPR:
 950                                 *total += cpu_operator_cost;
 951                                 break;
 952                         case OR_EXPR:
 953                         case AND_EXPR:
 954                         case NOT_EXPR:
 955                                 break;
 956                         case SUBPLAN_EXPR:
 957
 958                                 /*
 959                                  * A subplan node in an expression indicates that the
 960                                  * subplan will be executed on each evaluation, so charge
 961                                  * accordingly. (We assume that sub-selects that can be
 962                                  * executed as InitPlans have already been removed from
 963                                  * the expression.)
 964                                  *
 965                                  * NOTE: this logic should agree with the estimates used by
 966                                  * make_subplan() in plan/subselect.c.
 967                                  */
 968                                 {
 969                                         SubPlan    *subplan = (SubPlan *) expr->oper;
 970                                         Plan       *plan = subplan->plan;
 971                                         Cost            subcost;
 972
 973                                         if (subplan->sublink->subLinkType == EXISTS_SUBLINK)
 974                                         {
 975                                                 /* we only need to fetch 1 tuple */
 976                                                 subcost = plan->startup_cost +
 977                                                         (plan->total_cost - plan->startup_cost) / plan->plan_rows;
 978                                         }
 979                                         else if (subplan->sublink->subLinkType == ALL_SUBLINK ||
 980                                                          subplan->sublink->subLinkType == ANY_SUBLINK)
 981                                         {
 982                                                 /* assume we need 50% of the tuples */
 983                                                 subcost = plan->startup_cost +
 984                                                         0.50 * (plan->total_cost - plan->startup_cost);
 985                                                 /* XXX what if subplan has been materialized? */
 986                                         }
 987                                         else
 988                                         {
 989                                                 /* assume we need all tuples */
 990                                                 subcost = plan->total_cost;
 991                                         }
 992                                         *total += subcost;
 993                                 }
 994                                 break;
 995                 }
 996                 /* fall through to examine args of Expr node */
 997         }
 998         return expression_tree_walker(node, cost_qual_eval_walker,
 999                                                                   (void *) total);
1000 }
1001
1002
1003 /*
1004  * set_baserel_size_estimates
1005  *              Set the size estimates for the given base relation.
1006  *
1007  * The rel's targetlist and restrictinfo list must have been constructed
1008  * already.
1009  *
1010  * We set the following fields of the rel node:
1011  *      rows: the estimated number of output tuples (after applying
1012  *                restriction clauses).
1013  *      width: the estimated average output tuple width in bytes.
1014  *      baserestrictcost: estimated cost of evaluating baserestrictinfo clauses.
1015  */
1016 void
1017 set_baserel_size_estimates(Query *root, RelOptInfo *rel)
1018 {
1019         /* Should only be applied to base relations */
1020         Assert(length(rel->relids) == 1);
1021
1022         rel->rows = rel->tuples *
1023                 restrictlist_selectivity(root,
1024                                                                  rel->baserestrictinfo,
1025                                                                  lfirsti(rel->relids));
1026
1027         /*
1028          * Force estimate to be at least one row, to make explain output look
1029          * better and to avoid possible divide-by-zero when interpolating
1030          * cost.
1031          */
1032         if (rel->rows < 1.0)
1033                 rel->rows = 1.0;
1034
1035         rel->baserestrictcost = cost_qual_eval(rel->baserestrictinfo);
1036
1037         set_rel_width(root, rel);
1038 }
1039
1040 /*
1041  * set_joinrel_size_estimates
1042  *              Set the size estimates for the given join relation.
1043  *
1044  * The rel's targetlist must have been constructed already, and a
1045  * restriction clause list that matches the given component rels must
1046  * be provided.
1047  *
1048  * Since there is more than one way to make a joinrel for more than two
1049  * base relations, the results we get here could depend on which component
1050  * rel pair is provided.  In theory we should get the same answers no matter
1051  * which pair is provided; in practice, since the selectivity estimation
1052  * routines don't handle all cases equally well, we might not.  But there's
1053  * not much to be done about it.  (Would it make sense to repeat the
1054  * calculations for each pair of input rels that's encountered, and somehow
1055  * average the results?  Probably way more trouble than it's worth.)
1056  *
1057  * We set the same relnode fields as set_baserel_size_estimates() does.
1058  */
1059 void
1060 set_joinrel_size_estimates(Query *root, RelOptInfo *rel,
1061                                                    RelOptInfo *outer_rel,
1062                                                    RelOptInfo *inner_rel,
1063                                                    JoinType jointype,
1064                                                    List *restrictlist)
1065 {
1066         double          temp;
1067
1068         /* Start with the Cartesian product */
1069         temp = outer_rel->rows * inner_rel->rows;
1070
1071         /*
1072          * Apply join restrictivity.  Note that we are only considering
1073          * clauses that become restriction clauses at this join level; we are
1074          * not double-counting them because they were not considered in
1075          * estimating the sizes of the component rels.
1076          */
1077         temp *= restrictlist_selectivity(root,
1078                                                                          restrictlist,
1079                                                                          0);
1080
1081         /*
1082          * If we are doing an outer join, take that into account: the output
1083          * must be at least as large as the non-nullable input.  (Is there any
1084          * chance of being even smarter?)
1085          */
1086         switch (jointype)
1087         {
1088                 case JOIN_INNER:
1089                         break;
1090                 case JOIN_LEFT:
1091                         if (temp < outer_rel->rows)
1092                                 temp = outer_rel->rows;
1093                         break;
1094                 case JOIN_RIGHT:
1095                         if (temp < inner_rel->rows)
1096                                 temp = inner_rel->rows;
1097                         break;
1098                 case JOIN_FULL:
1099                         if (temp < outer_rel->rows)
1100                                 temp = outer_rel->rows;
1101                         if (temp < inner_rel->rows)
1102                                 temp = inner_rel->rows;
1103                         break;
1104                 default:
1105                         elog(ERROR, "set_joinrel_size_estimates: unsupported join type %d",
1106                                  (int) jointype);
1107                         break;
1108         }
1109
1110         /*
1111          * Force estimate to be at least one row, to make explain output look
1112          * better and to avoid possible divide-by-zero when interpolating
1113          * cost.
1114          */
1115         if (temp < 1.0)
1116                 temp = 1.0;
1117
1118         rel->rows = temp;
1119
1120         /*
1121          * We could apply set_rel_width() to compute the output tuple width
1122          * from scratch, but at present it's always just the sum of the input
1123          * widths, so why work harder than necessary?  If relnode.c is ever
1124          * taught to remove unneeded columns from join targetlists, go back to
1125          * using set_rel_width here.
1126          */
1127         rel->width = outer_rel->width + inner_rel->width;
1128 }
1129
1130 /*
1131  * set_rel_width
1132  *              Set the estimated output width of the relation.
1133  *
1134  * NB: this works best on base relations because it prefers to look at
1135  * real Vars.  It will fail to make use of pg_statistic info when applied
1136  * to a subquery relation, even if the subquery outputs are simple vars
1137  * that we could have gotten info for.  Is it worth trying to be smarter
1138  * about subqueries?
1139  */
1140 static void
1141 set_rel_width(Query *root, RelOptInfo *rel)
1142 {
1143         int32           tuple_width = 0;
1144         List       *tllist;
1145
1146         foreach(tllist, rel->targetlist)
1147         {
1148                 TargetEntry *tle = (TargetEntry *) lfirst(tllist);
1149                 int32   item_width;
1150
1151                 /*
1152                  * If it's a Var, try to get statistical info from pg_statistic.
1153                  */
1154                 if (tle->expr && IsA(tle->expr, Var))
1155                 {
1156                         Var        *var = (Var *) tle->expr;
1157                         Oid             relid;
1158
1159                         relid = getrelid(var->varno, root->rtable);
1160                         if (relid != InvalidOid)
1161                         {
1162                                 item_width = get_attavgwidth(relid, var->varattno);
1163                                 if (item_width > 0)
1164                                 {
1165                                         tuple_width += item_width;
1166                                         continue;
1167                                 }
1168                         }
1169                 }
1170                 /*
1171                  * Not a Var, or can't find statistics for it.  Estimate using
1172                  * just the type info.
1173                  */
1174                 item_width = get_typavgwidth(tle->resdom->restype,
1175                                                                          tle->resdom->restypmod);
1176                 Assert(item_width > 0);
1177                 tuple_width += item_width;
1178         }
1179         Assert(tuple_width >= 0);
1180         rel->width = tuple_width;
1181 }
1182
1183 /*
1184  * relation_byte_size
1185  *        Estimate the storage space in bytes for a given number of tuples
1186  *        of a given width (size in bytes).
1187  */
1188 static double
1189 relation_byte_size(double tuples, int width)
1190 {
1191         return tuples * ((double) MAXALIGN(width + sizeof(HeapTupleData)));
1192 }
1193
1194 /*
1195  * page_size
1196  *        Returns an estimate of the number of pages covered by a given
1197  *        number of tuples of a given width (size in bytes).
1198  */
1199 static double
1200 page_size(double tuples, int width)
1201 {
1202         return ceil(relation_byte_size(tuples, width) / BLCKSZ);
1203 }