From 8a30cc212745681e5328c030f8fc5d210d733b92 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 6 Jun 2006 17:59:58 +0000 Subject: [PATCH] Make the planner estimate costs for nestloop inner indexscans on the basis that the Mackert-Lohmann formula applies across all the repetitions of the nestloop, not just each scan independently. We use the M-L formula to estimate the number of pages fetched from the index as well as from the table; that isn't what it was designed for, but it seems reasonably applicable anyway. This makes large numbers of repetitions look much cheaper than before, which accords with many reports we've received of overestimation of the cost of a nestloop. Also, change the index access cost model to charge random_page_cost per index leaf page touched, while explicitly not counting anything for access to metapage or upper tree pages. This may all need tweaking after we get some field experience, but in simple tests it seems to be giving saner results than before. The main thing is to get the infrastructure in place to let cost_index() and amcostestimate functions take repeated scans into account at all. Per my recent proposal. Note: this patch changes pg_proc.h, but I did not force initdb because the changes are basically cosmetic --- the system does not look into pg_proc to decide how to call an index amcostestimate function, and there's no way to call such a function from SQL at all. --- doc/src/sgml/indexam.sgml | 26 ++- src/backend/optimizer/path/costsize.c | 223 ++++++++++++++++-------- src/backend/optimizer/path/indxpath.c | 86 +++++---- src/backend/optimizer/path/joinpath.c | 16 +- src/backend/optimizer/path/orindxpath.c | 6 +- src/backend/optimizer/plan/planagg.c | 4 +- src/backend/optimizer/util/pathnode.c | 21 ++- src/backend/utils/adt/selfuncs.c | 108 ++++++++---- src/include/catalog/pg_proc.h | 10 +- src/include/nodes/relation.h | 5 +- src/include/optimizer/cost.h | 10 +- src/include/optimizer/pathnode.h | 4 +- src/include/optimizer/paths.h | 7 +- src/include/pg_config_manual.h | 4 +- 14 files changed, 334 insertions(+), 196 deletions(-) diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml index 4bf14ba7e6..a001cd4e33 100644 --- a/doc/src/sgml/indexam.sgml +++ b/doc/src/sgml/indexam.sgml @@ -1,4 +1,4 @@ - + Index Access Method Interface Definition @@ -344,6 +344,7 @@ void amcostestimate (PlannerInfo *root, IndexOptInfo *index, List *indexQuals, + RelOptInfo *outer_rel, Cost *indexStartupCost, Cost *indexTotalCost, Selectivity *indexSelectivity, @@ -681,6 +682,7 @@ void amcostestimate (PlannerInfo *root, IndexOptInfo *index, List *indexQuals, + RelOptInfo *outer_rel, Cost *indexStartupCost, Cost *indexTotalCost, Selectivity *indexSelectivity, @@ -718,6 +720,20 @@ amcostestimate (PlannerInfo *root, + + + outer_rel + + + If the index is being considered for use in a join inner indexscan, + the planner's information about the outer side of the join. Otherwise + NULL. When non-NULL, some of the qual clauses will be join clauses + with this rel rather than being simple restriction clauses. Also, + the cost estimator should expect that the index scan will be repeated + for each row of the outer rel. + + + @@ -808,6 +824,11 @@ amcostestimate (PlannerInfo *root, table. + + In the join case, the returned numbers should be averages expected for + any one scan of the index. 
+ + Cost Estimation @@ -859,6 +880,9 @@ amcostestimate (PlannerInfo *root, *indexTotalCost = seq_page_cost * numIndexPages + (cpu_index_tuple_cost + index_qual_cost.per_tuple) * numIndexTuples; + + However, the above does not account for amortization of index reads + across repeated index scans in the join case. diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 965574c579..fa8d06707b 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -54,7 +54,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.157 2006/06/05 20:56:33 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.158 2006/06/06 17:59:57 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -133,7 +133,7 @@ clamp_row_est(double nrows) * better and to avoid possible divide-by-zero when interpolating costs. * Make it an integer, too. */ - if (nrows < 1.0) + if (nrows <= 1.0) nrows = 1.0; else nrows = rint(nrows); @@ -181,8 +181,10 @@ cost_seqscan(Path *path, PlannerInfo *root, * * 'index' is the index to be used * 'indexQuals' is the list of applicable qual clauses (implicit AND semantics) - * 'is_injoin' is T if we are considering using the index scan as the inside - * of a nestloop join (hence, some of the indexQuals are join clauses) + * 'outer_rel' is the outer relation when we are considering using the index + * scan as the inside of a nestloop join (hence, some of the indexQuals + * are join clauses, and we should expect repeated scans of the index); + * NULL for a plain index scan * * cost_index() takes an IndexPath not just a Path, because it sets a few * additional fields of the IndexPath besides startup_cost and total_cost. @@ -200,7 +202,7 @@ void cost_index(IndexPath *path, PlannerInfo *root, IndexOptInfo *index, List *indexQuals, - bool is_injoin) + RelOptInfo *outer_rel) { RelOptInfo *baserel = index->rel; Cost startup_cost = 0; @@ -215,8 +217,6 @@ cost_index(IndexPath *path, PlannerInfo *root, Cost cpu_per_tuple; double tuples_fetched; double pages_fetched; - double T, - b; /* Should only be applied to base relations */ Assert(IsA(baserel, RelOptInfo) && @@ -233,10 +233,11 @@ cost_index(IndexPath *path, PlannerInfo *root, * the fraction of main-table tuples we will have to retrieve) and its * correlation to the main-table tuple order. */ - OidFunctionCall7(index->amcostestimate, + OidFunctionCall8(index->amcostestimate, PointerGetDatum(root), PointerGetDatum(index), PointerGetDatum(indexQuals), + PointerGetDatum(outer_rel), PointerGetDatum(&indexStartupCost), PointerGetDatum(&indexTotalCost), PointerGetDatum(&indexSelectivity), @@ -254,86 +255,75 @@ cost_index(IndexPath *path, PlannerInfo *root, startup_cost += indexStartupCost; run_cost += indexTotalCost - indexStartupCost; + /* estimate number of main-table tuples fetched */ + tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples); + /*---------- - * Estimate number of main-table tuples and pages fetched. + * Estimate number of main-table pages fetched, and compute I/O cost. * * When the index ordering is uncorrelated with the table ordering, - * we use an approximation proposed by Mackert and Lohman, "Index Scans - * Using a Finite LRU Buffer: A Validated I/O Model", ACM Transactions - * on Database Systems, Vol. 14, No. 3, September 1989, Pages 401-424. 
- * The Mackert and Lohman approximation is that the number of pages - * fetched is - * PF = - * min(2TNs/(2T+Ns), T) when T <= b - * 2TNs/(2T+Ns) when T > b and Ns <= 2Tb/(2T-b) - * b + (Ns - 2Tb/(2T-b))*(T-b)/T when T > b and Ns > 2Tb/(2T-b) - * where - * T = # pages in table - * N = # tuples in table - * s = selectivity = fraction of table to be scanned - * b = # buffer pages available (we include kernel space here) + * we use an approximation proposed by Mackert and Lohman (see + * index_pages_fetched() for details) to compute the number of pages + * fetched, and then charge random_page_cost per page fetched. * * When the index ordering is exactly correlated with the table ordering * (just after a CLUSTER, for example), the number of pages fetched should - * be just sT. What's more, these will be sequential fetches, not the - * random fetches that occur in the uncorrelated case. So, depending on - * the extent of correlation, we should estimate the actual I/O cost - * somewhere between s * T * seq_page_cost and PF * random_page_cost. - * We currently interpolate linearly between these two endpoints based on - * the correlation squared (XXX is that appropriate?). - * - * In any case the number of tuples fetched is Ns. + * be exactly selectivity * table_size. What's more, all but the first + * will be sequential fetches, not the random fetches that occur in the + * uncorrelated case. So if the number of pages is more than 1, we + * ought to charge + * random_page_cost + (pages_fetched - 1) * seq_page_cost + * For partially-correlated indexes, we ought to charge somewhere between + * these two estimates. We currently interpolate linearly between the + * estimates based on the correlation squared (XXX is that appropriate?). *---------- */ + if (outer_rel != NULL && outer_rel->rows > 1) + { + /* + * For repeated indexscans, scale up the number of tuples fetched + * in the Mackert and Lohman formula by the number of scans, so + * that we estimate the number of pages fetched by all the scans. + * Then pro-rate the costs for one scan. In this case we assume + * all the fetches are random accesses. XXX it'd be good to + * include correlation in this model, but it's not clear how to do + * that without double-counting cache effects. + */ + double num_scans = outer_rel->rows; - tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples); - - /* This part is the Mackert and Lohman formula */ + pages_fetched = index_pages_fetched(tuples_fetched * num_scans, + baserel->pages, + index->pages); - T = (baserel->pages > 1) ? (double) baserel->pages : 1.0; - b = (effective_cache_size > 1) ? effective_cache_size : 1.0; - - if (T <= b) - { - pages_fetched = - (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched); - if (pages_fetched >= T) - pages_fetched = T; - else - pages_fetched = ceil(pages_fetched); + run_cost += (pages_fetched * random_page_cost) / num_scans; } else { - double lim; + /* + * Normal case: apply the Mackert and Lohman formula, and then + * interpolate between that and the correlation-derived result. 
+ */ + pages_fetched = index_pages_fetched(tuples_fetched, + baserel->pages, + index->pages); - lim = (2.0 * T * b) / (2.0 * T - b); - if (tuples_fetched <= lim) - { - pages_fetched = - (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched); - } - else - { - pages_fetched = - b + (tuples_fetched - lim) * (T - b) / T; - } - pages_fetched = ceil(pages_fetched); - } + /* max_IO_cost is for the perfectly uncorrelated case (csquared=0) */ + max_IO_cost = pages_fetched * random_page_cost; - /* - * min_IO_cost corresponds to the perfectly correlated case (csquared=1), - * max_IO_cost to the perfectly uncorrelated case (csquared=0). - */ - min_IO_cost = ceil(indexSelectivity * T) * seq_page_cost; - max_IO_cost = pages_fetched * random_page_cost; + /* min_IO_cost is for the perfectly correlated case (csquared=1) */ + pages_fetched = ceil(indexSelectivity * (double) baserel->pages); + min_IO_cost = random_page_cost; + if (pages_fetched > 1) + min_IO_cost += (pages_fetched - 1) * seq_page_cost; - /* - * Now interpolate based on estimated index order correlation to get total - * disk I/O cost for main table accesses. - */ - csquared = indexCorrelation * indexCorrelation; + /* + * Now interpolate based on estimated index order correlation to get + * total disk I/O cost for main table accesses. + */ + csquared = indexCorrelation * indexCorrelation; - run_cost += max_IO_cost + csquared * (min_IO_cost - max_IO_cost); + run_cost += max_IO_cost + csquared * (min_IO_cost - max_IO_cost); + } /* * Estimate CPU costs per tuple. @@ -348,7 +338,7 @@ cost_index(IndexPath *path, PlannerInfo *root, startup_cost += baserel->baserestrictcost.startup; cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost.per_tuple; - if (!is_injoin) + if (outer_rel == NULL) { QualCost index_qual_cost; @@ -363,6 +353,88 @@ cost_index(IndexPath *path, PlannerInfo *root, path->path.total_cost = startup_cost + run_cost; } +/* + * index_pages_fetched + * Estimate the number of pages actually fetched after accounting for + * cache effects. + * + * We use an approximation proposed by Mackert and Lohman, "Index Scans + * Using a Finite LRU Buffer: A Validated I/O Model", ACM Transactions + * on Database Systems, Vol. 14, No. 3, September 1989, Pages 401-424. + * The Mackert and Lohman approximation is that the number of pages + * fetched is + * PF = + * min(2TNs/(2T+Ns), T) when T <= b + * 2TNs/(2T+Ns) when T > b and Ns <= 2Tb/(2T-b) + * b + (Ns - 2Tb/(2T-b))*(T-b)/T when T > b and Ns > 2Tb/(2T-b) + * where + * T = # pages in table + * N = # tuples in table + * s = selectivity = fraction of table to be scanned + * b = # buffer pages available (we include kernel space here) + * + * We assume that effective_cache_size is the total number of buffer pages + * available for both table and index, and pro-rate that space between the + * table and index. (Ideally other_pages should include all the other + * tables and indexes used by the query too; but we don't have a good way + * to get that number here.) + * + * The product Ns is the number of tuples fetched; we pass in that + * product rather than calculating it here. + * + * Caller is expected to have ensured that tuples_fetched is greater than zero + * and rounded to integer (see clamp_row_est). The result will likewise be + * greater than zero and integral. + */ +double +index_pages_fetched(double tuples_fetched, BlockNumber pages, + BlockNumber other_pages) +{ + double pages_fetched; + double T, + b; + + /* T is # pages in table, but don't allow it to be zero */ + T = (pages > 1) ? 
(double) pages : 1.0; + + /* b is pro-rated share of effective_cache_size */ + b = effective_cache_size * T / (T + (double) other_pages); + /* force it positive and integral */ + if (b <= 1.0) + b = 1.0; + else + b = ceil(b); + + /* This part is the Mackert and Lohman formula */ + if (T <= b) + { + pages_fetched = + (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched); + if (pages_fetched >= T) + pages_fetched = T; + else + pages_fetched = ceil(pages_fetched); + } + else + { + double lim; + + lim = (2.0 * T * b) / (2.0 * T - b); + if (tuples_fetched <= lim) + { + pages_fetched = + (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched); + } + else + { + pages_fetched = + b + (tuples_fetched - lim) * (T - b) / T; + } + pages_fetched = ceil(pages_fetched); + } + return pages_fetched; +} + /* * cost_bitmap_heap_scan * Determines and returns the cost of scanning a relation using a bitmap @@ -370,12 +442,13 @@ cost_index(IndexPath *path, PlannerInfo *root, * * 'baserel' is the relation to be scanned * 'bitmapqual' is a tree of IndexPaths, BitmapAndPaths, and BitmapOrPaths - * 'is_injoin' is T if we are considering using the scan as the inside - * of a nestloop join (hence, some of the quals are join clauses) + * + * Note: we take no explicit notice here of whether this is a join inner path. + * If it is, the component IndexPaths should have been costed accordingly. */ void cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel, - Path *bitmapqual, bool is_injoin) + Path *bitmapqual) { Cost startup_cost = 0; Cost run_cost = 0; diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index ac6de3d82a..eaf4e1cb33 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/path/indxpath.c,v 1.206 2006/05/18 19:56:46 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/path/indxpath.c,v 1.207 2006/06/06 17:59:57 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -47,13 +47,11 @@ static List *find_usable_indexes(PlannerInfo *root, RelOptInfo *rel, List *clauses, List *outer_clauses, - bool istoplevel, bool isjoininner, - Relids outer_relids, + bool istoplevel, RelOptInfo *outer_rel, SaOpControl saop_control); static List *find_saop_paths(PlannerInfo *root, RelOptInfo *rel, List *clauses, List *outer_clauses, - bool istoplevel, bool isjoininner, - Relids outer_relids); + bool istoplevel, RelOptInfo *outer_rel); static Path *choose_bitmap_and(PlannerInfo *root, RelOptInfo *rel, List *paths); static int bitmap_path_comparator(const void *a, const void *b); static Cost bitmap_and_cost_est(PlannerInfo *root, RelOptInfo *rel, List *paths); @@ -164,7 +162,7 @@ create_index_paths(PlannerInfo *root, RelOptInfo *rel) */ indexpaths = find_usable_indexes(root, rel, rel->baserestrictinfo, NIL, - true, false, NULL, SAOP_FORBID); + true, NULL, SAOP_FORBID); /* * We can submit them all to add_path. 
(This generates access paths for @@ -190,7 +188,7 @@ create_index_paths(PlannerInfo *root, RelOptInfo *rel) */ indexpaths = generate_bitmap_or_paths(root, rel, rel->baserestrictinfo, NIL, - false, NULL); + NULL); bitindexpaths = list_concat(bitindexpaths, indexpaths); /* @@ -199,7 +197,7 @@ create_index_paths(PlannerInfo *root, RelOptInfo *rel) */ indexpaths = find_saop_paths(root, rel, rel->baserestrictinfo, NIL, - true, false, NULL); + true, NULL); bitindexpaths = list_concat(bitindexpaths, indexpaths); /* @@ -247,10 +245,8 @@ create_index_paths(PlannerInfo *root, RelOptInfo *rel) * 'outer_clauses' is the list of additional upper-level clauses * 'istoplevel' is true if clauses are the rel's top-level restriction list * (outer_clauses must be NIL when this is true) - * 'isjoininner' is true if forming an inner indexscan (so some of the - * given clauses are join clauses) - * 'outer_relids' identifies the outer side of the join (pass NULL - * if not isjoininner) + * 'outer_rel' is the outer side of the join if forming an inner indexscan + * (so some of the given clauses are join clauses); NULL if not * 'saop_control' indicates whether ScalarArrayOpExpr clauses can be used * * Note: check_partial_indexes() must have been run previously. @@ -259,10 +255,10 @@ create_index_paths(PlannerInfo *root, RelOptInfo *rel) static List * find_usable_indexes(PlannerInfo *root, RelOptInfo *rel, List *clauses, List *outer_clauses, - bool istoplevel, bool isjoininner, - Relids outer_relids, + bool istoplevel, RelOptInfo *outer_rel, SaOpControl saop_control) { + Relids outer_relids = outer_rel ? outer_rel->relids : NULL; List *result = NIL; List *all_clauses = NIL; /* not computed till needed */ ListCell *ilist; @@ -349,7 +345,7 @@ find_usable_indexes(PlannerInfo *root, RelOptInfo *rel, * relevant unless we are at top level. */ index_is_ordered = OidIsValid(index->ordering[0]); - if (istoplevel && index_is_ordered && !isjoininner) + if (index_is_ordered && istoplevel && outer_rel == NULL) { index_pathkeys = build_index_pathkeys(root, index, ForwardScanDirection, @@ -374,7 +370,7 @@ find_usable_indexes(PlannerInfo *root, RelOptInfo *rel, index_is_ordered ? ForwardScanDirection : NoMovementScanDirection, - isjoininner); + outer_rel); result = lappend(result, ipath); } @@ -383,7 +379,7 @@ find_usable_indexes(PlannerInfo *root, RelOptInfo *rel, * that we failed to match, consider variant ways of achieving the * ordering. Again, this is only interesting at top level. 
*/ - if (istoplevel && index_is_ordered && !isjoininner && + if (index_is_ordered && istoplevel && outer_rel == NULL && root->query_pathkeys != NIL && pathkeys_useful_for_ordering(root, useful_pathkeys) == 0) { @@ -396,7 +392,7 @@ find_usable_indexes(PlannerInfo *root, RelOptInfo *rel, restrictclauses, root->query_pathkeys, scandir, - false); + outer_rel); result = lappend(result, ipath); } } @@ -417,8 +413,7 @@ find_usable_indexes(PlannerInfo *root, RelOptInfo *rel, static List * find_saop_paths(PlannerInfo *root, RelOptInfo *rel, List *clauses, List *outer_clauses, - bool istoplevel, bool isjoininner, - Relids outer_relids) + bool istoplevel, RelOptInfo *outer_rel) { bool have_saop = false; ListCell *l; @@ -443,8 +438,7 @@ find_saop_paths(PlannerInfo *root, RelOptInfo *rel, return find_usable_indexes(root, rel, clauses, outer_clauses, - istoplevel, isjoininner, - outer_relids, + istoplevel, outer_rel, SAOP_REQUIRE); } @@ -457,13 +451,13 @@ find_saop_paths(PlannerInfo *root, RelOptInfo *rel, * * outer_clauses is a list of additional clauses that can be assumed true * for the purpose of generating indexquals, but are not to be searched for - * ORs. (See find_usable_indexes() for motivation.) + * ORs. (See find_usable_indexes() for motivation.) outer_rel is the outer + * side when we are considering a nestloop inner indexpath. */ List * generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel, List *clauses, List *outer_clauses, - bool isjoininner, - Relids outer_relids) + RelOptInfo *outer_rel) { List *result = NIL; List *all_clauses; @@ -506,16 +500,14 @@ generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel, andargs, all_clauses, false, - isjoininner, - outer_relids, + outer_rel, SAOP_ALLOW); /* Recurse in case there are sub-ORs */ indlist = list_concat(indlist, generate_bitmap_or_paths(root, rel, andargs, all_clauses, - isjoininner, - outer_relids)); + outer_rel)); } else { @@ -525,8 +517,7 @@ generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel, list_make1(orarg), all_clauses, false, - isjoininner, - outer_relids, + outer_rel, SAOP_ALLOW); } @@ -724,7 +715,7 @@ bitmap_and_cost_est(PlannerInfo *root, RelOptInfo *rel, List *paths) cost_bitmap_and_node(&apath, root); /* Now we can do cost_bitmap_heap_scan */ - cost_bitmap_heap_scan(&bpath, root, rel, (Path *) &apath, false); + cost_bitmap_heap_scan(&bpath, root, rel, (Path *) &apath); return bpath.total_cost; } @@ -1338,7 +1329,7 @@ matches_any_index(RestrictInfo *rinfo, RelOptInfo *rel, Relids outer_relids) /* * best_inner_indexscan * Finds the best available inner indexscan for a nestloop join - * with the given rel on the inside and the given outer_relids outside. + * with the given rel on the inside and the given outer_rel outside. * May return NULL if there are no possible inner indexscans. 
* * We ignore ordering considerations (since a nestloop's inner scan's order @@ -1353,8 +1344,9 @@ matches_any_index(RestrictInfo *rinfo, RelOptInfo *rel, Relids outer_relids) */ Path * best_inner_indexscan(PlannerInfo *root, RelOptInfo *rel, - Relids outer_relids, JoinType jointype) + RelOptInfo *outer_rel, JoinType jointype) { + Relids outer_relids; Path *cheapest; bool isouterjoin; List *clause_list; @@ -1396,11 +1388,11 @@ best_inner_indexscan(PlannerInfo *root, RelOptInfo *rel, oldcontext = MemoryContextSwitchTo(GetMemoryChunkContext(rel)); /* - * Intersect the given outer_relids with index_outer_relids to find the + * Intersect the given outer relids with index_outer_relids to find the * set of outer relids actually relevant for this rel. If there are none, * again we can fail immediately. */ - outer_relids = bms_intersect(rel->index_outer_relids, outer_relids); + outer_relids = bms_intersect(rel->index_outer_relids, outer_rel->relids); if (bms_is_empty(outer_relids)) { bms_free(outer_relids); @@ -1413,6 +1405,13 @@ best_inner_indexscan(PlannerInfo *root, RelOptInfo *rel, * outerrels. (We include the isouterjoin status in the cache lookup key * for safety. In practice I suspect this is not necessary because it * should always be the same for a given innerrel.) + * + * NOTE: because we cache on outer_relids rather than outer_rel->relids, + * we will report the same path and hence path cost for joins with + * different sets of irrelevant rels on the outside. Now that cost_index + * is sensitive to outer_rel->rows, this is not really right. However + * the error is probably not large. Is it worth establishing a separate + * cache entry for each distinct outer_rel->relids set to get this right? */ foreach(l, rel->index_inner_paths) { @@ -1433,9 +1432,9 @@ best_inner_indexscan(PlannerInfo *root, RelOptInfo *rel, * that could be plain indexscans, ie, they don't require the join context * at all. This may seem redundant, but we need to include those scans in * the input given to choose_bitmap_and() to be sure we find optimal AND - * combinations of join and non-join scans. The worst case is that we - * might return a "best inner indexscan" that's really just a plain - * indexscan, causing some redundant effort in joinpath.c. + * combinations of join and non-join scans. Also, even if the "best + * inner indexscan" is just a plain indexscan, it will have a different + * cost estimate because of cache effects. */ clause_list = find_clauses_for_join(root, rel, outer_relids, isouterjoin); @@ -1445,8 +1444,7 @@ best_inner_indexscan(PlannerInfo *root, RelOptInfo *rel, */ indexpaths = find_usable_indexes(root, rel, clause_list, NIL, - false, true, - outer_relids, + false, outer_rel, SAOP_FORBID); /* @@ -1455,8 +1453,7 @@ best_inner_indexscan(PlannerInfo *root, RelOptInfo *rel, */ bitindexpaths = generate_bitmap_or_paths(root, rel, clause_list, NIL, - true, - outer_relids); + outer_rel); /* * Likewise, generate paths using ScalarArrayOpExpr clauses; these can't @@ -1465,8 +1462,7 @@ best_inner_indexscan(PlannerInfo *root, RelOptInfo *rel, bitindexpaths = list_concat(bitindexpaths, find_saop_paths(root, rel, clause_list, NIL, - false, true, - outer_relids)); + false, outer_rel)); /* * Include the regular index paths in bitindexpaths. 
diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c index 9ae79cc19e..30cc23b672 100644 --- a/src/backend/optimizer/path/joinpath.c +++ b/src/backend/optimizer/path/joinpath.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/path/joinpath.c,v 1.103 2006/03/05 15:58:28 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/path/joinpath.c,v 1.104 2006/06/06 17:59:57 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -36,7 +36,7 @@ static void hash_inner_and_outer(PlannerInfo *root, RelOptInfo *joinrel, RelOptInfo *outerrel, RelOptInfo *innerrel, List *restrictlist, JoinType jointype); static Path *best_appendrel_indexscan(PlannerInfo *root, RelOptInfo *rel, - Relids outer_relids, JoinType jointype); + RelOptInfo *outer_rel, JoinType jointype); static List *select_mergejoin_clauses(RelOptInfo *joinrel, RelOptInfo *outerrel, RelOptInfo *innerrel, @@ -401,12 +401,10 @@ match_unsorted_outer(PlannerInfo *root, { if (IsA(inner_cheapest_total, AppendPath)) bestinnerjoin = best_appendrel_indexscan(root, innerrel, - outerrel->relids, - jointype); + outerrel, jointype); else if (innerrel->rtekind == RTE_RELATION) bestinnerjoin = best_inner_indexscan(root, innerrel, - outerrel->relids, - jointype); + outerrel, jointype); } } @@ -791,13 +789,13 @@ hash_inner_and_outer(PlannerInfo *root, /* * best_appendrel_indexscan * Finds the best available set of inner indexscans for a nestloop join - * with the given append relation on the inside and the given outer_relids + * with the given append relation on the inside and the given outer_rel * outside. Returns an AppendPath comprising the best inner scans, or * NULL if there are no possible inner indexscans. */ static Path * best_appendrel_indexscan(PlannerInfo *root, RelOptInfo *rel, - Relids outer_relids, JoinType jointype) + RelOptInfo *outer_rel, JoinType jointype) { int parentRTindex = rel->relid; List *append_paths = NIL; @@ -832,7 +830,7 @@ best_appendrel_indexscan(PlannerInfo *root, RelOptInfo *rel, * Get the best innerjoin indexpath (if any) for this child rel. */ bestinnerjoin = best_inner_indexscan(root, childrel, - outer_relids, jointype); + outer_rel, jointype); /* * If no luck on an indexpath for this rel, we'll still consider diff --git a/src/backend/optimizer/path/orindxpath.c b/src/backend/optimizer/path/orindxpath.c index 252835c5fa..988145ca18 100644 --- a/src/backend/optimizer/path/orindxpath.c +++ b/src/backend/optimizer/path/orindxpath.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/path/orindxpath.c,v 1.78 2006/03/05 15:58:28 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/path/orindxpath.c,v 1.79 2006/06/06 17:59:57 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -107,7 +107,7 @@ create_or_index_quals(PlannerInfo *root, RelOptInfo *rel) * Use the generate_bitmap_or_paths() machinery to estimate the * value of each OR clause. We can use regular restriction * clauses along with the OR clause contents to generate - * indexquals. We pass outer_relids = NULL so that sub-clauses + * indexquals. We pass outer_rel = NULL so that sub-clauses * that are actually joins will be ignored. 
*/ List *orpaths; @@ -116,7 +116,7 @@ create_or_index_quals(PlannerInfo *root, RelOptInfo *rel) orpaths = generate_bitmap_or_paths(root, rel, list_make1(rinfo), rel->baserestrictinfo, - false, NULL); + NULL); /* Locate the cheapest OR path */ foreach(k, orpaths) diff --git a/src/backend/optimizer/plan/planagg.c b/src/backend/optimizer/plan/planagg.c index c2a1199d6c..c567f8b6b7 100644 --- a/src/backend/optimizer/plan/planagg.c +++ b/src/backend/optimizer/plan/planagg.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/plan/planagg.c,v 1.14 2006/04/28 20:57:49 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/plan/planagg.c,v 1.15 2006/06/06 17:59:57 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -373,7 +373,7 @@ build_minmax_path(PlannerInfo *root, RelOptInfo *rel, MinMaxAggInfo *info) restrictclauses, NIL, indexscandir, - false); + NULL); /* * Estimate actual cost of fetching just one row. diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 170f260bfb..f827007414 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.127 2006/03/05 15:58:31 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.128 2006/06/06 17:59:57 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -420,8 +420,8 @@ create_seqscan_path(PlannerInfo *root, RelOptInfo *rel) * 'indexscandir' is ForwardScanDirection or BackwardScanDirection * for an ordered index, or NoMovementScanDirection for * an unordered index. - * 'isjoininner' is TRUE if this is a join inner indexscan path. - * (pathkeys and indexscandir are ignored if so.) + * 'outer_rel' is the outer relation if this is a join inner indexscan path. + * (pathkeys and indexscandir are ignored if so.) NULL if not. * * Returns the new path node. */ @@ -431,7 +431,7 @@ create_index_path(PlannerInfo *root, List *clause_groups, List *pathkeys, ScanDirection indexscandir, - bool isjoininner) + RelOptInfo *outer_rel) { IndexPath *pathnode = makeNode(IndexPath); RelOptInfo *rel = index->rel; @@ -445,7 +445,7 @@ create_index_path(PlannerInfo *root, * don't really care what order it's scanned in. (We could expect the * caller to supply the correct values, but it's easier to force it here.) */ - if (isjoininner) + if (outer_rel != NULL) { pathkeys = NIL; indexscandir = NoMovementScanDirection; @@ -466,10 +466,10 @@ create_index_path(PlannerInfo *root, pathnode->indexclauses = allclauses; pathnode->indexquals = indexquals; - pathnode->isjoininner = isjoininner; + pathnode->isjoininner = (outer_rel != NULL); pathnode->indexscandir = indexscandir; - if (isjoininner) + if (outer_rel != NULL) { /* * We must compute the estimated number of output rows for the @@ -505,7 +505,7 @@ create_index_path(PlannerInfo *root, pathnode->rows = rel->rows; } - cost_index(pathnode, root, index, indexquals, isjoininner); + cost_index(pathnode, root, index, indexquals, outer_rel); return pathnode; } @@ -515,6 +515,9 @@ create_index_path(PlannerInfo *root, * Creates a path node for a bitmap scan. * * 'bitmapqual' is a tree of IndexPath, BitmapAndPath, and BitmapOrPath nodes. + * + * If this is a join inner indexscan path, the component IndexPaths should + * have been costed accordingly, and TRUE should be passed for isjoininner. 
*/ BitmapHeapPath * create_bitmap_heap_path(PlannerInfo *root, @@ -560,7 +563,7 @@ create_bitmap_heap_path(PlannerInfo *root, pathnode->rows = rel->rows; } - cost_bitmap_heap_scan(&pathnode->path, root, rel, bitmapqual, false); + cost_bitmap_heap_scan(&pathnode->path, root, rel, bitmapqual); return pathnode; } diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index c75ceef373..8bf517e420 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -15,7 +15,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.206 2006/06/05 02:49:58 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.207 2006/06/06 17:59:57 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -4465,6 +4465,7 @@ string_to_bytea_const(const char *str, size_t str_len) static void genericcostestimate(PlannerInfo *root, IndexOptInfo *index, List *indexQuals, + RelOptInfo *outer_rel, double numIndexTuples, Cost *indexStartupCost, Cost *indexTotalCost, @@ -4538,26 +4539,61 @@ genericcostestimate(PlannerInfo *root, /* * Estimate the number of index pages that will be retrieved. * - * For all currently-supported index types, the first page of the index is - * a metadata page, and we should figure on fetching that plus a pro-rated - * fraction of the remaining pages. + * We use the simplistic method of taking a pro-rata fraction of the + * total number of index pages. In effect, this counts only leaf pages + * and not any overhead such as index metapage or upper tree levels. + * In practice this seems a better approximation than charging for + * access to the upper levels, perhaps because those tend to stay in + * cache under load. */ - if (index->pages > 1 && index->tuples > 0) - { - numIndexPages = (numIndexTuples / index->tuples) * (index->pages - 1); - numIndexPages += 1; /* count the metapage too */ - numIndexPages = ceil(numIndexPages); - } + if (index->pages > 1 && index->tuples > 1) + numIndexPages = ceil(numIndexTuples * index->pages / index->tuples); else numIndexPages = 1.0; /* - * Compute the index access cost. + * Now compute the disk access costs. * - * Disk cost: our generic assumption is that the index pages will be read - * sequentially, so they cost seq_page_cost each, not random_page_cost. + * The above calculations are all per-index-scan. However, if we are + * in a nestloop inner scan, we can expect the scan to be repeated (with + * different search keys) for each row of the outer relation. This + * creates the potential for cache effects to reduce the number of + * disk page fetches needed. We want to estimate the average per-scan + * I/O cost in the presence of caching. + * + * We use the Mackert-Lohman formula (see costsize.c for details) to + * estimate the total number of page fetches that occur. While this + * wasn't what it was designed for, it seems a reasonable model anyway. + * Note that we are counting pages not tuples anymore, so we take + * N = T = index size, as if there were one "tuple" per page. 
*/ - *indexTotalCost = seq_page_cost * numIndexPages; + if (outer_rel != NULL && outer_rel->rows > 1) + { + double num_scans = outer_rel->rows; + double pages_fetched; + + /* total page fetches ignoring cache effects */ + pages_fetched = numIndexPages * num_scans; + + /* use Mackert and Lohman formula to adjust for cache effects */ + pages_fetched = index_pages_fetched(pages_fetched, + index->pages, + index->rel->pages); + + /* + * Now compute the total disk access cost, and then report a + * pro-rated share for one index scan. + */ + *indexTotalCost = (pages_fetched * random_page_cost) / num_scans; + } + else + { + /* + * For a single index scan, we just charge random_page_cost per page + * touched. + */ + *indexTotalCost = numIndexPages * random_page_cost; + } /* * CPU cost: any complex expressions in the indexquals will need to be @@ -4594,10 +4630,11 @@ btcostestimate(PG_FUNCTION_ARGS) PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1); List *indexQuals = (List *) PG_GETARG_POINTER(2); - Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(3); - Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4); - Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5); - double *indexCorrelation = (double *) PG_GETARG_POINTER(6); + RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(3); + Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(4); + Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(5); + Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6); + double *indexCorrelation = (double *) PG_GETARG_POINTER(7); Oid relid; AttrNumber colnum; HeapTuple tuple; @@ -4728,7 +4765,7 @@ btcostestimate(PG_FUNCTION_ARGS) numIndexTuples = btreeSelectivity * index->rel->tuples; } - genericcostestimate(root, index, indexQuals, numIndexTuples, + genericcostestimate(root, index, indexQuals, outer_rel, numIndexTuples, indexStartupCost, indexTotalCost, indexSelectivity, indexCorrelation); @@ -4800,12 +4837,13 @@ hashcostestimate(PG_FUNCTION_ARGS) PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1); List *indexQuals = (List *) PG_GETARG_POINTER(2); - Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(3); - Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4); - Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5); - double *indexCorrelation = (double *) PG_GETARG_POINTER(6); + RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(3); + Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(4); + Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(5); + Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6); + double *indexCorrelation = (double *) PG_GETARG_POINTER(7); - genericcostestimate(root, index, indexQuals, 0.0, + genericcostestimate(root, index, indexQuals, outer_rel, 0.0, indexStartupCost, indexTotalCost, indexSelectivity, indexCorrelation); @@ -4818,12 +4856,13 @@ gistcostestimate(PG_FUNCTION_ARGS) PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1); List *indexQuals = (List *) PG_GETARG_POINTER(2); - Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(3); - Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4); - Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5); - double *indexCorrelation = (double *) PG_GETARG_POINTER(6); + RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(3); + Cost *indexStartupCost = (Cost *) 
PG_GETARG_POINTER(4); + Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(5); + Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6); + double *indexCorrelation = (double *) PG_GETARG_POINTER(7); - genericcostestimate(root, index, indexQuals, 0.0, + genericcostestimate(root, index, indexQuals, outer_rel, 0.0, indexStartupCost, indexTotalCost, indexSelectivity, indexCorrelation); @@ -4836,12 +4875,13 @@ gincostestimate(PG_FUNCTION_ARGS) PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1); List *indexQuals = (List *) PG_GETARG_POINTER(2); - Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(3); - Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4); - Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5); - double *indexCorrelation = (double *) PG_GETARG_POINTER(6); + RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(3); + Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(4); + Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(5); + Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6); + double *indexCorrelation = (double *) PG_GETARG_POINTER(7); - genericcostestimate(root, index, indexQuals, 0.0, + genericcostestimate(root, index, indexQuals, outer_rel, 0.0, indexStartupCost, indexTotalCost, indexSelectivity, indexCorrelation); diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index c101a3c654..e0aa6ec3f3 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/catalog/pg_proc.h,v 1.412 2006/05/19 19:08:26 alvherre Exp $ + * $PostgreSQL: pgsql/src/include/catalog/pg_proc.h,v 1.413 2006/06/06 17:59:57 tgl Exp $ * * NOTES * The script catalog/genbki.sh reads this file and generates .bki @@ -678,7 +678,7 @@ DATA(insert OID = 332 ( btbulkdelete PGNSP PGUID 12 f f t f v 4 2281 "2281 2 DESCR("btree(internal)"); DATA(insert OID = 972 ( btvacuumcleanup PGNSP PGUID 12 f f t f v 2 2281 "2281 2281" _null_ _null_ _null_ btvacuumcleanup - _null_ )); DESCR("btree(internal)"); -DATA(insert OID = 1268 ( btcostestimate PGNSP PGUID 12 f f t f v 7 2278 "2281 2281 2281 2281 2281 2281 2281" _null_ _null_ _null_ btcostestimate - _null_ )); +DATA(insert OID = 1268 ( btcostestimate PGNSP PGUID 12 f f t f v 8 2278 "2281 2281 2281 2281 2281 2281 2281 2281" _null_ _null_ _null_ btcostestimate - _null_ )); DESCR("btree(internal)"); DATA(insert OID = 339 ( poly_same PGNSP PGUID 12 f f t f i 2 16 "604 604" _null_ _null_ _null_ poly_same - _null_ )); @@ -795,7 +795,7 @@ DATA(insert OID = 442 ( hashbulkdelete PGNSP PGUID 12 f f t f v 4 2281 "2281 DESCR("hash(internal)"); DATA(insert OID = 425 ( hashvacuumcleanup PGNSP PGUID 12 f f t f v 2 2281 "2281 2281" _null_ _null_ _null_ hashvacuumcleanup - _null_ )); DESCR("hash(internal)"); -DATA(insert OID = 438 ( hashcostestimate PGNSP PGUID 12 f f t f v 7 2278 "2281 2281 2281 2281 2281 2281 2281" _null_ _null_ _null_ hashcostestimate - _null_ )); +DATA(insert OID = 438 ( hashcostestimate PGNSP PGUID 12 f f t f v 8 2278 "2281 2281 2281 2281 2281 2281 2281 2281" _null_ _null_ _null_ hashcostestimate - _null_ )); DESCR("hash(internal)"); DATA(insert OID = 449 ( hashint2 PGNSP PGUID 12 f f t f i 1 23 "21" _null_ _null_ _null_ hashint2 - _null_ )); @@ -1061,7 +1061,7 @@ DATA(insert OID = 776 ( gistbulkdelete PGNSP PGUID 12 f 
f t f v 4 2281 "2281 DESCR("gist(internal)"); DATA(insert OID = 2561 ( gistvacuumcleanup PGNSP PGUID 12 f f t f v 2 2281 "2281 2281" _null_ _null_ _null_ gistvacuumcleanup - _null_ )); DESCR("gist(internal)"); -DATA(insert OID = 772 ( gistcostestimate PGNSP PGUID 12 f f t f v 7 2278 "2281 2281 2281 2281 2281 2281 2281" _null_ _null_ _null_ gistcostestimate - _null_ )); +DATA(insert OID = 772 ( gistcostestimate PGNSP PGUID 12 f f t f v 8 2278 "2281 2281 2281 2281 2281 2281 2281 2281" _null_ _null_ _null_ gistcostestimate - _null_ )); DESCR("gist(internal)"); DATA(insert OID = 784 ( tintervaleq PGNSP PGUID 12 f f t f i 2 16 "704 704" _null_ _null_ _null_ tintervaleq - _null_ )); @@ -3847,7 +3847,7 @@ DATA(insert OID = 2739 ( ginbulkdelete PGNSP PGUID 12 f f t f v 4 2281 "2281 DESCR("gin(internal)"); DATA(insert OID = 2740 ( ginvacuumcleanup PGNSP PGUID 12 f f t f v 2 2281 "2281 2281" _null_ _null_ _null_ ginvacuumcleanup - _null_ )); DESCR("gin(internal)"); -DATA(insert OID = 2741 ( gincostestimate PGNSP PGUID 12 f f t f v 7 2278 "2281 2281 2281 2281 2281 2281 2281" _null_ _null_ _null_ gincostestimate - _null_ )); +DATA(insert OID = 2741 ( gincostestimate PGNSP PGUID 12 f f t f v 8 2278 "2281 2281 2281 2281 2281 2281 2281 2281" _null_ _null_ _null_ gincostestimate - _null_ )); DESCR("gin(internal)"); /* GIN array support */ diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index aa2ebc7a0e..1f971118f5 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.124 2006/03/05 15:58:57 momjian Exp $ + * $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.125 2006/06/06 17:59:58 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -407,6 +407,9 @@ typedef struct Path * * 'isjoininner' is TRUE if the path is a nestloop inner scan (that is, * some of the index conditions are join rather than restriction clauses). + * Note that the path costs will be calculated differently from a plain + * indexscan in this case, and in addition there's a special 'rows' value + * different from the parent RelOptInfo's (see below). 
* * 'indexscandir' is one of: * ForwardScanDirection: forward scan of an ordered index diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index ae3f0171fa..03aae9660a 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/optimizer/cost.h,v 1.75 2006/06/05 03:03:42 tgl Exp $ + * $PostgreSQL: pgsql/src/include/optimizer/cost.h,v 1.76 2006/06/06 17:59:58 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -41,7 +41,7 @@ extern DLLIMPORT double random_page_cost; extern DLLIMPORT double cpu_tuple_cost; extern DLLIMPORT double cpu_index_tuple_cost; extern DLLIMPORT double cpu_operator_cost; -extern double effective_cache_size; +extern DLLIMPORT double effective_cache_size; extern Cost disable_cost; extern bool enable_seqscan; extern bool enable_indexscan; @@ -55,11 +55,13 @@ extern bool enable_hashjoin; extern bool constraint_exclusion; extern double clamp_row_est(double nrows); +extern double index_pages_fetched(double tuples_fetched, BlockNumber pages, + BlockNumber other_pages); extern void cost_seqscan(Path *path, PlannerInfo *root, RelOptInfo *baserel); extern void cost_index(IndexPath *path, PlannerInfo *root, IndexOptInfo *index, - List *indexQuals, bool is_injoin); + List *indexQuals, RelOptInfo *outer_rel); extern void cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel, - Path *bitmapqual, bool is_injoin); + Path *bitmapqual); extern void cost_bitmap_and_node(BitmapAndPath *path, PlannerInfo *root); extern void cost_bitmap_or_node(BitmapOrPath *path, PlannerInfo *root); extern void cost_bitmap_tree_node(Path *path, Cost *cost, Selectivity *selec); diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index cb7c1703c8..f728489be5 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/optimizer/pathnode.h,v 1.67 2006/03/05 15:58:57 momjian Exp $ + * $PostgreSQL: pgsql/src/include/optimizer/pathnode.h,v 1.68 2006/06/06 17:59:58 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -33,7 +33,7 @@ extern IndexPath *create_index_path(PlannerInfo *root, List *clause_groups, List *pathkeys, ScanDirection indexscandir, - bool isjoininner); + RelOptInfo *outer_rel); extern BitmapHeapPath *create_bitmap_heap_path(PlannerInfo *root, RelOptInfo *rel, Path *bitmapqual, diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index 9afb07497e..40eb9b4d7d 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/optimizer/paths.h,v 1.92 2006/03/05 15:58:57 momjian Exp $ + * $PostgreSQL: pgsql/src/include/optimizer/paths.h,v 1.93 2006/06/06 17:59:58 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -44,10 +44,9 @@ typedef enum extern void create_index_paths(PlannerInfo *root, RelOptInfo *rel); extern List *generate_bitmap_or_paths(PlannerInfo 
*root, RelOptInfo *rel, List *clauses, List *outer_clauses, - bool isjoininner, - Relids outer_relids); + RelOptInfo *outer_rel); extern Path *best_inner_indexscan(PlannerInfo *root, RelOptInfo *rel, - Relids outer_relids, JoinType jointype); + RelOptInfo *outer_rel, JoinType jointype); extern List *group_clauses_by_indexkey(IndexOptInfo *index, List *clauses, List *outer_clauses, Relids outer_relids, diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h index e41b11e831..67bcdd730e 100644 --- a/src/include/pg_config_manual.h +++ b/src/include/pg_config_manual.h @@ -6,7 +6,7 @@ * for developers. If you edit any of these, be sure to do a *full* * rebuild (and an initdb if noted). * - * $PostgreSQL: pgsql/src/include/pg_config_manual.h,v 1.21 2006/04/03 23:35:05 tgl Exp $ + * $PostgreSQL: pgsql/src/include/pg_config_manual.h,v 1.22 2006/06/06 17:59:58 tgl Exp $ *------------------------------------------------------------------------ */ @@ -65,7 +65,7 @@ /* * Maximum number of arguments to a function. * - * The minimum value is 8 (index creation uses 8-argument functions). + * The minimum value is 8 (index cost estimation uses 8-argument functions). * The maximum possible value is around 600 (limited by index tuple size in * pg_proc's index; BLCKSZ larger than 8K would allow more). Values larger * than needed will waste memory and processing time, but do not directly -- 2.40.0
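
To make the effect of this patch concrete, below is a minimal standalone C
sketch of the Mackert-Lohman estimate as documented in the new
index_pages_fetched() comment. The function name, the effective_cache_size
setting, and the numbers in main() are illustrative assumptions, not values
taken from the patch.

/*
 * Standalone sketch of the Mackert-Lohman page-fetch estimate, following
 * the formula documented in index_pages_fetched() above.  T, b, and the
 * branch structure mirror the patch; effective_cache_size is an assumed
 * setting, and main() is an illustrative harness only.
 */
#include <math.h>
#include <stdio.h>

static double effective_cache_size = 16384;	/* buffer pages (assumed) */

static double
index_pages_fetched_sketch(double tuples_fetched, double pages,
						   double other_pages)
{
	double		T = (pages > 1) ? pages : 1.0;	/* # pages in table */
	double		b;				/* pro-rated share of effective_cache_size */
	double		pages_fetched;

	b = effective_cache_size * T / (T + other_pages);
	b = (b <= 1.0) ? 1.0 : ceil(b);

	if (T <= b)
	{
		/* table fits in cache: 2TNs/(2T+Ns), capped at T */
		pages_fetched = (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
		pages_fetched = (pages_fetched >= T) ? T : ceil(pages_fetched);
	}
	else
	{
		double		lim = (2.0 * T * b) / (2.0 * T - b);

		if (tuples_fetched <= lim)
			pages_fetched = (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
		else
			pages_fetched = b + (tuples_fetched - lim) * (T - b) / T;
		pages_fetched = ceil(pages_fetched);
	}
	return pages_fetched;
}

int
main(void)
{
	/*
	 * 10000 nestloop repetitions, each fetching 10 tuples from a
	 * 10000-page table with a 100-page index (hypothetical numbers).
	 */
	double		per_scan = index_pages_fetched_sketch(10.0, 10000.0, 100.0);
	double		amortized = index_pages_fetched_sketch(10.0 * 10000.0,
													   10000.0, 100.0);

	printf("costed independently: %.0f pages/scan, %.0f total\n",
		   per_scan, per_scan * 10000.0);
	printf("costed across all scans: %.0f pages total, %.2f/scan\n",
		   amortized, amortized / 10000.0);
	return 0;
}

With these assumed numbers, costing each repetition independently charges 10
page fetches per scan (100000 in total), while applying the formula across
all 10000 repetitions caps the total at the 10000-page table size, roughly
one page per scan. That is the cheapening of repeated inner indexscans the
commit message describes.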
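
The other half of the cost_index() rework is the interpolation between the
perfectly-correlated and perfectly-uncorrelated I/O estimates in the non-join
branch. A compressed sketch of that arithmetic follows; the cost settings are
assumed defaults and the helper name is hypothetical, but the formula tracks
the patch.

/*
 * Main-table I/O cost as computed in the reworked cost_index() for the
 * non-join case.  pages_fetched is the Mackert-Lohman estimate; sel_pages
 * is ceil(indexSelectivity * baserel->pages).  Cost settings are assumed.
 */
static const double random_page_cost = 4.0;
static const double seq_page_cost = 1.0;

static double
main_table_io_cost(double pages_fetched, double sel_pages,
				   double indexCorrelation)
{
	/* perfectly uncorrelated (csquared = 0): every fetch is random */
	double		max_IO_cost = pages_fetched * random_page_cost;

	/* perfectly correlated (csquared = 1): one random fetch, rest sequential */
	double		min_IO_cost = random_page_cost;
	double		csquared;

	if (sel_pages > 1)
		min_IO_cost += (sel_pages - 1) * seq_page_cost;

	/* interpolate linearly on the squared correlation, as the patch does */
	csquared = indexCorrelation * indexCorrelation;
	return max_IO_cost + csquared * (min_IO_cost - max_IO_cost);
}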
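
genericcostestimate() applies the same amortization to the index's own leaf
pages, taking N = T = index size as if there were one "tuple" per page so the
Mackert-Lohman formula can be reused. The sketch below is a simplification of
that branch; it reuses index_pages_fetched_sketch() and random_page_cost from
the sketches above, and num_scans stands in for outer_rel->rows.

/*
 * Per-scan index access cost, following the new genericcostestimate()
 * logic: for repeated scans, estimate total leaf-page fetches with the
 * Mackert-Lohman formula and pro-rate over the scans; for a single scan,
 * charge random_page_cost per leaf page, with nothing for the metapage
 * or upper tree levels.
 */
static double
index_access_cost(double numIndexPages,	/* leaf pages fetched per scan */
				  double num_scans,		/* outer_rel->rows, or 1 */
				  double index_pages,	/* index->pages */
				  double table_pages)	/* index->rel->pages */
{
	if (num_scans > 1)
	{
		double		fetched;

		/* total page fetches ignoring cache, then adjust for caching */
		fetched = index_pages_fetched_sketch(numIndexPages * num_scans,
											 index_pages, table_pages);
		return (fetched * random_page_cost) / num_scans;
	}
	return numIndexPages * random_page_cost;
}

As in the patch, charging only for leaf pages reflects the observation that
the metapage and upper tree levels tend to stay in cache under load.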