From 9a9a143a980c3a4c9c825ce44abf57718f9f66cb Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Mon, 8 Jan 2007 16:09:22 +0000
Subject: [PATCH] Remove cost_hashjoin's very ancient hack to discourage
 (once, entirely forbid) hash joins with the estimated-larger relation on
 the inside.

There are several cases where doing that makes perfect sense, and in
cases where it doesn't, the regular cost computation really ought to be
able to figure that out.  Make some marginal tweaks in said computation
to try to get results approximating reality a bit better.  Per an
example from Shane Ambler.

Also, fix an oversight in the original patch to add seq_page_cost: the
costs of spilling a hash join to disk should be scaled by seq_page_cost.
---
 src/backend/optimizer/path/costsize.c | 46 +++++++++------------------
 1 file changed, 15 insertions(+), 31 deletions(-)

diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 1d04876716..69d1a890e4 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -54,7 +54,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.172 2007/01/05 22:19:31 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.173 2007/01/08 16:09:22 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1498,10 +1498,6 @@ cost_hashjoin(HashPath *path, PlannerInfo *root)
 	double		hashjointuples;
 	double		outer_path_rows = PATH_ROWS(outer_path);
 	double		inner_path_rows = PATH_ROWS(inner_path);
-	double		outerbytes = relation_byte_size(outer_path_rows,
-												outer_path->parent->width);
-	double		innerbytes = relation_byte_size(inner_path_rows,
-												inner_path->parent->width);
 	int			num_hashclauses = list_length(hashclauses);
 	int			numbuckets;
 	int			numbatches;
@@ -1538,13 +1534,16 @@ cost_hashjoin(HashPath *path, PlannerInfo *root)
 
 	/*
 	 * Cost of computing hash function: must do it once per input tuple. We
-	 * charge one cpu_operator_cost for each column's hash function.
+	 * charge one cpu_operator_cost for each column's hash function.  Also,
+	 * tack on one cpu_tuple_cost per inner row, to model the costs of
+	 * inserting the row into the hashtable.
 	 *
 	 * XXX when a hashclause is more complex than a single operator, we really
 	 * should charge the extra eval costs of the left or right side, as
 	 * appropriate, here.  This seems more work than it's worth at the moment.
 	 */
-	startup_cost += cpu_operator_cost * num_hashclauses * inner_path_rows;
+	startup_cost += (cpu_operator_cost * num_hashclauses + cpu_tuple_cost)
+		* inner_path_rows;
 	run_cost += cpu_operator_cost * num_hashclauses * outer_path_rows;
 
 	/* Get hash table size that executor would use for inner relation */
@@ -1624,8 +1623,8 @@ cost_hashjoin(HashPath *path, PlannerInfo *root)
 	/*
 	 * If inner relation is too big then we will need to "batch" the join,
 	 * which implies writing and reading most of the tuples to disk an extra
-	 * time.  Charge one cost unit per page of I/O (correct since it should be
-	 * nice and sequential...).  Writing the inner rel counts as startup cost,
+	 * time.  Charge seq_page_cost per page, since the I/O should be nice and
+	 * sequential.  Writing the inner rel counts as startup cost,
 	 * all the rest as run cost.
 	 */
 	if (numbatches > 1)
@@ -1635,8 +1634,8 @@ cost_hashjoin(HashPath *path, PlannerInfo *root)
 		double		innerpages = page_size(inner_path_rows,
 										   inner_path->parent->width);
 
-		startup_cost += innerpages;
-		run_cost += innerpages + 2 * outerpages;
+		startup_cost += seq_page_cost * innerpages;
+		run_cost += seq_page_cost * (innerpages + 2 * outerpages);
 	}
 
 	/* CPU costs */
@@ -1654,14 +1653,15 @@ cost_hashjoin(HashPath *path, PlannerInfo *root)
 	 * The number of tuple comparisons needed is the number of outer tuples
 	 * times the typical number of tuples in a hash bucket, which is the inner
 	 * relation size times its bucketsize fraction.  At each one, we need to
-	 * evaluate the hashjoin quals.  (Note: charging the full qual eval cost
-	 * at each tuple is pessimistic, since we don't evaluate the quals unless
-	 * the hash values match exactly.)
+	 * evaluate the hashjoin quals.  But actually, charging the full qual eval
+	 * cost at each tuple is pessimistic, since we don't evaluate the quals
+	 * unless the hash values match exactly.  For lack of a better idea, halve
+	 * the cost estimate to allow for that.
 	 */
 	startup_cost += hash_qual_cost.startup;
 	run_cost += hash_qual_cost.per_tuple *
 		outer_path_rows * clamp_row_est(inner_path_rows * innerbucketsize) *
-		joininfactor;
+		joininfactor * 0.5;
 
 	/*
 	 * For each tuple that gets through the hashjoin proper, we charge
@@ -1673,22 +1673,6 @@ cost_hashjoin(HashPath *path, PlannerInfo *root)
 	cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple;
 	run_cost += cpu_per_tuple * hashjointuples * joininfactor;
 
-	/*
-	 * Bias against putting larger relation on inside.  We don't want an
-	 * absolute prohibition, though, since larger relation might have better
-	 * bucketsize --- and we can't trust the size estimates unreservedly,
-	 * anyway.  Instead, inflate the run cost by the square root of the size
-	 * ratio.  (Why square root?  No real good reason, but it seems
-	 * reasonable...)
-	 *
-	 * Note: before 7.4 we implemented this by inflating startup cost; but if
-	 * there's a disable_cost component in the input paths' startup cost, that
-	 * unfairly penalizes the hash.  Probably it'd be better to keep track of
-	 * disable penalty separately from cost.
-	 */
-	if (innerbytes > outerbytes && outerbytes > 0)
-		run_cost *= sqrt(innerbytes / outerbytes);
-
 	path->jpath.path.startup_cost = startup_cost;
 	path->jpath.path.total_cost = startup_cost + run_cost;
 }
-- 
2.40.0
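Note: to see how the revised arithmetic fits together outside the diff context, here is a minimal standalone C sketch of the costing steps the commit message describes (hashing cost plus one cpu_tuple_cost per inner row, seq_page_cost-scaled batch I/O, and the halved qual-evaluation charge). It is not PostgreSQL's cost_hashjoin; the function name sketch_hashjoin_cost, its parameters, and the default cost constants are illustrative assumptions only, and details such as joininfactor and row-estimate clamping are omitted.

/*
 * Standalone sketch of the hash join cost arithmetic described above.
 * All names and constants are illustrative, not taken from the patch.
 */
#include <stdio.h>

static const double cpu_operator_cost = 0.0025;
static const double cpu_tuple_cost = 0.01;
static const double seq_page_cost = 1.0;

static void
sketch_hashjoin_cost(double outer_rows, double inner_rows,
					 int num_hashclauses, double qual_cost_per_tuple,
					 double inner_bucketsize, int numbatches,
					 double innerpages, double outerpages,
					 double *startup_cost, double *run_cost)
{
	*startup_cost = 0.0;
	*run_cost = 0.0;

	/* hash each inner row once, plus one cpu_tuple_cost to insert it */
	*startup_cost += (cpu_operator_cost * num_hashclauses + cpu_tuple_cost)
		* inner_rows;
	/* hash each outer row once while probing */
	*run_cost += cpu_operator_cost * num_hashclauses * outer_rows;

	/*
	 * If the join must be batched, the spill I/O is sequential, so charge
	 * seq_page_cost per page: writing the inner rel is startup cost, the
	 * re-read of the inner rel and write+read of the outer rel is run cost.
	 */
	if (numbatches > 1)
	{
		*startup_cost += seq_page_cost * innerpages;
		*run_cost += seq_page_cost * (innerpages + 2 * outerpages);
	}

	/*
	 * Qual evaluation: outer rows times the expected bucket size, halved
	 * because the quals only run when the hash values match exactly.
	 */
	*run_cost += qual_cost_per_tuple * outer_rows *
		(inner_rows * inner_bucketsize) * 0.5;
}

int
main(void)
{
	double		startup;
	double		run;

	/* 100k outer rows probing a 50k-row in-memory hash table */
	sketch_hashjoin_cost(100000.0, 50000.0, 1, 0.0025, 0.1,
						 1, 0.0, 0.0, &startup, &run);
	printf("startup=%.2f run=%.2f total=%.2f\n",
		   startup, run, startup + run);
	return 0;
}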