From 2b74303637edc09cf692fbfab3fd93a5e47ccabf Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Wed, 16 Aug 2017 15:37:14 -0400
Subject: [PATCH] Make the planner assume that the entries in a VALUES list are
 distinct.

Previously, if we had to estimate the number of distinct values in a
VALUES column, we fell back on the default behavior used whenever we lack
statistics, which effectively is that there are Min(# of entries, 200)
distinct values.  This can be very badly off with a large VALUES list,
as noted by Jeff Janes.

We could consider actually running an ANALYZE-like scan on the VALUES,
but that seems unduly expensive, and anyway it could not deliver reliable
info if the entries are not all constants.  What seems like a better choice
is to assume that the values are all distinct.  This will sometimes be just
as wrong as the old code, but it seems more likely to be more nearly right
in many common cases.  Also, it is more consistent with what happens in
some related cases; for example, WHERE x = ANY(ARRAY[1,2,3,...,n]) and
WHERE x = ANY(VALUES (1),(2),(3),...,(n)) are now estimated similarly.

This was discussed some time ago, but the consensus was that it'd be better
to slip it in at the start of a development cycle, not near the end.
(It should've gone into v10, really, but I forgot about it.)

Discussion: https://postgr.es/m/CAMkU=1xHkyPa8VQgGcCNg3RMFFvVxUdOpus1gKcFuvVi0w6Acg@mail.gmail.com
---
 src/backend/utils/adt/selfuncs.c | 11 +++++++++++
 src/include/nodes/relation.h     |  4 ++--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index a7a06146a0..23e5526a8e 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -5009,6 +5009,17 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
 		 */
 		stadistinct = 2.0;
 	}
+	else if (vardata->rel && vardata->rel->rtekind == RTE_VALUES)
+	{
+		/*
+		 * If the Var represents a column of a VALUES RTE, assume it's unique.
+		 * This could of course be very wrong, but it should tend to be true
+		 * in well-written queries.  We could consider examining the VALUES'
+		 * contents to get some real statistics; but that only works if the
+		 * entries are all constants, and it would be pretty expensive anyway.
+		 */
+		stadistinct = -1.0;		/* unique (and all non null) */
+	}
 	else
 	{
 		/*
diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h
index be2028867a..3ccc9d1b03 100644
--- a/src/include/nodes/relation.h
+++ b/src/include/nodes/relation.h
@@ -407,7 +407,7 @@ typedef struct PlannerInfo
  *
  *		relid - RTE index (this is redundant with the relids field, but
  *				is provided for convenience of access)
- *		rtekind - distinguishes plain relation, subquery, or function RTE
+ *		rtekind - copy of RTE's rtekind field
  *		min_attr, max_attr - range of valid AttrNumbers for rel
  *		attr_needed - array of bitmapsets indicating the highest joinrel
  *				in which each attribute is needed; if bit 0 is set then
@@ -552,7 +552,7 @@ typedef struct RelOptInfo
 	/* information about a base rel (not set for join rels!) */
 	Index		relid;
 	Oid			reltablespace;	/* containing tablespace */
-	RTEKind		rtekind;		/* RELATION, SUBQUERY, or FUNCTION */
+	RTEKind		rtekind;		/* RELATION, SUBQUERY, FUNCTION, etc */
 	AttrNumber	min_attr;		/* smallest attrno of rel (often <0) */
 	AttrNumber	max_attr;		/* largest attrno of rel */
 	Relids	   *attr_needed;	/* array indexed [min_attr .. max_attr] */
-- 
2.40.0