From 82e1ba7fd6cc9ac3fb1d9b819dc7295b268d3703 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 23 Sep 2015 18:26:49 -0400 Subject: [PATCH] Make ANALYZE compute basic statistics even for types with no "=" operator. Previously, ANALYZE simply ignored columns of datatypes that have neither a btree nor hash opclass (which means they have no recognized equality operator). Without a notion of equality, we can't identify most-common values nor estimate the number of distinct values. But we can still count nulls and compute the average physical column width, and those stats might be of value. Moreover there are some tools out there that don't work so well if rows are missing from pg_statistic. So let's add suitable logic for this case. While this is arguably a bug fix, it also has the potential to change query plans, and the gain seems not worth taking a risk of that in stable branches. So back-patch into 9.5 but not further. Oleksandr Shulgin, rewritten a bit by me. --- src/backend/commands/analyze.c | 118 +++++++++++++++++++++++++++++---- 1 file changed, 104 insertions(+), 14 deletions(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 861048f213..ddb68abf6b 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -1689,10 +1689,14 @@ typedef struct } CompareScalarsContext; -static void compute_minimal_stats(VacAttrStatsP stats, +static void compute_trivial_stats(VacAttrStatsP stats, AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows); +static void compute_distinct_stats(VacAttrStatsP stats, + AnalyzeAttrFetchFunc fetchfunc, + int samplerows, + double totalrows); static void compute_scalar_stats(VacAttrStatsP stats, AnalyzeAttrFetchFunc fetchfunc, int samplerows, @@ -1723,21 +1727,17 @@ std_typanalyze(VacAttrStats *stats) <opr, &eqopr, NULL, NULL); - /* If column has no "=" operator, we can't do much of anything */ - if (!OidIsValid(eqopr)) - return false; - /* Save the operator info for compute_stats routines */ mystats = (StdAnalyzeData *) palloc(sizeof(StdAnalyzeData)); mystats->eqopr = eqopr; - mystats->eqfunc = get_opcode(eqopr); + mystats->eqfunc = OidIsValid(eqopr) ? get_opcode(eqopr) : InvalidOid; mystats->ltopr = ltopr; stats->extra_data = mystats; /* * Determine which standard statistics algorithm to use */ - if (OidIsValid(ltopr)) + if (OidIsValid(eqopr) && OidIsValid(ltopr)) { /* Seems to be a scalar datatype */ stats->compute_stats = compute_scalar_stats; @@ -1762,10 +1762,17 @@ std_typanalyze(VacAttrStats *stats) */ stats->minrows = 300 * attr->attstattarget; } + else if (OidIsValid(eqopr)) + { + /* We can still recognize distinct values */ + stats->compute_stats = compute_distinct_stats; + /* Might as well use the same minrows as above */ + stats->minrows = 300 * attr->attstattarget; + } else { - /* Can't do much but the minimal stuff */ - stats->compute_stats = compute_minimal_stats; + /* Can't do much but the trivial stuff */ + stats->compute_stats = compute_trivial_stats; /* Might as well use the same minrows as above */ stats->minrows = 300 * attr->attstattarget; } @@ -1773,8 +1780,91 @@ std_typanalyze(VacAttrStats *stats) return true; } + +/* + * compute_trivial_stats() -- compute very basic column statistics + * + * We use this when we cannot find a hash "=" operator for the datatype. + * + * We determine the fraction of non-null rows and the average datum width. + */ +static void +compute_trivial_stats(VacAttrStatsP stats, + AnalyzeAttrFetchFunc fetchfunc, + int samplerows, + double totalrows) +{ + int i; + int null_cnt = 0; + int nonnull_cnt = 0; + double total_width = 0; + bool is_varlena = (!stats->attrtype->typbyval && + stats->attrtype->typlen == -1); + bool is_varwidth = (!stats->attrtype->typbyval && + stats->attrtype->typlen < 0); + + for (i = 0; i < samplerows; i++) + { + Datum value; + bool isnull; + + vacuum_delay_point(); + + value = fetchfunc(stats, i, &isnull); + + /* Check for null/nonnull */ + if (isnull) + { + null_cnt++; + continue; + } + nonnull_cnt++; + + /* + * If it's a variable-width field, add up widths for average width + * calculation. Note that if the value is toasted, we use the toasted + * width. We don't bother with this calculation if it's a fixed-width + * type. + */ + if (is_varlena) + { + total_width += VARSIZE_ANY(DatumGetPointer(value)); + } + else if (is_varwidth) + { + /* must be cstring */ + total_width += strlen(DatumGetCString(value)) + 1; + } + } + + /* We can only compute average width if we found some non-null values. */ + if (nonnull_cnt > 0) + { + stats->stats_valid = true; + /* Do the simple null-frac and width stats */ + stats->stanullfrac = (double) null_cnt / (double) samplerows; + if (is_varwidth) + stats->stawidth = total_width / (double) nonnull_cnt; + else + stats->stawidth = stats->attrtype->typlen; + stats->stadistinct = 0.0; /* "unknown" */ + } + else if (null_cnt > 0) + { + /* We found only nulls; assume the column is entirely null */ + stats->stats_valid = true; + stats->stanullfrac = 1.0; + if (is_varwidth) + stats->stawidth = 0; /* "unknown" */ + else + stats->stawidth = stats->attrtype->typlen; + stats->stadistinct = 0.0; /* "unknown" */ + } +} + + /* - * compute_minimal_stats() -- compute minimal column statistics + * compute_distinct_stats() -- compute column statistics including ndistinct * * We use this when we can find only an "=" operator for the datatype. * @@ -1789,10 +1879,10 @@ std_typanalyze(VacAttrStats *stats) * depend mainly on the length of the list we are willing to keep. */ static void -compute_minimal_stats(VacAttrStatsP stats, - AnalyzeAttrFetchFunc fetchfunc, - int samplerows, - double totalrows) +compute_distinct_stats(VacAttrStatsP stats, + AnalyzeAttrFetchFunc fetchfunc, + int samplerows, + double totalrows) { int i; int null_cnt = 0; -- 2.40.0