From f85155e18cb71a599724536e598e8d6f5e140454 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Fri, 3 Apr 2015 08:32:05 -0400 Subject: [PATCH] Change the way we decide whether to give up on abbreviated text keys. Be more aggressive about aborting early on if it looks like it's not helping, but be less aggressive about aborting later on, since it's more expensive at that point, and also since we're currently aborting in some cases where abbreviation can still deliver a substantial win. Peter Geoghegan. Extensive testing by Tomas Vondra. --- src/backend/utils/adt/varlena.c | 45 ++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 02e994972c..d874a1aec0 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -66,6 +66,7 @@ typedef struct bool collate_c; hyperLogLogState abbr_card; /* Abbreviated key cardinality state */ hyperLogLogState full_card; /* Full key cardinality state */ + double prop_card; /* Required cardinality proportion */ #ifdef HAVE_LOCALE_T pg_locale_t locale; #endif @@ -1845,6 +1846,7 @@ btsortsupport_worker(SortSupport ssup, Oid collid) */ if (abbreviate) { + tss->prop_card = 0.20; initHyperLogLog(&tss->abbr_card, 10); initHyperLogLog(&tss->full_card, 10); ssup->abbrev_full_comparator = ssup->comparator; @@ -2125,7 +2127,7 @@ bttext_abbrev_abort(int memtupcount, SortSupport ssup) Assert(ssup->abbreviate); /* Have a little patience */ - if (memtupcount < 20) + if (memtupcount < 100) return false; abbrev_distinct = estimateHyperLogLog(&tss->abbr_card); @@ -2151,8 +2153,9 @@ bttext_abbrev_abort(int memtupcount, SortSupport ssup) { double norm_abbrev_card = abbrev_distinct / (double) memtupcount; - elog(DEBUG_elog_output, "abbrev_distinct after %d: %f (key_distinct: %f, norm_abbrev_card: %f)", - memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card); + elog(DEBUG_elog_output, "abbrev_distinct after %d: %f (key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)", + memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card, + tss->prop_card); } #endif @@ -2172,8 +2175,38 @@ bttext_abbrev_abort(int memtupcount, SortSupport ssup) * abbreviated comparison with a cheap memcmp()-based authoritative * resolution are equivalent. */ - if (abbrev_distinct > key_distinct * 0.05) + if (abbrev_distinct > key_distinct * tss->prop_card) + { + /* + * When we have exceeded 10,000 tuples, decay required cardinality + * aggressively for next call. + * + * This is useful because the number of comparisons required on average + * increases at a linearithmic rate, and at roughly 10,000 tuples that + * factor will start to dominate over the linear costs of string + * transformation (this is a conservative estimate). The decay rate is + * chosen to be a little less aggressive than halving -- which (since + * we're called at points at which memtupcount has doubled) would never + * see the cost model actually abort past the first call following a + * decay. This decay rate is mostly a precaution against a sudden, + * violent swing in how well abbreviated cardinality tracks full key + * cardinality. The decay also serves to prevent a marginal case from + * being aborted too late, when too much has already been invested in + * string transformation. + * + * It's possible for sets of several million distinct strings with mere + * tens of thousands of distinct abbreviated keys to still benefit very + * significantly. This will generally occur provided each abbreviated + * key is a proxy for a roughly uniform number of the set's full keys. + * If it isn't so, we hope to catch that early and abort. If it isn't + * caught early, by the time the problem is apparent it's probably not + * worth aborting. + */ + if (memtupcount > 10000) + tss->prop_card *= 0.65; + return false; + } /* * Abort abbreviation strategy. @@ -2187,8 +2220,8 @@ bttext_abbrev_abort(int memtupcount, SortSupport ssup) * lose but much to gain, which our strategy reflects. */ #ifdef DEBUG_ABBREV_KEYS - elog(DEBUG_elog_output, "would have aborted abbreviation due to worst-case at %d. abbrev_distinct: %f, key_distinct: %f", - memtupcount, abbrev_distinct, key_distinct); + elog(DEBUG_elog_output, "would have aborted abbreviation due to worst-case at %d. abbrev_distinct: %f, key_distinct: %f, prop_card: %f", + memtupcount, abbrev_distinct, key_distinct, tss->prop_card); /* Actually abort only when debugging is disabled */ return false; #endif -- 2.40.0