sortKey->ssup_nulls_first =
(scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
sortKey->ssup_attno = scanKey->sk_attno;
+ /* Abbreviation is not supported here */
+ sortKey->abbreviate = false;
AssertState(sortKey->ssup_attno != 0);
/* We always use the default collation for statistics */
ssup.ssup_collation = DEFAULT_COLLATION_OID;
ssup.ssup_nulls_first = false;
+ /*
+ * For now, don't perform abbreviated key conversion, because full values
+ * are required for MCV slot generation. Supporting that optimization
+ * would necessitate teaching compare_scalars() to call a tie-breaker.
+ */
+ ssup.abbreviate = false;
PrepareSortSupportFromOrderingOp(mystats->ltopr, &ssup);
* We use a plain Datum sorter when there's a single input column;
* otherwise sort the full tuple. (See comments for
* process_ordered_aggregate_single.)
+ *
+ * In the future, we should consider forcing the
+ * tuplesort_begin_heap() case when the abbreviated key
+ * optimization can thereby be used, even when numInputs is 1.
*/
peraggstate->sortstate =
(peraggstate->numInputs == 1) ?
sortKey->ssup_nulls_first = node->nullsFirst[i];
sortKey->ssup_attno = node->sortColIdx[i];
+ /*
+ * It isn't feasible to perform abbreviated key conversion, since
+ * tuples are pulled into mergestate's binary heap as needed. It would
+ * likely be counter-productive to convert tuples into an abbreviated
+ * representation as they're pulled up, so opt out of that additional
+ * optimization entirely.
+ */
+ sortKey->abbreviate = false;
+
PrepareSortSupportFromOrderingOp(node->sortOperators[i], sortKey);
}
elog(ERROR, "cannot merge using non-equality operator %u",
qual->opno);
+ /*
+ * sortsupport routine must know if abbreviation optimization is
+ * applicable in principle. It is never applicable for merge joins
+ * because there is no convenient opportunity to convert to alternative
+ * representation.
+ */
+ clause->ssup.abbreviate = false;
+
/* And get the matching support or comparison function */
Assert(clause->ssup.comparator == NULL);
sortfunc = get_opfamily_proc(opfamily,
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
-OBJS = ilist.o binaryheap.o pairingheap.o rbtree.o stringinfo.o
+OBJS = ilist.o binaryheap.o hyperloglog.o pairingheap.o rbtree.o stringinfo.o
include $(top_srcdir)/src/backend/common.mk
--- /dev/null
+/*-------------------------------------------------------------------------
+ *
+ * hyperloglog.c
+ * HyperLogLog cardinality estimator
+ *
+ * Portions Copyright (c) 2014, PostgreSQL Global Development Group
+ *
+ * Based on Hideaki Ohno's C++ implementation. This is probably not ideally
+ * suited to estimating the cardinality of very large sets; in particular, we
+ * have not attempted to further optimize the implementation as described in
+ * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic
+ * Engineering of a State of The Art Cardinality Estimation Algorithm".
+ *
+ * A sparse representation of HyperLogLog state is used, with fixed space
+ * overhead.
+ *
+ * The copyright terms of Ohno's original version (the MIT license) follow.
+ *
+ * IDENTIFICATION
+ * src/backend/lib/hyperloglog.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the 'Software'), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "postgres.h"
+
+#include <math.h>
+
+#include "lib/hyperloglog.h"
+
+#define POW_2_32 (4294967296.0)
+#define NEG_POW_2_32 (-4294967296.0)
+
+static inline uint8 rho(uint32 x, uint8 b);
+
+/*
+ * Prepare *cState for use as an empty HyperLogLog estimator.
+ *
+ * bwidth is the number of leading hash bits used to pick a register, so the
+ * estimator ends up with 2 to the power of bwidth registers.  bwidth must be
+ * between 4 and 16 inclusive; anything else raises an ERROR.  The register
+ * array is obtained with palloc0().
+ */
+void
+initHyperLogLog(hyperLogLogState *cState, uint8 bwidth)
+{
+    double      biasCorrection;
+
+    if (!(bwidth >= 4 && bwidth <= 16))
+        elog(ERROR, "bit width must be between 4 and 16 inclusive");
+
+    cState->registerWidth = bwidth;
+    cState->nRegisters = 1 << bwidth;
+    cState->arrSize = sizeof(uint8) * cState->nRegisters + 1;
+
+    /*
+     * Registers start out as zero rather than negative infinity, per the
+     * discussion of the coupon collector problem in the HyperLogLog paper.
+     */
+    cState->hashesArr = palloc0(cState->arrSize);
+
+    /*
+     * Pick "alpha", the constant that corrects the systematic multiplicative
+     * bias present in m ^ 2 Z for a given number of registers (m).  The three
+     * smallest register counts have dedicated constants; every other count
+     * uses the closed-form approximation.
+     */
+    if (cState->nRegisters == 16)
+        biasCorrection = 0.673;
+    else if (cState->nRegisters == 32)
+        biasCorrection = 0.697;
+    else if (cState->nRegisters == 64)
+        biasCorrection = 0.709;
+    else
+        biasCorrection = 0.7213 / (1.0 + 1.079 / cState->nRegisters);
+
+    /*
+     * Cache alpha m ^ 2; estimateHyperLogLog() divides it by the register
+     * sum to obtain the "raw" estimate E.
+     */
+    cState->alphaMM = biasCorrection * cState->nRegisters * cState->nRegisters;
+}
+
+
+/*
+ * Record one element in the estimator, identified by a caller-supplied hash.
+ *
+ * The value passed must be a genuine hash (typically from hash_any()): the
+ * algorithm depends on the bits being uniformly distributed for each distinct
+ * original value, since it observes specific bit patterns in conjunction with
+ * stochastic averaging.
+ *
+ * The leading registerWidth bits of the hash choose a register; the rank of
+ * the remaining bits is stored there if it exceeds what the register already
+ * holds.
+ */
+void
+addHyperLogLog(hyperLogLogState *cState, uint32 hash)
+{
+    /* Number of hash bits left over once the register index is removed */
+    uint8       tailBits = BITS_PER_BYTE * sizeof(uint32) - cState->registerWidth;
+    uint32      reg;
+    uint8       rank;
+
+    /* The first registerWidth bits form a zero based register index */
+    reg = hash >> tailBits;
+
+    /* Rank of the remaining bits, shifted up to the most significant end */
+    rank = rho(hash << cState->registerWidth, tailBits);
+
+    /* Each register remembers the largest rank it has seen */
+    if (rank > cState->hashesArr[reg])
+        cState->hashesArr[reg] = rank;
+}
+
+
+/*
+ * Return the estimated number of distinct elements added to cState so far.
+ *
+ * The "raw" estimate (E in the HyperLogLog paper) is alphaMM divided by the
+ * sum of 2^-register over all registers.  It is then adjusted with the
+ * small range correction (when some registers are still zero) or the large
+ * range correction, whichever range the raw estimate falls in.
+ */
+double
+estimateHyperLogLog(hyperLogLogState *cState)
+{
+    double      estimate;
+    double      harmonicSum = 0.0;
+    int         emptyRegisters = 0;
+    int         reg;
+
+    /* One pass gathers both the harmonic sum and the empty-register count */
+    for (reg = 0; reg < cState->nRegisters; reg++)
+    {
+        harmonicSum += 1.0 / pow(2.0, cState->hashesArr[reg]);
+
+        if (cState->hashesArr[reg] == 0)
+            emptyRegisters++;
+    }
+
+    /* "Raw" HyperLogLog estimate */
+    estimate = cState->alphaMM / harmonicSum;
+
+    if (estimate <= (5.0 / 2.0) * cState->nRegisters)
+    {
+        /* Small range correction, usable only if a register is still empty */
+        if (emptyRegisters != 0)
+            estimate = cState->nRegisters * log((double) cState->nRegisters /
+                                                emptyRegisters);
+    }
+    else if (estimate > (1.0 / 30.0) * POW_2_32)
+    {
+        /* Large range correction */
+        estimate = NEG_POW_2_32 * log(1.0 - (estimate / POW_2_32));
+    }
+
+    return estimate;
+}
+
+
+/*
+ * Fold the state of oState into cState.
+ *
+ * Afterwards every register of cState holds the larger of the two
+ * corresponding registers, so a later estimateHyperLogLog(cState) estimates
+ * the cardinality of the union of both inputs.  Nothing is returned, and
+ * oState is left untouched.
+ *
+ * Both states must have the same number of registers; otherwise an ERROR is
+ * raised.
+ */
+void
+mergeHyperLogLog(hyperLogLogState *cState, const hyperLogLogState *oState)
+{
+    int         reg;
+
+    if (cState->nRegisters != oState->nRegisters)
+        elog(ERROR, "number of registers mismatch: %zu != %zu",
+             cState->nRegisters, oState->nRegisters);
+
+    for (reg = 0; reg < cState->nRegisters; ++reg)
+    {
+        /* Keep whichever side observed the higher rank */
+        if (oState->hashesArr[reg] > cState->hashesArr[reg])
+            cState->hashesArr[reg] = oState->hashesArr[reg];
+    }
+}
+
+
+
+/*
+ * Worker for addHyperLogLog().
+ *
+ * Returns the 1-based position of the first set bit among the first b bits
+ * of x, scanning from the most significant bit of x downwards.  If none of
+ * those b bits is set, b + 1 is returned.
+ *
+ * Example (when considering the first 10 bits of x, shown here as 10 bit
+ * values):
+ *
+ * rho(x = 0b1000000000) returns 1
+ * rho(x = 0b0010000000) returns 3
+ * rho(x = 0b0000000000) returns b + 1
+ *
+ * "The binary address determined by the first b bits of x"
+ *
+ * Return value "j" used to index bit pattern to watch.
+ */
+static inline uint8
+rho(uint32 x, uint8 b)
+{
+    uint8       pos;
+
+    for (pos = 1; pos <= b; pos++)
+    {
+        /* Stop as soon as the bit currently in the top position is set */
+        if (x & 0x80000000)
+            break;
+
+        x <<= 1;
+    }
+
+    return pos;
+}
osastate->qstate = qstate;
osastate->gcontext = gcontext;
- /* Initialize tuplesort object */
+ /*
+ * Initialize tuplesort object.
+ *
+ * In the future, we should consider forcing the tuplesort_begin_heap()
+ * case when the abbreviated key optimization can thereby be used, even
+ * when !use_tuples.
+ */
if (use_tuples)
osastate->sortstate = tuplesort_begin_heap(qstate->tupdesc,
qstate->numSortCols,
#include <ctype.h>
#include <limits.h>
+#include "access/hash.h"
#include "access/tuptoaster.h"
#include "catalog/pg_collation.h"
#include "catalog/pg_type.h"
+#include "lib/hyperloglog.h"
#include "libpq/md5.h"
#include "libpq/pqformat.h"
#include "miscadmin.h"
#include "utils/pg_locale.h"
#include "utils/sortsupport.h"
+#ifdef DEBUG_ABBREV_KEYS
+#define DEBUG_elog_output DEBUG1
+#endif
/* GUC variable */
int bytea_output = BYTEA_OUTPUT_HEX;
typedef struct
{
- char *buf1; /* 1st string */
- char *buf2; /* 2nd string */
+ char *buf1; /* 1st string, or abbreviation original string buf */
+ char *buf2; /* 2nd string, or abbreviation strxfrm() buf */
int buflen1;
int buflen2;
+ hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
+ hyperLogLogState full_card; /* Full key cardinality state */
#ifdef HAVE_LOCALE_T
pg_locale_t locale;
#endif
static void btsortsupport_worker(SortSupport ssup, Oid collid);
static int bttextfastcmp_c(Datum x, Datum y, SortSupport ssup);
static int bttextfastcmp_locale(Datum x, Datum y, SortSupport ssup);
+static int bttextcmp_abbrev(Datum x, Datum y, SortSupport ssup);
+static Datum bttext_abbrev_convert(Datum original, SortSupport ssup);
+static bool bttext_abbrev_abort(int memtupcount, SortSupport ssup);
static int32 text_length(Datum str);
static text *text_catenate(text *t1, text *t2);
static text *text_substring(Datum str,
{
TextSortSupport *tss;
- /*
- * If LC_COLLATE = C, we can make things quite a bit faster by using
- * memcmp() rather than strcoll(). To minimize the per-comparison
- * overhead, we make this decision just once for the whole sort.
- */
- if (lc_collate_is_c(collid))
- {
- ssup->comparator = bttextfastcmp_c;
- return;
- }
-
/*
* WIN32 requires complex hacks when the database encoding is UTF-8 (except
* when using the "C" collation). For now, we don't optimize that case.
*/
#ifdef WIN32
- if (GetDatabaseEncoding() == PG_UTF8)
+ if (GetDatabaseEncoding() == PG_UTF8 && !lc_collate_is_c(collid))
return;
#endif
/*
+ * On platforms where the abbreviated key for text optimization might have
+ * bad worst case performance, it may be useful to avoid it entirely by
+ * disabling it at compile time. Having only 4 byte datums could make
+ * worst-case performance drastically more likely, for example. Moreover,
+ * Darwin's strxfrm() implementation is known to not effectively
+ * concentrate a significant amount of entropy from the original string in
+ * earlier transformed blobs. It's possible that other supported platforms
+ * are similarly encumbered.
+ *
+ * Any reasonable implementation will pack primary weights into the start
+ * of returned blobs. The canonical algorithm's implementation is
+ * discussed by Unicode Technical Standard #10 ("UNICODE COLLATION
+ * ALGORITHM"), section 4, "Main algorithm". Section 4.3, "Form Sort Key"
+ * is of particular interest:
+ *
+ * http://www.unicode.org/reports/tr10/#Step_3
+ *
+ * The collation algorithm standard goes on to state:
+ *
+ * "By default, the algorithm makes use of three fully-customizable levels.
+ * For the Latin script, these levels correspond roughly to:
+ *
+ * alphabetic ordering
+ *
+ * diacritic ordering
+ *
+ * case ordering.
+ *
+ * A final level may be used for tie-breaking between strings not otherwise
+ * distinguished."
+ *
+ * It is generally expected that most non-equal keys will have their
+ * comparisons resolved at the primary level. If enough comparisons can be
+ * resolved with just 4 or 8 byte abbreviated keys, this optimization is
+ * very effective (although if there are many tie-breakers that largely
+ * only perform cheap memcmp() calls, that is also much faster than the
+ * unoptimized case - see bttext_abbrev_abort()).
+ *
* We may need a collation-sensitive comparison. To make things faster,
* we'll figure out the collation based on the locale id and cache the
* result. Also, since strxfrm()/strcoll() require NUL-terminated inputs,
#endif
}
- tss->buf1 = palloc(TEXTBUFLEN);
- tss->buflen1 = TEXTBUFLEN;
- tss->buf2 = palloc(TEXTBUFLEN);
- tss->buflen2 = TEXTBUFLEN;
+ /*
+ * If LC_COLLATE = C, we can make things quite a bit faster by using
+ * memcmp() rather than strcoll(). To minimize the per-comparison
+ * overhead, we make this decision just once for the whole sort.
+ *
+ * There is no reason to not at least perform fmgr elision on builds where
+ * abbreviation is disabled.
+ */
+ if (lc_collate_is_c(collid))
+ ssup->abbrev_full_comparator = ssup->comparator = bttextfastcmp_c;
+ else
+ ssup->abbrev_full_comparator = ssup->comparator = bttextfastcmp_locale;
+
+ if (!lc_collate_is_c(collid) || ssup->abbreviate)
+ {
+ /*
+ * Abbreviated case requires temp buffers for strxfrm() copying.
+ * bttextfastcmp_locale() also uses these buffers (even if abbreviation
+ * isn't used), while bttextfastcmp_c() does not.
+ */
+ tss->buf1 = palloc(TEXTBUFLEN);
+ tss->buflen1 = TEXTBUFLEN;
+ tss->buf2 = palloc(TEXTBUFLEN);
+ tss->buflen2 = TEXTBUFLEN;
+ ssup->ssup_extra = tss;
+ }
+
+ if (!ssup->abbreviate)
+ return;
- ssup->ssup_extra = tss;
- ssup->comparator = bttextfastcmp_locale;
+ initHyperLogLog(&tss->abbr_card, 10);
+ initHyperLogLog(&tss->full_card, 10);
+
+ /*
+ * Change comparator to be abbreviation-based -- abbreviated version will
+ * probably ultimately be used during sorting proper, but core code may
+ * switch back to authoritative comparator should abbreviation be aborted
+ */
+ ssup->comparator = bttextcmp_abbrev;
+ ssup->abbrev_converter = bttext_abbrev_convert;
+ ssup->abbrev_abort = bttext_abbrev_abort;
}
/*
return result;
}
+/*
+ * Abbreviated key comparison func
+ *
+ * x and y are abbreviated keys; their raw bytes are compared with memcmp()
+ * over the full width of a Datum.  ssup is not consulted.
+ */
+static int
+bttextcmp_abbrev(Datum x, Datum y, SortSupport ssup)
+{
+    /*
+     * A zero result is not authoritative: the core system will then call
+     * bttextfastcmp_c() or bttextfastcmp_locale().  Even a strcmp() on two
+     * non-truncated strxfrm() blobs cannot indicate *equality*
+     * authoritatively, for the same reason that there is a strcoll()
+     * tie-breaker call to strcmp() in varstr_cmp().
+     */
+    return memcmp(&x, &y, sizeof(Datum));
+}
+
+
+/*
+ * Conversion routine for sortsupport.  Converts original text to abbreviated
+ * key representation.  Our encoding strategy is simple -- pack the first
+ * sizeof(Datum) bytes (8, or 4 where Datums are only 4 bytes wide) of a
+ * strxfrm() blob into a Datum.
+ *
+ * original is the text datum to abbreviate.  ssup->ssup_extra must point to
+ * the TextSortSupport whose scratch buffers and cardinality estimators are
+ * used (and updated) here.  The return value is the pass-by-value
+ * abbreviated key.
+ */
+static Datum
+bttext_abbrev_convert(Datum original, SortSupport ssup)
+{
+    TextSortSupport *tss = (TextSortSupport *) ssup->ssup_extra;
+    text       *authoritative = DatumGetTextPP(original);
+
+    /* working state */
+    Datum       res;
+    char       *pres;
+    int         len;
+    Size        bsize;
+    uint32      hash;
+
+    /*
+     * Abbreviated key representation is a pass-by-value Datum that is treated
+     * as a char array by the specialized comparator bttextcmp_abbrev().
+     */
+    pres = (char *) &res;
+    /* memset(), so any non-overwritten bytes are NUL */
+    memset(pres, 0, sizeof(Datum));
+    len = VARSIZE_ANY_EXHDR(authoritative);
+
+    /* By convention, we use buffer 1 to store and NUL-terminate text */
+    if (len >= tss->buflen1)
+    {
+        pfree(tss->buf1);
+        tss->buflen1 = Max(len + 1, Min(tss->buflen1 * 2, MaxAllocSize));
+        tss->buf1 = palloc(tss->buflen1);
+    }
+
+    /* Just like strcoll(), strxfrm() expects a NUL-terminated string */
+    memcpy(tss->buf1, VARDATA_ANY(authoritative), len);
+    tss->buf1[len] = '\0';
+
+    /* Don't leak memory here */
+    if (PointerGetDatum(authoritative) != original)
+        pfree(authoritative);
+
+retry:
+
+    /*
+     * There is no special handling of the C locale here, unlike with
+     * varstr_cmp().  strxfrm() is used indifferently.
+     */
+#ifdef HAVE_LOCALE_T
+    if (tss->locale)
+        bsize = strxfrm_l(tss->buf2, tss->buf1, tss->buflen2, tss->locale);
+    else
+#endif
+        bsize = strxfrm(tss->buf2, tss->buf1, tss->buflen2);
+
+    if (bsize >= tss->buflen2)
+    {
+        /*
+         * The C standard states that the contents of the buffer is now
+         * unspecified.  Grow buffer, and retry.
+         */
+        pfree(tss->buf2);
+        tss->buflen2 = Max(bsize + 1, Min(tss->buflen2 * 2, MaxAllocSize));
+        tss->buf2 = palloc(tss->buflen2);
+        goto retry;
+    }
+
+    /*
+     * Maintain approximate cardinality of both abbreviated keys and original,
+     * authoritative keys using HyperLogLog.  Used as cheap insurance against
+     * the worst case, where we do many string transformations for no saving in
+     * full strcoll()-based comparisons.  These statistics are used by
+     * bttext_abbrev_abort().
+     *
+     * First, hash key proper, or a significant fraction of it.  Only the
+     * first PG_CACHE_LINE_SIZE bytes are hashed, so as to limit the overhead
+     * of hashing; mix in the length in order to compensate for cases where
+     * differences are past that point.
+     *
+     * The hash functions return a Datum, so convert explicitly with
+     * DatumGetUInt32() rather than relying on implicit truncation.
+     */
+    hash = DatumGetUInt32(hash_any((unsigned char *) tss->buf1,
+                                   Min(len, PG_CACHE_LINE_SIZE)));
+
+    if (len > PG_CACHE_LINE_SIZE)
+        hash ^= DatumGetUInt32(hash_uint32((uint32) len));
+
+    addHyperLogLog(&tss->full_card, hash);
+
+    /* Pack as much of the blob as fits; the remaining bytes stay NUL */
+    memcpy(pres, tss->buf2, Min(sizeof(Datum), bsize));
+
+    /* Hash abbreviated key */
+#if SIZEOF_DATUM == 8
+    {
+        uint32      lohalf,
+                    hihalf;
+
+        lohalf = (uint32) res;
+        hihalf = (uint32) (res >> 32);
+        hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
+    }
+#else                           /* SIZEOF_DATUM != 8 */
+    hash = DatumGetUInt32(hash_uint32((uint32) res));
+#endif
+
+    addHyperLogLog(&tss->abbr_card, hash);
+
+    /*
+     * Every Datum byte is always compared.  This is safe because the strxfrm()
+     * blob is itself NUL terminated, leaving no danger of misinterpreting any
+     * NUL bytes not intended to be interpreted as logically representing
+     * termination.
+     */
+    return res;
+}
+
+
+/*
+ * Callback for estimating effectiveness of abbreviated key optimization, using
+ * heuristic rules.
+ *
+ * memtupcount is the number of tuples seen so far; ssup->ssup_extra supplies
+ * the HyperLogLog states maintained by bttext_abbrev_convert().  Returns true
+ * if the abbreviation optimization should be aborted, based on its projected
+ * effectiveness, and false if it should carry on.
+ */
+static bool
+bttext_abbrev_abort(int memtupcount, SortSupport ssup)
+{
+    TextSortSupport *tss = (TextSortSupport *) ssup->ssup_extra;
+    double      abbrevCard;
+    double      fullCard;
+
+    Assert(ssup->abbreviate);
+
+    /* Have a little patience */
+    if (memtupcount < 20)
+        return false;
+
+    abbrevCard = estimateHyperLogLog(&tss->abbr_card);
+    fullCard = estimateHyperLogLog(&tss->full_card);
+
+    /*
+     * Clamp cardinality estimates to at least one distinct value.  While NULLs
+     * are generally disregarded, if only NULL values were seen so far, that
+     * might misrepresent costs if we failed to clamp.
+     */
+    abbrevCard = (abbrevCard <= 1.0) ? 1.0 : abbrevCard;
+    fullCard = (fullCard <= 1.0) ? 1.0 : fullCard;
+
+    /*
+     * In the worst case all abbreviated keys are identical, while at the same
+     * time there are differences within full key strings not captured in
+     * abbreviations.
+     */
+#ifdef DEBUG_ABBREV_KEYS
+    {
+        double      normAbbrevCard = abbrevCard / (double) memtupcount;
+
+        elog(DEBUG_elog_output, "abbrev_distinct after %d: %f (key_distinct: %f, norm_abbrev_card: %f)",
+             memtupcount, abbrevCard, fullCard, normAbbrevCard);
+    }
+#endif
+
+    /*
+     * If the number of distinct abbreviated keys approximately matches the
+     * number of distinct authoritative original keys, that's reason enough to
+     * proceed.  We can win even with a very low cardinality set if most
+     * tie-breakers only memcmp().  This is by far the most important
+     * consideration.
+     *
+     * While comparisons that are resolved at the abbreviated key level are
+     * considerably cheaper than tie-breakers resolved with memcmp(), both of
+     * those two outcomes are so much cheaper than a full strcoll() once
+     * sorting is underway that it doesn't seem worth it to weigh abbreviated
+     * cardinality against the overall size of the set in order to more
+     * accurately model costs.  Assume that an abbreviated comparison, and an
+     * abbreviated comparison with a cheap memcmp()-based authoritative
+     * resolution are equivalent.
+     *
+     * Failing that test means aborting the abbreviation strategy.  The worst
+     * case, where all abbreviated keys are identical while all original
+     * strings differ will typically only see a regression of about 10% in
+     * execution time for small to medium sized lists of strings.  Whereas on
+     * modern CPUs where cache stalls are the dominant cost, we can often
+     * expect very large improvements, particularly with sets of strings of
+     * moderately high to high abbreviated cardinality.  There is little to
+     * lose but much to gain, which our strategy reflects.
+     */
+#ifdef DEBUG_ABBREV_KEYS
+    /* Debug builds only report the would-be abort; they never act on it */
+    if (!(abbrevCard > fullCard * 0.05))
+        elog(DEBUG_elog_output, "would have aborted abbreviation due to worst-case at %d. abbrev_distinct: %f, key_distinct: %f",
+             memtupcount, abbrevCard, fullCard);
+    return false;
+#else
+    return !(abbrevCard > fullCard * 0.05);
+#endif
+}
+
+
Datum
text_larger(PG_FUNCTION_ARGS)
{
* When sorting single Datums, the data value is represented directly by
* datum1/isnull1. If the datatype is pass-by-reference and isnull1 is false,
* then datum1 points to a separately palloc'd data value that is also pointed
- * to by the "tuple" pointer; otherwise "tuple" is NULL.
+ * to by the "tuple" pointer; otherwise "tuple" is NULL. There is one special
+ * case: when the sort support infrastructure provides an "abbreviated key"
+ * representation, where the key is (typically) a pass by value proxy for a
+ * pass by reference type.
*
* While building initial runs, tupindex holds the tuple's run number. During
* merge passes, we re-use it to hold the input tape number that each tuple in
*/
SortSupport onlyKey;
+ /*
+ * Additional state for managing "abbreviated key" sortsupport routines
+ * (which currently may be used by all cases except the Datum sort case and
+ * hash index case). Tracks the intervals at which the optimization's
+ * effectiveness is tested.
+ */
+ int64 abbrevNext; /* Tuple # at which to next check applicability */
+
/*
* These variables are specific to the CLUSTER case; they are set by
* tuplesort_begin_cluster.
static Tuplesortstate *tuplesort_begin_common(int workMem, bool randomAccess);
static void puttuple_common(Tuplesortstate *state, SortTuple *tuple);
+static bool consider_abort_common(Tuplesortstate *state);
static void inittapes(Tuplesortstate *state);
static void selectnewtape(Tuplesortstate *state);
static void mergeruns(Tuplesortstate *state);
state->readtup = readtup_heap;
state->tupDesc = tupDesc; /* assume we need not copy tupDesc */
+ state->abbrevNext = 10;
/* Prepare SortSupport data for each column */
state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData));
sortKey->ssup_collation = sortCollations[i];
sortKey->ssup_nulls_first = nullsFirstFlags[i];
sortKey->ssup_attno = attNums[i];
+ /* Convey if abbreviation optimization is applicable in principle */
+ sortKey->abbreviate = (i == 0);
PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey);
}
- if (nkeys == 1)
+ /*
+ * The "onlyKey" optimization cannot be used with abbreviated keys, since
+ * tie-breaker comparisons may be required. Typically, the optimization is
+ * only of value to pass-by-value types anyway, whereas abbreviated keys
+ * are typically only of value to pass-by-reference types.
+ */
+ if (nkeys == 1 && !state->sortKeys->abbrev_converter)
state->onlyKey = state->sortKeys;
MemoryContextSwitchTo(oldcontext);
state->copytup = copytup_cluster;
state->writetup = writetup_cluster;
state->readtup = readtup_cluster;
+ state->abbrevNext = 10;
state->indexInfo = BuildIndexInfo(indexRel);
sortKey->ssup_nulls_first =
(scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
sortKey->ssup_attno = scanKey->sk_attno;
+ /* Convey if abbreviation optimization is applicable in principle */
+ sortKey->abbreviate = (i == 0);
AssertState(sortKey->ssup_attno != 0);
state->copytup = copytup_index;
state->writetup = writetup_index;
state->readtup = readtup_index;
+ state->abbrevNext = 10;
state->heapRel = heapRel;
state->indexRel = indexRel;
sortKey->ssup_nulls_first =
(scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
sortKey->ssup_attno = scanKey->sk_attno;
+ /* Convey if abbreviation optimization is applicable in principle */
+ sortKey->abbreviate = (i == 0);
AssertState(sortKey->ssup_attno != 0);
state->onlyKey->ssup_cxt = CurrentMemoryContext;
state->onlyKey->ssup_collation = sortCollation;
state->onlyKey->ssup_nulls_first = nullsFirstFlag;
+ /*
+ * Conversion to abbreviated representation infeasible in the Datum case.
+ * It must be possible to subsequently fetch original datum values within
+ * tuplesort_getdatum(), which would require special-case preservation of
+ * original values.
+ */
+ state->onlyKey->abbreviate = false;
PrepareSortSupportFromOrderingOp(sortOperator, state->onlyKey);
state->bounded = true;
state->bound = (int) bound;
+
+ /*
+ * Bounded sorts are not an effective target for abbreviated key
+ * optimization. Disable by setting state to be consistent with no
+ * abbreviation support.
+ */
+ state->sortKeys->abbrev_converter = NULL;
+ if (state->sortKeys->abbrev_full_comparator)
+ state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator;
+
+ /* Not strictly necessary, but be tidy */
+ state->sortKeys->abbrev_abort = NULL;
+ state->sortKeys->abbrev_full_comparator = NULL;
}
/*
{
MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
SortTuple stup;
+ Datum original;
+ IndexTuple tuple;
stup.tuple = index_form_tuple(RelationGetDescr(rel), values, isnull);
- ((IndexTuple) stup.tuple)->t_tid = *self;
+ tuple = ((IndexTuple) stup.tuple);
+ tuple->t_tid = *self;
USEMEM(state, GetMemoryChunkSpace(stup.tuple));
/* set up first-column key value */
- stup.datum1 = index_getattr((IndexTuple) stup.tuple,
- 1,
- RelationGetDescr(state->indexRel),
- &stup.isnull1);
+ original = index_getattr(tuple,
+ 1,
+ RelationGetDescr(state->indexRel),
+ &stup.isnull1);
+
+ if (!state->sortKeys->abbrev_converter || stup.isnull1)
+ {
+ /*
+ * Store ordinary Datum representation, or NULL value. If there is a
+ * converter it won't expect NULL values, and cost model is not
+ * required to account for NULL, so in that case we avoid calling
+ * converter and just set datum1 to "void" representation (to be
+ * consistent).
+ */
+ stup.datum1 = original;
+ }
+ else if (!consider_abort_common(state))
+ {
+ /* Store abbreviated key representation */
+ stup.datum1 = state->sortKeys->abbrev_converter(original,
+ state->sortKeys);
+ }
+ else
+ {
+ /* Abort abbreviation */
+ int i;
+
+ stup.datum1 = original;
+
+ /*
+ * Set state to be consistent with never trying abbreviation.
+ *
+ * Alter datum1 representation in already-copied tuples, so as to
+ * ensure a consistent representation (current tuple was just handled).
+ * Note that we rely on all tuples copied so far actually being
+ * contained within memtuples array.
+ */
+ for (i = 0; i < state->memtupcount; i++)
+ {
+ SortTuple *mtup = &state->memtuples[i];
+
+ /*
+ * The NULL flag must be written back into this tuple's own slot
+ * (mtup). stup is the tuple currently being added; its isnull1
+ * was already set above and must not be overwritten here.
+ */
+ tuple = mtup->tuple;
+ mtup->datum1 = index_getattr(tuple,
+ 1,
+ RelationGetDescr(state->indexRel),
+ &mtup->isnull1);
+ }
+ }
+
puttuple_common(state, &stup);
MemoryContextSwitchTo(oldcontext);
}
}
+/*
+ * Decide whether the abbreviated key optimization should be abandoned.
+ *
+ * Must only be called while the leading sort key still has abbreviation set
+ * up.  The check is made only while state->status is TSS_INITIAL, and only
+ * once memtupcount has reached abbrevNext, which is doubled on every check.
+ *
+ * Returns true if abbreviation has just been aborted, in which case the
+ * leading key has been switched back to its authoritative comparator and the
+ * caller must store original, non-abbreviated datums.  Returns false if
+ * abbreviation remains in use.
+ */
+static bool
+consider_abort_common(Tuplesortstate *state)
+{
+    SortSupport leadKey = &state->sortKeys[0];
+
+    Assert(leadKey->abbrev_converter != NULL);
+    Assert(leadKey->abbrev_abort != NULL);
+    Assert(leadKey->abbrev_full_comparator != NULL);
+
+    /*
+     * Effectiveness of the optimization is only checked while still within
+     * the memory limit, and only at the scheduled tuple counts.
+     */
+    if (state->status != TSS_INITIAL ||
+        state->memtupcount < state->abbrevNext)
+        return false;
+
+    state->abbrevNext *= 2;
+
+    /*
+     * Ask the opclass-supplied abbreviation abort routine.  It may indicate
+     * that abbreviation should proceed, in which case there is nothing to do.
+     */
+    if (!leadKey->abbrev_abort(state->memtupcount, leadKey))
+        return false;
+
+    /*
+     * Finally, restore authoritative comparator, and indicate that
+     * abbreviation is not in play by setting abbrev_converter to NULL
+     */
+    leadKey->comparator = leadKey->abbrev_full_comparator;
+    leadKey->abbrev_converter = NULL;
+    /* Not strictly necessary, but be tidy */
+    leadKey->abbrev_abort = NULL;
+    leadKey->abbrev_full_comparator = NULL;
+
+    /* Give up - callers go back to the original datum representation */
+    return true;
+}
+
+
/*
* All tuples have been provided; finish the sort.
*/
TupleDesc tupDesc;
int nkey;
int32 compare;
+ AttrNumber attno;
+ Datum datum1,
+ datum2;
+ bool isnull1,
+ isnull2;
+
/* Compare the leading sort key */
compare = ApplySortComparator(a->datum1, a->isnull1,
rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET);
tupDesc = state->tupDesc;
+
+ if (sortKey->abbrev_converter)
+ {
+ attno = sortKey->ssup_attno;
+
+ datum1 = heap_getattr(<up, attno, tupDesc, &isnull1);
+ datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
+
+ compare = ApplySortAbbrevFullComparator(datum1, isnull1,
+ datum2, isnull2,
+ sortKey);
+ if (compare != 0)
+ return compare;
+ }
+
sortKey++;
for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++)
{
- AttrNumber attno = sortKey->ssup_attno;
- Datum datum1,
- datum2;
- bool isnull1,
- isnull2;
+ attno = sortKey->ssup_attno;
datum1 = heap_getattr(<up, attno, tupDesc, &isnull1);
datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
* MinimalTuple using the exported interface for that.
*/
TupleTableSlot *slot = (TupleTableSlot *) tup;
+ Datum original;
MinimalTuple tuple;
HeapTupleData htup;
/* set up first-column key value */
htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
- stup->datum1 = heap_getattr(&htup,
- state->sortKeys[0].ssup_attno,
- state->tupDesc,
- &stup->isnull1);
+ original = heap_getattr(&htup,
+ state->sortKeys[0].ssup_attno,
+ state->tupDesc,
+ &stup->isnull1);
+
+ if (!state->sortKeys->abbrev_converter || stup->isnull1)
+ {
+ /*
+ * Store ordinary Datum representation, or NULL value. If there is a
+ * converter it won't expect NULL values, and cost model is not
+ * required to account for NULL, so in that case we avoid calling
+ * converter and just set datum1 to "void" representation (to be
+ * consistent).
+ */
+ stup->datum1 = original;
+ }
+ else if (!consider_abort_common(state))
+ {
+ /* Store abbreviated key representation */
+ stup->datum1 = state->sortKeys->abbrev_converter(original,
+ state->sortKeys);
+ }
+ else
+ {
+ /* Abort abbreviation */
+ int i;
+
+ stup->datum1 = original;
+
+ /*
+ * Set state to be consistent with never trying abbreviation.
+ *
+ * Alter datum1 representation in already-copied tuples, so as to
+ * ensure a consistent representation (current tuple was just handled).
+ * Note that we rely on all tuples copied so far actually being
+ * contained within memtuples array.
+ */
+ for (i = 0; i < state->memtupcount; i++)
+ {
+ SortTuple *mtup = &state->memtuples[i];
+
+ htup.t_len = ((MinimalTuple) mtup->tuple)->t_len +
+ MINIMAL_TUPLE_OFFSET;
+ htup.t_data = (HeapTupleHeader) ((char *) mtup->tuple -
+ MINIMAL_TUPLE_OFFSET);
+
+ mtup->datum1 = heap_getattr(&htup,
+ state->sortKeys[0].ssup_attno,
+ state->tupDesc,
+ &mtup->isnull1);
+ }
+ }
}
static void
TupleDesc tupDesc;
int nkey;
int32 compare;
+ Datum datum1,
+ datum2;
+ bool isnull1,
+ isnull2;
+ AttrNumber leading = state->indexInfo->ii_KeyAttrNumbers[0];
+
+ /* Be prepared to compare additional sort keys */
+ ltup = (HeapTuple) a->tuple;
+ rtup = (HeapTuple) b->tuple;
+ tupDesc = state->tupDesc;
/* Compare the leading sort key, if it's simple */
- if (state->indexInfo->ii_KeyAttrNumbers[0] != 0)
+ if (leading != 0)
{
compare = ApplySortComparator(a->datum1, a->isnull1,
b->datum1, b->isnull1,
sortKey);
+ if (compare != 0)
+ return compare;
+
+ if (sortKey->abbrev_converter)
+ {
+ datum1 = heap_getattr(ltup, leading, tupDesc, &isnull1);
+ datum2 = heap_getattr(rtup, leading, tupDesc, &isnull2);
+
+ compare = ApplySortAbbrevFullComparator(datum1, isnull1,
+ datum2, isnull2,
+ sortKey);
+ }
if (compare != 0 || state->nKeys == 1)
return compare;
/* Compare additional columns the hard way */
nkey = 0;
}
- /* Compare additional sort keys */
- ltup = (HeapTuple) a->tuple;
- rtup = (HeapTuple) b->tuple;
-
if (state->indexInfo->ii_Expressions == NULL)
{
/* If not expression index, just compare the proper heap attrs */
- tupDesc = state->tupDesc;
for (; nkey < state->nKeys; nkey++, sortKey++)
{
AttrNumber attno = state->indexInfo->ii_KeyAttrNumbers[nkey];
- Datum datum1,
- datum2;
- bool isnull1,
- isnull2;
datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1);
datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2);
copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup)
{
HeapTuple tuple = (HeapTuple) tup;
+ Datum original;
/* copy the tuple into sort storage */
tuple = heap_copytuple(tuple);
stup->tuple = (void *) tuple;
USEMEM(state, GetMemoryChunkSpace(tuple));
- /* set up first-column key value, if it's a simple column */
- if (state->indexInfo->ii_KeyAttrNumbers[0] != 0)
- stup->datum1 = heap_getattr(tuple,
- state->indexInfo->ii_KeyAttrNumbers[0],
- state->tupDesc,
- &stup->isnull1);
+ /*
+ * set up first-column key value, and potentially abbreviate, if it's a
+ * simple column
+ */
+ if (state->indexInfo->ii_KeyAttrNumbers[0] == 0)
+ return;
+
+ original = heap_getattr(tuple,
+ state->indexInfo->ii_KeyAttrNumbers[0],
+ state->tupDesc,
+ &stup->isnull1);
+
+ if (!state->sortKeys->abbrev_converter || stup->isnull1)
+ {
+ /*
+ * Store ordinary Datum representation, or NULL value. If there is a
+ * converter it won't expect NULL values, and cost model is not
+ * required to account for NULL, so in that case we avoid calling
+ * converter and just set datum1 to "void" representation (to be
+ * consistent).
+ */
+ stup->datum1 = original;
+ }
+ else if (!consider_abort_common(state))
+ {
+ /* Store abbreviated key representation */
+ stup->datum1 = state->sortKeys->abbrev_converter(original,
+ state->sortKeys);
+ }
+ else
+ {
+ /* Abort abbreviation */
+ int i;
+
+ stup->datum1 = original;
+
+ /*
+ * Set state to be consistent with never trying abbreviation.
+ *
+ * Alter datum1 representation in already-copied tuples, so as to
+ * ensure a consistent representation (current tuple was just handled).
+ * Note that we rely on all tuples copied so far actually being
+ * contained within memtuples array.
+ */
+ for (i = 0; i < state->memtupcount; i++)
+ {
+ SortTuple *mtup = &state->memtuples[i];
+
+ tuple = (HeapTuple) mtup->tuple;
+
+ /*
+ * Write the null flag through mtup, not stup: the flag belongs to
+ * the stored tuple being rewritten, and writing through stup would
+ * clobber the null flag of the tuple currently being copied.
+ */
+ mtup->datum1 = heap_getattr(tuple,
+ state->indexInfo->ii_KeyAttrNumbers[0],
+ state->tupDesc,
+ &mtup->isnull1);
+ }
+ }
}
static void
bool equal_hasnull = false;
int nkey;
int32 compare;
+ Datum datum1,
+ datum2;
+ bool isnull1,
+ isnull2;
+
/* Compare the leading sort key */
compare = ApplySortComparator(a->datum1, a->isnull1,
if (compare != 0)
return compare;
- /* they are equal, so we only need to examine one null flag */
- if (a->isnull1)
- equal_hasnull = true;
-
/* Compare additional sort keys */
tuple1 = (IndexTuple) a->tuple;
tuple2 = (IndexTuple) b->tuple;
keysz = state->nKeys;
tupDes = RelationGetDescr(state->indexRel);
+
+ if (sortKey->abbrev_converter)
+ {
+ datum1 = index_getattr(tuple1, 1, tupDes, &isnull1);
+ datum2 = index_getattr(tuple2, 1, tupDes, &isnull2);
+
+ compare = ApplySortAbbrevFullComparator(datum1, isnull1,
+ datum2, isnull2,
+ sortKey);
+ if (compare != 0)
+ return compare;
+ }
+
+ /* they are equal, so we only need to examine one null flag */
+ if (a->isnull1)
+ equal_hasnull = true;
+
sortKey++;
for (nkey = 2; nkey <= keysz; nkey++, sortKey++)
{
- Datum datum1,
- datum2;
- bool isnull1,
- isnull2;
-
datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1);
datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2);
IndexTuple tuple = (IndexTuple) tup;
unsigned int tuplen = IndexTupleSize(tuple);
IndexTuple newtuple;
+ Datum original;
/* copy the tuple into sort storage */
newtuple = (IndexTuple) palloc(tuplen);
USEMEM(state, GetMemoryChunkSpace(newtuple));
stup->tuple = (void *) newtuple;
/* set up first-column key value */
- stup->datum1 = index_getattr(newtuple,
- 1,
- RelationGetDescr(state->indexRel),
- &stup->isnull1);
+ original = index_getattr(newtuple,
+ 1,
+ RelationGetDescr(state->indexRel),
+ &stup->isnull1);
+
+ if (!state->sortKeys->abbrev_converter || stup->isnull1)
+ {
+ /*
+ * Store ordinary Datum representation, or NULL value. If there is a
+ * converter it won't expect NULL values, and cost model is not
+ * required to account for NULL, so in that case we avoid calling
+ * converter and just set datum1 to "void" representation (to be
+ * consistent).
+ */
+ stup->datum1 = original;
+ }
+ else if (!consider_abort_common(state))
+ {
+ /* Store abbreviated key representation */
+ stup->datum1 = state->sortKeys->abbrev_converter(original,
+ state->sortKeys);
+ }
+ else
+ {
+ /* Abort abbreviation */
+ int i;
+
+ stup->datum1 = original;
+
+ /*
+ * Set state to be consistent with never trying abbreviation.
+ *
+ * Alter datum1 representation in already-copied tuples, so as to
+ * ensure a consistent representation (current tuple was just handled).
+ * Note that we rely on all tuples copied so far actually being
+ * contained within memtuples array.
+ */
+ for (i = 0; i < state->memtupcount; i++)
+ {
+ SortTuple *mtup = &state->memtuples[i];
+
+ tuple = (IndexTuple) mtup->tuple;
+
+ /*
+ * Write the null flag through mtup, not stup: the flag belongs to
+ * the stored tuple being rewritten, and writing through stup would
+ * clobber the null flag of the tuple currently being copied.
+ */
+ mtup->datum1 = index_getattr(tuple,
+ 1,
+ RelationGetDescr(state->indexRel),
+ &mtup->isnull1);
+ }
+ }
}
static void
--- /dev/null
+/*
+ * hyperloglog.h
+ *
+ * A simple HyperLogLog cardinality estimator implementation
+ *
+ * Portions Copyright (c) 2014, PostgreSQL Global Development Group
+ *
+ * Based on Hideaki Ohno's C++ implementation. The copyright terms of Ohno's
+ * original version (the MIT license) follow.
+ *
+ * src/include/lib/hyperloglog.h
+ */
+
+/*
+ * Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the 'Software'), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef HYPERLOGLOG_H
+#define HYPERLOGLOG_H
+
+/*
+ * HyperLogLog is an approximate technique for computing the number of distinct
+ * entries in a set. Importantly, it does this by using a fixed amount of
+ * memory. See the 2007 paper "HyperLogLog: the analysis of a near-optimal
+ * cardinality estimation algorithm" for more.
+ *
+ * hyperLogLogState
+ *
+ * registerWidth register width, in bits ("k")
+ * nRegisters number of registers
+ * alphaMM alpha * m ^ 2 (see initHyperLogLog())
+ * hashesArr array of hashes
+ * arrSize size of hashesArr
+ */
+typedef struct hyperLogLogState
+{
+ uint8 registerWidth; /* register width, in bits ("k") */
+ Size nRegisters; /* number of registers */
+ double alphaMM; /* alpha * m ^ 2 (see initHyperLogLog()) */
+ uint8 *hashesArr; /* array of hashes */
+ Size arrSize; /* size of hashesArr; NOTE(review): units (bytes vs. elements) not shown here -- confirm */
+} hyperLogLogState;
+
+extern void initHyperLogLog(hyperLogLogState *cState, uint8 bwidth);
+extern void addHyperLogLog(hyperLogLogState *cState, uint32 hash);
+extern double estimateHyperLogLog(hyperLogLogState *cState);
+extern void mergeHyperLogLog(hyperLogLogState *cState, const hyperLogLogState *oState);
+
+#endif /* HYPERLOGLOG_H */
#endif
/*
- * Assumed cache line size. This doesn't affect correctness, but can be
- * used for low-level optimizations. Currently, this is only used to pad
- * some data structures in xlog.c, to ensure that highly-contended fields
- * are on different cache lines. Too small a value can hurt performance due
- * to false sharing, while the only downside of too large a value is a few
- * bytes of wasted memory. The default is 128, which should be large enough
- * for all supported platforms.
+ * Assumed cache line size. This doesn't affect correctness, but can be used
+ * for low-level optimizations. Currently, this is used to pad some data
+ * structures in xlog.c, to ensure that highly-contended fields are on
+ * different cache lines. Too small a value can hurt performance due to false
+ * sharing, while the only downside of too large a value is a few bytes of
+ * wasted memory. The default is 128, which should be large enough for all
+ * supported platforms.
*/
#define PG_CACHE_LINE_SIZE 128
* required to provide all of them. The BTSORTSUPPORT function should
* simply not set any function pointers for mechanisms it doesn't support.
* Opclasses that provide BTSORTSUPPORT and don't provide a comparator
- * function will have a shim set up by sort support automatically.
+ * function will have a shim set up by sort support automatically. However,
+ * opclasses that support the optional additional abbreviated key capability
+ * must always provide an authoritative comparator used to tie-break
+ * inconclusive abbreviated comparisons and also used when aborting
+ * abbreviation. Furthermore, a converter and abort/costing function must be
+ * provided.
*
* All sort support functions will be passed the address of the
* SortSupportData struct when called, so they can use it to store
* than, equal to, or greater than y. Note that x and y are guaranteed
* not null, and there is no way to return null either. Do not return
* INT_MIN, as callers are allowed to negate the result before using it.
+ *
+ * This may be either the authoritative comparator, or the abbreviated
+ * comparator. Core code may switch this over the initial preference of an
+ * opclass support function despite originally indicating abbreviation was
+ * applicable, by assigning the authoritative comparator back.
*/
int (*comparator) (Datum x, Datum y, SortSupport ssup);
/*
- * Additional sort-acceleration functions might be added here later.
+ * "Abbreviated key" infrastructure follows.
+ *
+ * All callbacks must be set by sortsupport opclasses that make use of this
+ * optional additional infrastructure (unless for whatever reasons the
+ * opclass doesn't proceed with abbreviation, in which case
+ * abbrev_converter must not be set).
+ *
+ * This allows opclass authors to supply a conversion routine, used to
+ * create an alternative representation of the underlying type (an
+ * "abbreviated key"). Typically, this representation is an ad-hoc,
+ * pass-by-value Datum format that only the opclass has knowledge of. An
+ * alternative comparator, used only with this alternative representation
+ * must also be provided (which is assigned to "comparator"). This
+ * representation is a simple approximation of the original Datum. It must
+ * be possible to compare datums of this representation with each other
+ * using the supplied alternative comparator, and have any non-zero return
+ * value be a reliable proxy for what a proper comparison would indicate.
+ * Unlike with a conventional support routine 1, though, returning zero
+ * from the alternative comparator does not indicate equality -- it only
+ * indicates that it wasn't possible to determine how the two abbreviated
+ * values compared. A proper comparison, using "abbrev_full_comparator"/
+ * ApplySortAbbrevFullComparator() is therefore required. In many cases
+ * this results in most or all comparisons only using the cheap alternative
+ * comparison func, which is typically implemented as code that compiles to
+ * just a few CPU instructions. CPU cache miss penalties are expensive; to
+ * get good overall performance, sort infrastructure must heavily weigh
+ * cache performance.
+ *
+ * Opclass authors must consider the final cardinality of abbreviated keys
+ * when devising an encoding scheme. It's possible for a strategy to work
+ * better than an alternative strategy with one usage pattern, while the
+ * reverse might be true for another usage pattern. All of these factors
+ * must be considered.
*/
+
+ /*
+ * "abbreviate" concerns whether or not the abbreviated key optimization is
+ * applicable in principle (that is, the sortsupport routine needs to know
+ * if it's dealing with a key where an abbreviated representation can
+ * usefully be packed together. Conventionally, this is the leading
+ * attribute key). Note, however, that in order to determine that
+ * abbreviation is not in play, the core code always checks whether or not
+ * the opclass has set abbrev_converter. This is a one way, one time
+ * message to the opclass.
+ */
+ bool abbreviate;
+
+ /*
+ * Converter to abbreviated format, from original representation. Core
+ * code uses this callback to convert from a pass-by-reference "original"
+ * Datum to a pass-by-value abbreviated key Datum. Note that original is
+ * guaranteed NOT NULL, because it doesn't make sense to factor NULLness
+ * into ad-hoc cost model.
+ *
+ * abbrev_converter is tested to see if abbreviation is in play. Core code
+ * may set it to NULL to indicate abbreviation should not be used (which is
+ * something sortsupport routines need not concern themselves with).
+ * However, sortsupport routines must not set it when it is immediately
+ * established that abbreviation should not proceed (e.g., for calls where
+ * "abbreviate" is false, or platform-specific impediments to using
+ * abbreviation).
+ */
+ Datum (*abbrev_converter) (Datum original, SortSupport ssup);
+
+ /*
+ * abbrev_abort callback allows clients to verify that the current strategy
+ * is working out, using an ad-hoc cost model defined by the sortsupport
+ * routine. If there are a lot of duplicate abbreviated keys in practice,
+ * it's useful to be able to abandon the strategy before paying too high a
+ * cost in conversion (perhaps certain opclass-specific adaptations are
+ * useful too).
+ */
+ bool (*abbrev_abort) (int memtupcount, SortSupport ssup);
+
+ /*
+ * Full, authoritative comparator for key that an abbreviated
+ * representation was generated for, used when an abbreviated comparison
+ * was inconclusive (by calling ApplySortAbbrevFullComparator()), or used
+ * to replace "comparator" when core system ultimately decides against
+ * abbreviation.
+ */
+ int (*abbrev_full_comparator) (Datum x, Datum y, SortSupport ssup);
} SortSupportData;
extern int ApplySortComparator(Datum datum1, bool isNull1,
Datum datum2, bool isNull2,
SortSupport ssup);
+extern int ApplySortAbbrevFullComparator(Datum datum1, bool isNull1,
+ Datum datum2, bool isNull2,
+ SortSupport ssup);
#endif /* !PG_USE_INLINE */
#if defined(PG_USE_INLINE) || defined(SORTSUPPORT_INCLUDE_DEFINITIONS)
/*
return compare;
}
+
+/*
+ * Three-way comparison of two datums using the full, authoritative
+ * comparator (abbrev_full_comparator) rather than "comparator".  NULLs are
+ * ordered according to ssup_nulls_first, and the result for two non-NULL
+ * datums is negated when ssup_reverse is set.
+ */
+STATIC_IF_INLINE int
+ApplySortAbbrevFullComparator(Datum datum1, bool isNull1,
+							  Datum datum2, bool isNull2,
+							  SortSupport ssup)
+{
+	int			result;
+
+	/* NULL "=" NULL */
+	if (isNull1 && isNull2)
+		return 0;
+
+	/* NULL vs. NOT_NULL: NULL sorts low only when nulls come first */
+	if (isNull1)
+		return ssup->ssup_nulls_first ? -1 : 1;
+
+	/* NOT_NULL vs. NULL: mirror image of the case above */
+	if (isNull2)
+		return ssup->ssup_nulls_first ? 1 : -1;
+
+	/* Both non-NULL: ask the authoritative comparator */
+	result = (*ssup->abbrev_full_comparator) (datum1, datum2, ssup);
+
+	return ssup->ssup_reverse ? -result : result;
+}
#endif /*-- PG_USE_INLINE || SORTSUPPORT_INCLUDE_DEFINITIONS */
/* Other functions in utils/sort/sortsupport.c */