Use abbreviated keys for faster sorting of text datums.

author Robert Haas <rhaas@postgresql.org>

Mon, 19 Jan 2015 20:20:31 +0000 (15:20 -0500)

committer Robert Haas <rhaas@postgresql.org>

Mon, 19 Jan 2015 20:28:27 +0000 (15:28 -0500)
author Robert Haas <rhaas@postgresql.org>
Mon, 19 Jan 2015 20:20:31 +0000 (15:20 -0500)
committer Robert Haas <rhaas@postgresql.org>
Mon, 19 Jan 2015 20:28:27 +0000 (15:28 -0500)
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c

index ce8b674e411f487d95ddeb96e6fb58a7e19cde4e..625f490af80367a18253e9d6255066ae0944ca6b 100644 (file)
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -717,6 +717,8 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
                         sortKey->ssup_nulls_first =
                                 (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
                         sortKey->ssup_attno = scanKey->sk_attno;
+                       /* Abbreviation is not supported here */
+                       sortKey->abbreviate = false;
  
                         AssertState(sortKey->ssup_attno != 0);
  
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c

index 5de2b39d103a398de0ce3ad6be3bf6e038746e6e..d2856a379e77028cf4f766c54729bb4bc807f034 100644 (file)
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -2301,6 +2301,12 @@ compute_scalar_stats(VacAttrStatsP stats,
         /* We always use the default collation for statistics */
         ssup.ssup_collation = DEFAULT_COLLATION_OID;
         ssup.ssup_nulls_first = false;
+       /*
+        * For now, don't perform abbreviated key conversion, because full values
+        * are required for MCV slot generation.  Supporting that optimization
+        * would necessitate teaching compare_scalars() to call a tie-breaker.
+        */
+       ssup.abbreviate = false;
  
         PrepareSortSupportFromOrderingOp(mystats->ltopr, &ssup);
  
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c

index 08088ea3c0c5ee9a36f14cb020253d08b9d3d93e..8079d97764318fdda3f939ae7165e79e220232df 100644 (file)
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -363,6 +363,10 @@ initialize_aggregates(AggState *aggstate,
                          * We use a plain Datum sorter when there's a single input column;
                          * otherwise sort the full tuple.  (See comments for
                          * process_ordered_aggregate_single.)
+                        *
+                        * In the future, we should consider forcing the
+                        * tuplesort_begin_heap() case when the abbreviated key
+                        * optimization can thereby be used, even when numInputs is 1.
                          */
                         peraggstate->sortstate =
                                 (peraggstate->numInputs == 1) ?
diff --git a/src/backend/executor/nodeMergeAppend.c b/src/backend/executor/nodeMergeAppend.c

index 4e200a8e34687c9a90c9133e0c929b145fb93d8c..0c814f0e72d175550c4fdb2bcf6c2da25cc42fe2 100644 (file)
--- a/src/backend/executor/nodeMergeAppend.c
+++ b/src/backend/executor/nodeMergeAppend.c
@@ -137,6 +137,15 @@ ExecInitMergeAppend(MergeAppend *node, EState *estate, int eflags)
                 sortKey->ssup_nulls_first = node->nullsFirst[i];
                 sortKey->ssup_attno = node->sortColIdx[i];
  
+               /*
+                * It isn't feasible to perform abbreviated key conversion, since
+                * tuples are pulled into mergestate's binary heap as needed.  It would
+                * likely be counter-productive to convert tuples into an abbreviated
+                * representation as they're pulled up, so opt out of that additional
+                * optimization entirely.
+                */
+               sortKey->abbreviate = false;
+
                 PrepareSortSupportFromOrderingOp(node->sortOperators[i], sortKey);
         }
  
diff --git a/src/backend/executor/nodeMergejoin.c b/src/backend/executor/nodeMergejoin.c

index 2a5a7acf945d2973eb088684f462026b7d300885..15742c574adf17e80fc3d36b174e02097f9907cd 100644 (file)
--- a/src/backend/executor/nodeMergejoin.c
+++ b/src/backend/executor/nodeMergejoin.c
@@ -229,6 +229,14 @@ MJExamineQuals(List *mergeclauses,
                         elog(ERROR, "cannot merge using non-equality operator %u",
                                  qual->opno);
  
+               /*
+                * sortsupport routine must know if abbreviation optimization is
+                * applicable in principle.  It is never applicable for merge joins
+                * because there is no convenient opportunity to convert to alternative
+                * representation.
+                */
+               clause->ssup.abbreviate = false;
+
                 /* And get the matching support or comparison function */
                 Assert(clause->ssup.comparator == NULL);
                 sortfunc = get_opfamily_proc(opfamily,
diff --git a/src/backend/lib/Makefile b/src/backend/lib/Makefile

index 949ee5e41d0e99ec87a6e352b2cfcc36ee0f7452..fe4781a8e88bd4790ebe4733ab38d444b1b60460 100644 (file)
--- a/src/backend/lib/Makefile
+++ b/src/backend/lib/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/lib
  top_builddir = ../../..
  include $(top_builddir)/src/Makefile.global
  
-OBJS = ilist.o binaryheap.o pairingheap.o rbtree.o stringinfo.o
+OBJS = ilist.o binaryheap.o hyperloglog.o pairingheap.o rbtree.o stringinfo.o
  
  include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/lib/hyperloglog.c b/src/backend/lib/hyperloglog.c

new file mode 100644 (file)

index 0000000..1157e9a
--- /dev/null
+++ b/src/backend/lib/hyperloglog.c
@@ -0,0 +1,228 @@
+/*-------------------------------------------------------------------------
+ *
+ * hyperloglog.c
+ *       HyperLogLog cardinality estimator
+ *
+ * Portions Copyright (c) 2014, PostgreSQL Global Development Group
+ *
+ * Based on Hideaki Ohno's C++ implementation.  This is probably not ideally
+ * suited to estimating the cardinality of very large sets;  in particular, we
+ * have not attempted to further optimize the implementation as described in
+ * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic
+ * Engineering of a State of The Art Cardinality Estimation Algorithm".
+ *
+ * A sparse representation of HyperLogLog state is used, with fixed space
+ * overhead.
+ *
+ * The copyright terms of Ohno's original version (the MIT license) follow.
+ *
+ * IDENTIFICATION
+ *       src/backend/lib/hyperloglog.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the 'Software'), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "postgres.h"
+
+#include <math.h>
+
+#include "lib/hyperloglog.h"
+
+#define POW_2_32                       (4294967296.0)
+#define NEG_POW_2_32           (-4294967296.0)
+
+static inline uint8 rho(uint32 x, uint8 b);
+
+/*
+ * Initialize HyperLogLog track state
+ *
+ * bwidth is bit width (so register size will be 2 to the power of bwidth).
+ * Must be between 4 and 16 inclusive.
+ */
+void
+initHyperLogLog(hyperLogLogState *cState, uint8 bwidth)
+{
+       double          alpha;
+
+       if (bwidth < 4 || bwidth > 16)
+               elog(ERROR, "bit width must be between 4 and 16 inclusive");
+
+       cState->registerWidth = bwidth;
+       cState->nRegisters = 1 << bwidth;
+       cState->arrSize = sizeof(uint8) * cState->nRegisters + 1;
+
+       /*
+        * Initialize hashes array to zero, not negative infinity, per discussion
+        * of the coupon collector problem in the HyperLogLog paper
+        */
+       cState->hashesArr = palloc0(cState->arrSize);
+
+       /*
+        * "alpha" is a value that for each possible number of registers (m) is
+        * used to correct a systematic multiplicative bias present in m ^ 2 Z (Z
+        * is "the indicator function" through which we finally compute E,
+        * estimated cardinality).
+        */
+       switch (cState->nRegisters)
+       {
+               case 16:
+                       alpha = 0.673;
+                       break;
+               case 32:
+                       alpha = 0.697;
+                       break;
+               case 64:
+                       alpha = 0.709;
+                       break;
+               default:
+                       alpha = 0.7213 / (1.0 + 1.079 / cState->nRegisters);
+       }
+
+       /*
+        * Precalculate alpha m ^ 2, later used to generate "raw" HyperLogLog
+        * estimate E
+        */
+       cState->alphaMM = alpha * cState->nRegisters * cState->nRegisters;
+}
+
+/*
+ * Adds element to the estimator, from caller-supplied hash.
+ *
+ * It is critical that the hash value passed be an actual hash value, typically
+ * generated using hash_any().  The algorithm relies on a specific bit-pattern
+ * observable in conjunction with stochastic averaging.  There must be a
+ * uniform distribution of bits in hash values for each distinct original value
+ * observed.
+ */
+void
+addHyperLogLog(hyperLogLogState *cState, uint32 hash)
+{
+       uint8           count;
+       uint32          index;
+
+       /* Use the first "k" (registerWidth) bits as a zero based index */
+       index = hash >> (BITS_PER_BYTE * sizeof(uint32) - cState->registerWidth);
+
+       /* Compute the rank of the remaining 32 - "k" (registerWidth) bits */
+       count = rho(hash << cState->registerWidth,
+                               BITS_PER_BYTE * sizeof(uint32) - cState->registerWidth);
+
+       cState->hashesArr[index] = Max(count, cState->hashesArr[index]);
+}
+
+/*
+ * Estimates cardinality, based on elements added so far
+ */
+double
+estimateHyperLogLog(hyperLogLogState *cState)
+{
+       double          result;
+       double          sum = 0.0;
+       int                     i;
+
+       for (i = 0; i < cState->nRegisters; i++)
+       {
+               sum += 1.0 / pow(2.0, cState->hashesArr[i]);
+       }
+
+       /* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */
+       result = cState->alphaMM / sum;
+
+       if (result <= (5.0 / 2.0) * cState->nRegisters)
+       {
+               /* Small range correction */
+               int     zero_count = 0;
+
+               for (i = 0; i < cState->nRegisters; i++)
+               {
+                       if (cState->hashesArr[i] == 0)
+                               zero_count++;
+               }
+
+               if (zero_count != 0)
+                       result = cState->nRegisters * log((double) cState->nRegisters /
+                                                                                         zero_count);
+       }
+       else if (result > (1.0 / 30.0) * POW_2_32)
+       {
+               /* Large range correction */
+               result = NEG_POW_2_32 * log(1.0 - (result / POW_2_32));
+       }
+
+       return result;
+}
+
+/*
+ * Merges the estimate from one HyperLogLog state to another, returning the
+ * estimate of their union.
+ *
+ * The number of registers in each must match.
+ */
+void
+mergeHyperLogLog(hyperLogLogState *cState, const hyperLogLogState *oState)
+{
+       int             r;
+
+       if (cState->nRegisters != oState->nRegisters)
+               elog(ERROR, "number of registers mismatch: %zu != %zu",
+                        cState->nRegisters, oState->nRegisters);
+
+       for (r = 0; r < cState->nRegisters; ++r)
+       {
+               cState->hashesArr[r] = Max(cState->hashesArr[r], oState->hashesArr[r]);
+       }
+}
+
+
+/*
+ * Worker for addHyperLogLog().
+ *
+ * Calculates the position of the first set bit in first b bits of x argument
+ * starting from the first, reading from most significant to least significant
+ * bits.
+ *
+ * Example (when considering fist 10 bits of x):
+ *
+ * rho(x = 0b1000000000)   returns 1
+ * rho(x = 0b0010000000)   returns 3
+ * rho(x = 0b0000000000)   returns b + 1
+ *
+ * "The binary address determined by the first b bits of x"
+ *
+ * Return value "j" used to index bit pattern to watch.
+ */
+static inline uint8
+rho(uint32 x, uint8 b)
+{
+       uint8   j = 1;
+
+       while (j <= b && !(x & 0x80000000))
+       {
+               j++;
+               x <<= 1;
+       }
+
+       return j;
+}
diff --git a/src/backend/utils/adt/orderedsetaggs.c b/src/backend/utils/adt/orderedsetaggs.c

index 869a83b185a17bc25a89c2739b407544f8782c9d..f9a5f7f93faf2446ffc59bc72163e489498f827b 100644 (file)
--- a/src/backend/utils/adt/orderedsetaggs.c
+++ b/src/backend/utils/adt/orderedsetaggs.c
@@ -266,7 +266,13 @@ ordered_set_startup(FunctionCallInfo fcinfo, bool use_tuples)
         osastate->qstate = qstate;
         osastate->gcontext = gcontext;
  
-       /* Initialize tuplesort object */
+       /*
+        * Initialize tuplesort object.
+        *
+        * In the future, we should consider forcing the tuplesort_begin_heap()
+        * case when the abbreviated key optimization can thereby be used, even
+        * when !use_tuples.
+        */
         if (use_tuples)
                 osastate->sortstate = tuplesort_begin_heap(qstate->tupdesc,
                                                                                                    qstate->numSortCols,
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c

index e95ed88366fd9265e26af26978d99410f39cb7a5..71d47380ac61ffff19cdfaa7090fdaceaedf925a 100644 (file)
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -17,9 +17,11 @@
  #include <ctype.h>
  #include <limits.h>
  
+#include "access/hash.h"
  #include "access/tuptoaster.h"
  #include "catalog/pg_collation.h"
  #include "catalog/pg_type.h"
+#include "lib/hyperloglog.h"
  #include "libpq/md5.h"
  #include "libpq/pqformat.h"
  #include "miscadmin.h"
@@ -32,6 +34,9 @@
  #include "utils/pg_locale.h"
  #include "utils/sortsupport.h"
  
+#ifdef DEBUG_ABBREV_KEYS
+#define DEBUG_elog_output      DEBUG1
+#endif
  
  /* GUC variable */
  int                    bytea_output = BYTEA_OUTPUT_HEX;
@@ -54,10 +59,12 @@ typedef struct
  
  typedef struct
  {
-       char                       *buf1;               /* 1st string */
-       char                       *buf2;               /* 2nd string */
+       char                       *buf1;               /* 1st string, or abbreviation original string buf */
+       char                       *buf2;               /* 2nd string, or abbreviation strxfrm() buf */
         int                                     buflen1;
         int                                     buflen2;
+       hyperLogLogState        abbr_card;      /* Abbreviated key cardinality state */
+       hyperLogLogState        full_card;      /* Full key cardinality state */
  #ifdef HAVE_LOCALE_T
         pg_locale_t locale;
  #endif
@@ -78,6 +85,9 @@ typedef struct
  static void btsortsupport_worker(SortSupport ssup, Oid collid);
  static int bttextfastcmp_c(Datum x, Datum y, SortSupport ssup);
  static int bttextfastcmp_locale(Datum x, Datum y, SortSupport ssup);
+static int bttextcmp_abbrev(Datum x, Datum y, SortSupport ssup);
+static Datum bttext_abbrev_convert(Datum original, SortSupport ssup);
+static bool bttext_abbrev_abort(int memtupcount, SortSupport ssup);
  static int32 text_length(Datum str);
  static text *text_catenate(text *t1, text *t2);
  static text *text_substring(Datum str,
@@ -1735,27 +1745,54 @@ btsortsupport_worker(SortSupport ssup, Oid collid)
  {
         TextSortSupport    *tss;
  
-       /*
-        * If LC_COLLATE = C, we can make things quite a bit faster by using
-        * memcmp() rather than strcoll().  To minimize the per-comparison
-        * overhead, we make this decision just once for the whole sort.
-        */
-       if (lc_collate_is_c(collid))
-       {
-               ssup->comparator = bttextfastcmp_c;
-               return;
-       }
-
         /*
          * WIN32 requires complex hacks when the database encoding is UTF-8 (except
          * when using the "C" collation).  For now, we don't optimize that case.
          */
  #ifdef WIN32
-       if (GetDatabaseEncoding() == PG_UTF8)
+       if (GetDatabaseEncoding() == PG_UTF8 && !lc_collate_is_c(collid))
                 return;
  #endif
  
         /*
+        * On platforms where the abbreviated key for text optimization might have
+        * bad worst case performance, it may be useful to avoid it entirely by
+        * disabling it at compile time.  Having only 4 byte datums could make
+        * worst-case performance drastically more likely, for example.  Moreover,
+        * Darwin's strxfrm() implementations is known to not effectively
+        * concentrate a significant amount of entropy from the original string in
+        * earlier transformed blobs.  It's possible that other supported platforms
+        * are similarly encumbered.
+        *
+        * Any reasonable implementation will pack primary weights into the start
+        * of returned blobs.  The canonical algorithm's implementation is
+        * discussed by Unicode Technical Standard #10 ("UNICODE COLLATION
+        * ALGORITHM"), section 4, "Main algorithm".  Section 4.3, "Form Sort Key"
+        * is of particular interest:
+        *
+        * http://www.unicode.org/reports/tr10/#Step_3
+        *
+        * The collation algorithm standard goes on to state:
+        *
+        * "By default, the algorithm makes use of three fully-customizable levels.
+        * For the Latin script, these levels correspond roughly to:
+        *
+        * alphabetic ordering
+        *
+        * diacritic ordering
+        *
+        * case ordering.
+        *
+        * A final level may be used for tie-breaking between strings not otherwise
+        * distinguished."
+        *
+        * It is generally expected that most non-equal keys will have their
+        * comparisons resolved at the primary level.  If enough comparisons can be
+        * resolved with just 4 or 8 byte abbreviated keys, this optimization is
+        * very effective (although if there are many tie-breakers that largely
+        * only perform cheap memcmp() calls, that is also much faster than the
+        * unoptimized case - see bttext_abbrev_abort()).
+        *
          * We may need a collation-sensitive comparison.  To make things faster,
          * we'll figure out the collation based on the locale id and cache the
          * result.  Also, since strxfrm()/strcoll() require NUL-terminated inputs,
@@ -1788,13 +1825,47 @@ btsortsupport_worker(SortSupport ssup, Oid collid)
  #endif
         }
  
-       tss->buf1 = palloc(TEXTBUFLEN);
-       tss->buflen1 = TEXTBUFLEN;
-       tss->buf2 = palloc(TEXTBUFLEN);
-       tss->buflen2 = TEXTBUFLEN;
+       /*
+        * If LC_COLLATE = C, we can make things quite a bit faster by using
+        * memcmp() rather than strcoll().  To minimize the per-comparison
+        * overhead, we make this decision just once for the whole sort.
+        *
+        * There is no reason to not at least perform fmgr elision on builds where
+        * abbreviation is disabled.
+        */
+       if (lc_collate_is_c(collid))
+               ssup->abbrev_full_comparator = ssup->comparator = bttextfastcmp_c;
+       else
+               ssup->abbrev_full_comparator = ssup->comparator = bttextfastcmp_locale;
+
+       if (!lc_collate_is_c(collid) || ssup->abbreviate)
+       {
+               /*
+                * Abbreviated case requires temp buffers for strxfrm() copying.
+                * bttextfastcmp_locale() also uses these buffers (even if abbreviation
+                * isn't used), while bttextfast_c() does not.
+                */
+               tss->buf1 = palloc(TEXTBUFLEN);
+               tss->buflen1 = TEXTBUFLEN;
+               tss->buf2 = palloc(TEXTBUFLEN);
+               tss->buflen2 = TEXTBUFLEN;
+               ssup->ssup_extra = tss;
+       }
+
+       if (!ssup->abbreviate)
+               return;
  
-       ssup->ssup_extra = tss;
-       ssup->comparator = bttextfastcmp_locale;
+       initHyperLogLog(&tss->abbr_card, 10);
+       initHyperLogLog(&tss->full_card, 10);
+
+       /*
+        * Change comparator to be abbreviation-based -- abbreviated version will
+        * probably ultimately be used during sorting proper, but core code may
+        * switch back to authoritative comparator should abbreviation be aborted
+        */
+       ssup->comparator = bttextcmp_abbrev;
+       ssup->abbrev_converter = bttext_abbrev_convert;
+       ssup->abbrev_abort = bttext_abbrev_abort;
  }
  
  /*
@@ -1903,6 +1974,225 @@ done:
         return result;
  }
  
+/*
+ * Abbreviated key comparison func
+ */
+static int
+bttextcmp_abbrev(Datum x, Datum y, SortSupport ssup)
+{
+       char   *a = (char *) &x;
+       char   *b = (char *) &y;
+       int     result;
+
+       result = memcmp(a, b, sizeof(Datum));
+
+       /*
+        * When result = 0, the core system will call bttextfastcmp_c() or
+        * bttextfastcmp_locale().  Even a strcmp() on two non-truncated strxfrm()
+        * blobs cannot indicate *equality* authoritatively, for the same reason
+        * that there is a strcoll() tie-breaker call to strcmp() in varstr_cmp().
+        */
+       return result;
+}
+
+/*
+ * Conversion routine for sortsupport.  Converts original text to abbreviated
+ * key representation.  Our encoding strategy is simple -- pack the first 8
+ * bytes of a strxfrm() blob into a Datum.
+ */
+static Datum
+bttext_abbrev_convert(Datum original, SortSupport ssup)
+{
+       TextSortSupport    *tss = (TextSortSupport *) ssup->ssup_extra;
+       text                       *authoritative = DatumGetTextPP(original);
+
+       /* working state */
+       Datum                           res;
+       char                       *pres;
+       int                                     len;
+       Size                            bsize;
+       uint32                          hash;
+
+       /*
+        * Abbreviated key representation is a pass-by-value Datum that is treated
+        * as a char array by the specialized comparator bttextcmp_abbrev().
+        */
+       pres = (char *) &res;
+       /* memset(), so any non-overwritten bytes are NUL */
+       memset(pres, 0, sizeof(Datum));
+       len = VARSIZE_ANY_EXHDR(authoritative);
+
+       /* By convention, we use buffer 1 to store and NUL-terminate text */
+       if (len >= tss->buflen1)
+       {
+               pfree(tss->buf1);
+               tss->buflen1 = Max(len + 1, Min(tss->buflen1 * 2, MaxAllocSize));
+               tss->buf1 = palloc(tss->buflen1);
+       }
+
+       /* Just like strcoll(), strxfrm() expects a NUL-terminated string */
+       memcpy(tss->buf1, VARDATA_ANY(authoritative), len);
+       tss->buf1[len] = '\0';
+
+       /* Don't leak memory here */
+       if (PointerGetDatum(authoritative) != original)
+               pfree(authoritative);
+
+retry:
+
+       /*
+        * There is no special handling of the C locale here, unlike with
+        * varstr_cmp().  strxfrm() is used indifferently.
+        */
+#ifdef HAVE_LOCALE_T
+       if (tss->locale)
+               bsize = strxfrm_l(tss->buf2, tss->buf1, tss->buflen2, tss->locale);
+       else
+#endif
+               bsize = strxfrm(tss->buf2, tss->buf1, tss->buflen2);
+
+       if (bsize >= tss->buflen2)
+       {
+               /*
+                * The C standard states that the contents of the buffer is now
+                * unspecified.  Grow buffer, and retry.
+                */
+               pfree(tss->buf2);
+               tss->buflen2 = Max(bsize + 1, Min(tss->buflen2 * 2, MaxAllocSize));
+               tss->buf2 = palloc(tss->buflen2);
+               goto retry;
+       }
+
+       /*
+        * Maintain approximate cardinality of both abbreviated keys and original,
+        * authoritative keys using HyperLogLog.  Used as cheap insurance against
+        * the worst case, where we do many string transformations for no saving in
+        * full strcoll()-based comparisons.  These statistics are used by
+        * bttext_abbrev_abort().
+        *
+        * First, Hash key proper, or a significant fraction of it.  Mix in length
+        * in order to compensate for cases where differences are past
+        * CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
+        */
+       hash = hash_any((unsigned char *) tss->buf1, Min(len, PG_CACHE_LINE_SIZE));
+
+       if (len > PG_CACHE_LINE_SIZE)
+               hash ^= DatumGetUInt32(hash_uint32((uint32) len));
+
+       addHyperLogLog(&tss->full_card, hash);
+
+       memcpy(pres, tss->buf2, Min(sizeof(Datum), bsize));
+
+       /* Hash abbreviated key */
+#if SIZEOF_DATUM == 8
+       {
+               uint32                          lohalf,
+                                                       hihalf;
+
+               lohalf = (uint32) res;
+               hihalf = (uint32) (res >> 32);
+               hash = hash_uint32(lohalf ^ hihalf);
+       }
+#else                                                  /* SIZEOF_DATUM != 8 */
+       hash = hash_uint32((uint32) res);
+#endif
+
+       addHyperLogLog(&tss->abbr_card, hash);
+
+       /*
+        * Every Datum byte is always compared.  This is safe because the strxfrm()
+        * blob is itself NUL terminated, leaving no danger of misinterpreting any
+        * NUL bytes not intended to be interpreted as logically representing
+        * termination.
+        */
+       return res;
+}
+
+/*
+ * Callback for estimating effectiveness of abbreviated key optimization, using
+ * heuristic rules.  Returns value indicating if the abbreviation optimization
+ * should be aborted, based on its projected effectiveness.
+ */
+static bool
+bttext_abbrev_abort(int memtupcount, SortSupport ssup)
+{
+       TextSortSupport    *tss = (TextSortSupport *) ssup->ssup_extra;
+       double                          abbrev_distinct, key_distinct;
+
+       Assert(ssup->abbreviate);
+
+       /* Have a little patience */
+       if (memtupcount < 20)
+               return false;
+
+       abbrev_distinct = estimateHyperLogLog(&tss->abbr_card);
+       key_distinct = estimateHyperLogLog(&tss->full_card);
+
+       /*
+        * Clamp cardinality estimates to at least one distinct value.  While NULLs
+        * are generally disregarded, if only NULL values were seen so far, that
+        * might misrepresent costs if we failed to clamp.
+        */
+       if (abbrev_distinct <= 1.0)
+               abbrev_distinct = 1.0;
+
+       if (key_distinct <= 1.0)
+               key_distinct = 1.0;
+
+       /*
+        * In the worst case all abbreviated keys are identical, while at the same
+        * time there are differences within full key strings not captured in
+        * abbreviations.
+        */
+#ifdef DEBUG_ABBREV_KEYS
+       {
+               double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
+
+               elog(DEBUG_elog_output, "abbrev_distinct after %d: %f (key_distinct: %f, norm_abbrev_card: %f)",
+                        memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card);
+       }
+#endif
+
+       /*
+        * If the number of distinct abbreviated keys approximately matches the
+        * number of distinct authoritative original keys, that's reason enough to
+        * proceed.  We can win even with a very low cardinality set if most
+        * tie-breakers only memcmp().  This is by far the most important
+        * consideration.
+        *
+        * While comparisons that are resolved at the abbreviated key level are
+        * considerably cheaper than tie-breakers resolved with memcmp(), both of
+        * those two outcomes are so much cheaper than a full strcoll() once
+        * sorting is underway that it doesn't seem worth it to weigh abbreviated
+        * cardinality against the overall size of the set in order to more
+        * accurately model costs.  Assume that an abbreviated comparison, and an
+        * abbreviated comparison with a cheap memcmp()-based authoritative
+        * resolution are equivalent.
+        */
+       if (abbrev_distinct > key_distinct * 0.05)
+               return false;
+
+       /*
+        * Abort abbreviation strategy.
+        *
+        * The worst case, where all abbreviated keys are identical while all
+        * original strings differ will typically only see a regression of about
+        * 10% in execution time for small to medium sized lists of strings.
+        * Whereas on modern CPUs where cache stalls are the dominant cost, we can
+        * often expect very large improvements, particularly with sets of strings
+        * of moderately high to high abbreviated cardinality.  There is little to
+        * lose but much to gain, which our strategy reflects.
+        */
+#ifdef DEBUG_ABBREV_KEYS
+       elog(DEBUG_elog_output, "would have aborted abbreviation due to worst-case at %d. abbrev_distinct: %f, key_distinct: %f",
+                memtupcount, abbrev_distinct, key_distinct);
+       /* Actually abort only when debugging is disabled */
+       return false;
+#endif
+
+       return true;
+}
+
  Datum
  text_larger(PG_FUNCTION_ARGS)
  {
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c

index 4aa39728416fdc50a0a2fa01cb5677e4064f69d9..6d3aa889bc5eb2395688ec53d319d4e486e0de6e 100644 (file)
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -150,7 +150,10 @@ bool               optimize_bounded_sort = true;
   * When sorting single Datums, the data value is represented directly by
   * datum1/isnull1.  If the datatype is pass-by-reference and isnull1 is false,
   * then datum1 points to a separately palloc'd data value that is also pointed
- * to by the "tuple" pointer; otherwise "tuple" is NULL.
+ * to by the "tuple" pointer; otherwise "tuple" is NULL.  There is one special
+ * case:  when the sort support infrastructure provides an "abbreviated key"
+ * representation, where the key is (typically) a pass by value proxy for a
+ * pass by reference type.
   *
   * While building initial runs, tupindex holds the tuple's run number.  During
   * merge passes, we re-use it to hold the input tape number that each tuple in
@@ -346,6 +349,14 @@ struct Tuplesortstate
          */
         SortSupport onlyKey;
  
+       /*
+        * Additional state for managing "abbreviated key" sortsupport routines
+        * (which currently may be used by all cases except the Datum sort case and
+        * hash index case).  Tracks the intervals at which the optimization's
+        * effectiveness is tested.
+        */
+       int64           abbrevNext;             /* Tuple # at which to next check applicability */
+
         /*
          * These variables are specific to the CLUSTER case; they are set by
          * tuplesort_begin_cluster.
@@ -442,6 +453,7 @@ struct Tuplesortstate
  
  static Tuplesortstate *tuplesort_begin_common(int workMem, bool randomAccess);
  static void puttuple_common(Tuplesortstate *state, SortTuple *tuple);
+static bool consider_abort_common(Tuplesortstate *state);
  static void inittapes(Tuplesortstate *state);
  static void selectnewtape(Tuplesortstate *state);
  static void mergeruns(Tuplesortstate *state);
@@ -619,6 +631,7 @@ tuplesort_begin_heap(TupleDesc tupDesc,
         state->readtup = readtup_heap;
  
         state->tupDesc = tupDesc;       /* assume we need not copy tupDesc */
+       state->abbrevNext = 10;
  
         /* Prepare SortSupport data for each column */
         state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData));
@@ -634,11 +647,19 @@ tuplesort_begin_heap(TupleDesc tupDesc,
                 sortKey->ssup_collation = sortCollations[i];
                 sortKey->ssup_nulls_first = nullsFirstFlags[i];
                 sortKey->ssup_attno = attNums[i];
+               /* Convey if abbreviation optimization is applicable in principle */
+               sortKey->abbreviate = (i == 0);
  
                 PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey);
         }
  
-       if (nkeys == 1)
+       /*
+        * The "onlyKey" optimization cannot be used with abbreviated keys, since
+        * tie-breaker comparisons may be required.  Typically, the optimization is
+        * only of value to pass-by-value types anyway, whereas abbreviated keys
+        * are typically only of value to pass-by-reference types.
+        */
+       if (nkeys == 1 && !state->sortKeys->abbrev_converter)
                 state->onlyKey = state->sortKeys;
  
         MemoryContextSwitchTo(oldcontext);
@@ -680,6 +701,7 @@ tuplesort_begin_cluster(TupleDesc tupDesc,
         state->copytup = copytup_cluster;
         state->writetup = writetup_cluster;
         state->readtup = readtup_cluster;
+       state->abbrevNext = 10;
  
         state->indexInfo = BuildIndexInfo(indexRel);
  
@@ -719,6 +741,8 @@ tuplesort_begin_cluster(TupleDesc tupDesc,
                 sortKey->ssup_nulls_first =
                         (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
                 sortKey->ssup_attno = scanKey->sk_attno;
+               /* Convey if abbreviation optimization is applicable in principle */
+               sortKey->abbreviate = (i == 0);
  
                 AssertState(sortKey->ssup_attno != 0);
  
@@ -768,6 +792,7 @@ tuplesort_begin_index_btree(Relation heapRel,
         state->copytup = copytup_index;
         state->writetup = writetup_index;
         state->readtup = readtup_index;
+       state->abbrevNext = 10;
  
         state->heapRel = heapRel;
         state->indexRel = indexRel;
@@ -791,6 +816,8 @@ tuplesort_begin_index_btree(Relation heapRel,
                 sortKey->ssup_nulls_first =
                         (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
                 sortKey->ssup_attno = scanKey->sk_attno;
+               /* Convey if abbreviation optimization is applicable in principle */
+               sortKey->abbreviate = (i == 0);
  
                 AssertState(sortKey->ssup_attno != 0);
  
@@ -883,6 +910,13 @@ tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation,
         state->onlyKey->ssup_cxt = CurrentMemoryContext;
         state->onlyKey->ssup_collation = sortCollation;
         state->onlyKey->ssup_nulls_first = nullsFirstFlag;
+       /*
+        * Conversion to abbreviated representation infeasible in the Datum case.
+        * It must be possible to subsequently fetch original datum values within
+        * tuplesort_getdatum(), which would require special-case preservation of
+        * original values.
+        */
+       state->onlyKey->abbreviate = false;
  
         PrepareSortSupportFromOrderingOp(sortOperator, state->onlyKey);
  
@@ -928,6 +962,19 @@ tuplesort_set_bound(Tuplesortstate *state, int64 bound)
  
         state->bounded = true;
         state->bound = (int) bound;
+
+       /*
+        * Bounded sorts are not an effective target for abbreviated key
+        * optimization.  Disable by setting state to be consistent with no
+        * abbreviation support.
+        */
+       state->sortKeys->abbrev_converter = NULL;
+       if (state->sortKeys->abbrev_full_comparator)
+               state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator;
+
+       /* Not strictly necessary, but be tidy */
+       state->sortKeys->abbrev_abort = NULL;
+       state->sortKeys->abbrev_full_comparator = NULL;
  }
  
  /*
@@ -1186,15 +1233,63 @@ tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel,
  {
         MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
         SortTuple       stup;
+       Datum           original;
+       IndexTuple      tuple;
  
         stup.tuple = index_form_tuple(RelationGetDescr(rel), values, isnull);
-       ((IndexTuple) stup.tuple)->t_tid = *self;
+       tuple = ((IndexTuple) stup.tuple);
+       tuple->t_tid = *self;
         USEMEM(state, GetMemoryChunkSpace(stup.tuple));
         /* set up first-column key value */
-       stup.datum1 = index_getattr((IndexTuple) stup.tuple,
-                                                               1,
-                                                               RelationGetDescr(state->indexRel),
-                                                               &stup.isnull1);
+       original = index_getattr(tuple,
+                                                        1,
+                                                        RelationGetDescr(state->indexRel),
+                                                        &stup.isnull1);
+
+       if (!state->sortKeys->abbrev_converter || stup.isnull1)
+       {
+               /*
+                * Store ordinary Datum representation, or NULL value.  If there is a
+                * converter it won't expect NULL values, and cost model is not
+                * required to account for NULL, so in that case we avoid calling
+                * converter and just set datum1 to "void" representation (to be
+                * consistent).
+                */
+               stup.datum1 = original;
+       }
+       else if (!consider_abort_common(state))
+       {
+               /* Store abbreviated key representation */
+               stup.datum1 = state->sortKeys->abbrev_converter(original,
+                                                                                                               state->sortKeys);
+       }
+       else
+       {
+               /* Abort abbreviation */
+               int             i;
+
+               stup.datum1 = original;
+
+               /*
+                * Set state to be consistent with never trying abbreviation.
+                *
+                * Alter datum1 representation in already-copied tuples, so as to
+                * ensure a consistent representation (current tuple was just handled).
+                * Note that we rely on all tuples copied so far actually being
+                * contained within memtuples array.
+                */
+               for (i = 0; i < state->memtupcount; i++)
+               {
+                       SortTuple *mtup = &state->memtuples[i];
+
+                       tuple = mtup->tuple;
+                       mtup->datum1 = index_getattr(tuple,
+                                                                                1,
+                                                                                RelationGetDescr(state->indexRel),
+                                                                                &stup.isnull1);
+               }
+       }
+
         puttuple_common(state, &stup);
  
         MemoryContextSwitchTo(oldcontext);
@@ -1359,6 +1454,47 @@ puttuple_common(Tuplesortstate *state, SortTuple *tuple)
         }
  }
  
+static bool
+consider_abort_common(Tuplesortstate *state)
+{
+       Assert(state->sortKeys[0].abbrev_converter != NULL);
+       Assert(state->sortKeys[0].abbrev_abort != NULL);
+       Assert(state->sortKeys[0].abbrev_full_comparator != NULL);
+
+       /*
+        * Check effectiveness of abbreviation optimization.  Consider aborting
+        * when still within memory limit.
+        */
+       if (state->status == TSS_INITIAL &&
+               state->memtupcount >= state->abbrevNext)
+       {
+               state->abbrevNext *= 2;
+
+               /*
+                * Check opclass-supplied abbreviation abort routine.  It may
+                * indicate that abbreviation should not proceed.
+                */
+               if (!state->sortKeys->abbrev_abort(state->memtupcount,
+                                                                                  state->sortKeys))
+                       return false;
+
+               /*
+                * Finally, restore authoritative comparator, and indicate that
+                * abbreviation is not in play by setting abbrev_converter to NULL
+                */
+               state->sortKeys[0].comparator = state->sortKeys[0].abbrev_full_comparator;
+               state->sortKeys[0].abbrev_converter = NULL;
+               /* Not strictly necessary, but be tidy */
+               state->sortKeys[0].abbrev_abort = NULL;
+               state->sortKeys[0].abbrev_full_comparator = NULL;
+
+               /* Give up - expect original pass-by-value representation */
+               return true;
+       }
+
+       return false;
+}
+
  /*
   * All tuples have been provided; finish the sort.
   */
@@ -2853,6 +2989,12 @@ comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
         TupleDesc       tupDesc;
         int                     nkey;
         int32           compare;
+       AttrNumber      attno;
+       Datum           datum1,
+                               datum2;
+       bool            isnull1,
+                               isnull2;
+
  
         /* Compare the leading sort key */
         compare = ApplySortComparator(a->datum1, a->isnull1,
@@ -2867,14 +3009,25 @@ comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
         rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
         rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET);
         tupDesc = state->tupDesc;
+
+       if (sortKey->abbrev_converter)
+       {
+               attno = sortKey->ssup_attno;
+
+               datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
+               datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
+
+               compare = ApplySortAbbrevFullComparator(datum1, isnull1,
+                                                                                               datum2, isnull2,
+                                                                                               sortKey);
+               if (compare != 0)
+                       return compare;
+       }
+
         sortKey++;
         for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++)
         {
-               AttrNumber      attno = sortKey->ssup_attno;
-               Datum           datum1,
-                                       datum2;
-               bool            isnull1,
-                                       isnull2;
+               attno = sortKey->ssup_attno;
  
                 datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
                 datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
@@ -2897,6 +3050,7 @@ copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup)
          * MinimalTuple using the exported interface for that.
          */
         TupleTableSlot *slot = (TupleTableSlot *) tup;
+       Datum                   original;
         MinimalTuple tuple;
         HeapTupleData htup;
  
@@ -2907,10 +3061,58 @@ copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup)
         /* set up first-column key value */
         htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
         htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
-       stup->datum1 = heap_getattr(&htup,
-                                                               state->sortKeys[0].ssup_attno,
-                                                               state->tupDesc,
-                                                               &stup->isnull1);
+       original = heap_getattr(&htup,
+                                                       state->sortKeys[0].ssup_attno,
+                                                       state->tupDesc,
+                                                       &stup->isnull1);
+
+       if (!state->sortKeys->abbrev_converter || stup->isnull1)
+       {
+               /*
+                * Store ordinary Datum representation, or NULL value.  If there is a
+                * converter it won't expect NULL values, and cost model is not
+                * required to account for NULL, so in that case we avoid calling
+                * converter and just set datum1 to "void" representation (to be
+                * consistent).
+                */
+               stup->datum1 = original;
+       }
+       else if (!consider_abort_common(state))
+       {
+               /* Store abbreviated key representation */
+               stup->datum1 = state->sortKeys->abbrev_converter(original,
+                                                                                                                state->sortKeys);
+       }
+       else
+       {
+               /* Abort abbreviation */
+               int             i;
+
+               stup->datum1 = original;
+
+               /*
+                * Set state to be consistent with never trying abbreviation.
+                *
+                * Alter datum1 representation in already-copied tuples, so as to
+                * ensure a consistent representation (current tuple was just handled).
+                * Note that we rely on all tuples copied so far actually being
+                * contained within memtuples array.
+                */
+               for (i = 0; i < state->memtupcount; i++)
+               {
+                       SortTuple *mtup = &state->memtuples[i];
+
+                       htup.t_len = ((MinimalTuple) mtup->tuple)->t_len +
+                                                       MINIMAL_TUPLE_OFFSET;
+                       htup.t_data = (HeapTupleHeader) ((char *) mtup->tuple -
+                                                       MINIMAL_TUPLE_OFFSET);
+
+                       mtup->datum1 = heap_getattr(&htup,
+                                                                               state->sortKeys[0].ssup_attno,
+                                                                               state->tupDesc,
+                                                                               &mtup->isnull1);
+               }
+       }
  }
  
  static void
@@ -2980,13 +3182,35 @@ comparetup_cluster(const SortTuple *a, const SortTuple *b,
         TupleDesc       tupDesc;
         int                     nkey;
         int32           compare;
+       Datum           datum1,
+                               datum2;
+       bool            isnull1,
+                               isnull2;
+       AttrNumber      leading = state->indexInfo->ii_KeyAttrNumbers[0];
+
+       /* Be prepared to compare additional sort keys */
+       ltup = (HeapTuple) a->tuple;
+       rtup = (HeapTuple) b->tuple;
+       tupDesc = state->tupDesc;
  
         /* Compare the leading sort key, if it's simple */
-       if (state->indexInfo->ii_KeyAttrNumbers[0] != 0)
+       if (leading != 0)
         {
                 compare = ApplySortComparator(a->datum1, a->isnull1,
                                                                           b->datum1, b->isnull1,
                                                                           sortKey);
+               if (compare != 0)
+                       return compare;
+
+               if (sortKey->abbrev_converter)
+               {
+                       datum1 = heap_getattr(ltup, leading, tupDesc, &isnull1);
+                       datum2 = heap_getattr(rtup, leading, tupDesc, &isnull2);
+
+                       compare = ApplySortAbbrevFullComparator(datum1, isnull1,
+                                                                                                       datum2, isnull2,
+                                                                                                       sortKey);
+               }
                 if (compare != 0 || state->nKeys == 1)
                         return compare;
                 /* Compare additional columns the hard way */
@@ -2999,22 +3223,13 @@ comparetup_cluster(const SortTuple *a, const SortTuple *b,
                 nkey = 0;
         }
  
-       /* Compare additional sort keys */
-       ltup = (HeapTuple) a->tuple;
-       rtup = (HeapTuple) b->tuple;
-
         if (state->indexInfo->ii_Expressions == NULL)
         {
                 /* If not expression index, just compare the proper heap attrs */
-               tupDesc = state->tupDesc;
  
                 for (; nkey < state->nKeys; nkey++, sortKey++)
                 {
                         AttrNumber      attno = state->indexInfo->ii_KeyAttrNumbers[nkey];
-                       Datum           datum1,
-                                               datum2;
-                       bool            isnull1,
-                                               isnull2;
  
                         datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1);
                         datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2);
@@ -3072,17 +3287,67 @@ static void
  copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup)
  {
         HeapTuple       tuple = (HeapTuple) tup;
+       Datum           original;
  
         /* copy the tuple into sort storage */
         tuple = heap_copytuple(tuple);
         stup->tuple = (void *) tuple;
         USEMEM(state, GetMemoryChunkSpace(tuple));
-       /* set up first-column key value, if it's a simple column */
-       if (state->indexInfo->ii_KeyAttrNumbers[0] != 0)
-               stup->datum1 = heap_getattr(tuple,
-                                                                       state->indexInfo->ii_KeyAttrNumbers[0],
-                                                                       state->tupDesc,
-                                                                       &stup->isnull1);
+       /*
+        * set up first-column key value, and potentially abbreviate, if it's a
+        * simple column
+        */
+       if (state->indexInfo->ii_KeyAttrNumbers[0] == 0)
+               return;
+
+       original = heap_getattr(tuple,
+                                                       state->indexInfo->ii_KeyAttrNumbers[0],
+                                                       state->tupDesc,
+                                                       &stup->isnull1);
+
+       if (!state->sortKeys->abbrev_converter || stup->isnull1)
+       {
+               /*
+                * Store ordinary Datum representation, or NULL value.  If there is a
+                * converter it won't expect NULL values, and cost model is not
+                * required to account for NULL, so in that case we avoid calling
+                * converter and just set datum1 to "void" representation (to be
+                * consistent).
+                */
+               stup->datum1 = original;
+       }
+       else if (!consider_abort_common(state))
+       {
+               /* Store abbreviated key representation */
+               stup->datum1 = state->sortKeys->abbrev_converter(original,
+                                                                                                                state->sortKeys);
+       }
+       else
+       {
+               /* Abort abbreviation */
+               int             i;
+
+               stup->datum1 = original;
+
+               /*
+                * Set state to be consistent with never trying abbreviation.
+                *
+                * Alter datum1 representation in already-copied tuples, so as to
+                * ensure a consistent representation (current tuple was just handled).
+                * Note that we rely on all tuples copied so far actually being
+                * contained within memtuples array.
+                */
+               for (i = 0; i < state->memtupcount; i++)
+               {
+                       SortTuple *mtup = &state->memtuples[i];
+
+                       tuple = (HeapTuple) mtup->tuple;
+                       mtup->datum1 = heap_getattr(tuple,
+                                                                               state->indexInfo->ii_KeyAttrNumbers[0],
+                                                                               state->tupDesc,
+                                                                               &stup->isnull1);
+               }
+       }
  }
  
  static void
@@ -3162,6 +3427,11 @@ comparetup_index_btree(const SortTuple *a, const SortTuple *b,
         bool            equal_hasnull = false;
         int                     nkey;
         int32           compare;
+       Datum           datum1,
+                               datum2;
+       bool            isnull1,
+                               isnull2;
+
  
         /* Compare the leading sort key */
         compare = ApplySortComparator(a->datum1, a->isnull1,
@@ -3170,23 +3440,31 @@ comparetup_index_btree(const SortTuple *a, const SortTuple *b,
         if (compare != 0)
                 return compare;
  
-       /* they are equal, so we only need to examine one null flag */
-       if (a->isnull1)
-               equal_hasnull = true;
-
         /* Compare additional sort keys */
         tuple1 = (IndexTuple) a->tuple;
         tuple2 = (IndexTuple) b->tuple;
         keysz = state->nKeys;
         tupDes = RelationGetDescr(state->indexRel);
+
+       if (sortKey->abbrev_converter)
+       {
+               datum1 = index_getattr(tuple1, 1, tupDes, &isnull1);
+               datum2 = index_getattr(tuple2, 1, tupDes, &isnull2);
+
+               compare = ApplySortAbbrevFullComparator(datum1, isnull1,
+                                                                                               datum2, isnull2,
+                                                                                               sortKey);
+               if (compare != 0)
+                       return compare;
+       }
+
+       /* they are equal, so we only need to examine one null flag */
+       if (a->isnull1)
+               equal_hasnull = true;
+
         sortKey++;
         for (nkey = 2; nkey <= keysz; nkey++, sortKey++)
         {
-               Datum           datum1,
-                                       datum2;
-               bool            isnull1,
-                                       isnull2;
-
                 datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1);
                 datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2);
  
@@ -3313,6 +3591,7 @@ copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup)
         IndexTuple      tuple = (IndexTuple) tup;
         unsigned int tuplen = IndexTupleSize(tuple);
         IndexTuple      newtuple;
+       Datum           original;
  
         /* copy the tuple into sort storage */
         newtuple = (IndexTuple) palloc(tuplen);
@@ -3320,10 +3599,54 @@ copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup)
         USEMEM(state, GetMemoryChunkSpace(newtuple));
         stup->tuple = (void *) newtuple;
         /* set up first-column key value */
-       stup->datum1 = index_getattr(newtuple,
-                                                                1,
-                                                                RelationGetDescr(state->indexRel),
-                                                                &stup->isnull1);
+       original = index_getattr(newtuple,
+                                                        1,
+                                                        RelationGetDescr(state->indexRel),
+                                                        &stup->isnull1);
+
+       if (!state->sortKeys->abbrev_converter || stup->isnull1)
+       {
+               /*
+                * Store ordinary Datum representation, or NULL value.  If there is a
+                * converter it won't expect NULL values, and cost model is not
+                * required to account for NULL, so in that case we avoid calling
+                * converter and just set datum1 to "void" representation (to be
+                * consistent).
+                */
+               stup->datum1 = original;
+       }
+       else if (!consider_abort_common(state))
+       {
+               /* Store abbreviated key representation */
+               stup->datum1 = state->sortKeys->abbrev_converter(original,
+                                                                                                                state->sortKeys);
+       }
+       else
+       {
+               /* Abort abbreviation */
+               int             i;
+
+               stup->datum1 = original;
+
+               /*
+                * Set state to be consistent with never trying abbreviation.
+                *
+                * Alter datum1 representation in already-copied tuples, so as to
+                * ensure a consistent representation (current tuple was just handled).
+                * Note that we rely on all tuples copied so far actually being
+                * contained within memtuples array.
+                */
+               for (i = 0; i < state->memtupcount; i++)
+               {
+                       SortTuple *mtup = &state->memtuples[i];
+
+                       tuple = (IndexTuple) mtup->tuple;
+                       mtup->datum1 = index_getattr(tuple,
+                                                                                1,
+                                                                                RelationGetDescr(state->indexRel),
+                                                                                &stup->isnull1);
+               }
+       }
  }
  
  static void
diff --git a/src/include/lib/hyperloglog.h b/src/include/lib/hyperloglog.h

new file mode 100644 (file)

index 0000000..a6cbffc
--- /dev/null
+++ b/src/include/lib/hyperloglog.h
@@ -0,0 +1,67 @@
+/*
+ * hyperloglog.h
+ *
+ * A simple HyperLogLog cardinality estimator implementation
+ *
+ * Portions Copyright (c) 2014, PostgreSQL Global Development Group
+ *
+ * Based on Hideaki Ohno's C++ implementation.  The copyright terms of Ohno's
+ * original version (the MIT license) follow.
+ *
+ * src/include/lib/hyperloglog.h
+ */
+
+/*
+ * Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the 'Software'), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef HYPERLOGLOG_H
+#define HYPERLOGLOG_H
+
+/*
+ * HyperLogLog is an approximate technique for computing the number of distinct
+ * entries in a set.  Importantly, it does this by using a fixed amount of
+ * memory.  See the 2007 paper "HyperLogLog: the analysis of a near-optimal
+ * cardinality estimation algorithm" for more.
+ *
+ * hyperLogLogState
+ *
+ *             registerWidth           register width, in bits ("k")
+ *             nRegisters                      number of registers
+ *             alphaMM                         alpha * m ^ 2 (see initHyperLogLog())
+ *             hashesArr                       array of hashes
+ *             arrSize                         size of hashesArr
+ */
+typedef struct hyperLogLogState
+{
+       uint8           registerWidth;
+       Size            nRegisters;
+       double          alphaMM;
+       uint8      *hashesArr;
+       Size            arrSize;
+} hyperLogLogState;
+
+extern void initHyperLogLog(hyperLogLogState *cState, uint8 bwidth);
+extern void    addHyperLogLog(hyperLogLogState *cState, uint32 hash);
+extern double estimateHyperLogLog(hyperLogLogState *cState);
+extern void mergeHyperLogLog(hyperLogLogState *cState, const hyperLogLogState *oState);
+
+#endif   /* HYPERLOGLOG_H */
diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h

index cb35697ad0745d2223feedfbdaba053859e42bb0..5cfc0ae1e80cefb93e991c87889721d5ea33871a 100644 (file)
--- a/src/include/pg_config_manual.h
+++ b/src/include/pg_config_manual.h
@@ -214,13 +214,13 @@
  #endif
  
  /*
- * Assumed cache line size. This doesn't affect correctness, but can be
- * used for low-level optimizations. Currently, this is only used to pad
- * some data structures in xlog.c, to ensure that highly-contended fields
- * are on different cache lines. Too small a value can hurt performance due
- * to false sharing, while the only downside of too large a value is a few
- * bytes of wasted memory. The default is 128, which should be large enough
- * for all supported platforms.
+ * Assumed cache line size. This doesn't affect correctness, but can be used
+ * for low-level optimizations. Currently, this is used to pad some data
+ * structures in xlog.c, to ensure that highly-contended fields are on
+ * different cache lines. Too small a value can hurt performance due to false
+ * sharing, while the only downside of too large a value is a few bytes of
+ * wasted memory. The default is 128, which should be large enough for all
+ * supported platforms.
   */
  #define PG_CACHE_LINE_SIZE             128
  
diff --git a/src/include/utils/sortsupport.h b/src/include/utils/sortsupport.h

index f379bb47029a4e3354170f8de744c52b113f3268..62fedfaaad452d1d97db64cffc909457b19c84fb 100644 (file)
--- a/src/include/utils/sortsupport.h
+++ b/src/include/utils/sortsupport.h
@@ -21,7 +21,12 @@
   * required to provide all of them.  The BTSORTSUPPORT function should
   * simply not set any function pointers for mechanisms it doesn't support.
   * Opclasses that provide BTSORTSUPPORT and don't provide a comparator
- * function will have a shim set up by sort support automatically.
+ * function will have a shim set up by sort support automatically.  However,
+ * opclasses that support the optional additional abbreviated key capability
+ * must always provide an authoritative comparator used to tie-break
+ * inconclusive abbreviated comparisons and also used  when aborting
+ * abbreviation.  Furthermore, a converter and abort/costing function must be
+ * provided.
   *
   * All sort support functions will be passed the address of the
   * SortSupportData struct when called, so they can use it to store
@@ -93,12 +98,96 @@ typedef struct SortSupportData
          * than, equal to, or greater than y.  Note that x and y are guaranteed
          * not null, and there is no way to return null either.  Do not return
          * INT_MIN, as callers are allowed to negate the result before using it.
+        *
+        * This may be either the authoritative comparator, or the abbreviated
+        * comparator.  Core code may switch this over the initial preference of an
+        * opclass support function despite originally indicating abbreviation was
+        * applicable, by assigning the authoritative comparator back.
          */
         int                     (*comparator) (Datum x, Datum y, SortSupport ssup);
  
         /*
-        * Additional sort-acceleration functions might be added here later.
+        * "Abbreviated key" infrastructure follows.
+        *
+        * All callbacks must be set by sortsupport opclasses that make use of this
+        * optional additional infrastructure (unless for whatever reasons the
+        * opclass doesn't proceed with abbreviation, in which case
+        * abbrev_converter must not be set).
+        *
+        * This allows opclass authors to supply a conversion routine, used to
+        * create an alternative representation of the underlying type (an
+        * "abbreviated key").  Typically, this representation is an ad-hoc,
+        * pass-by-value Datum format that only the opclass has knowledge of.  An
+        * alternative comparator, used only with this alternative representation
+        * must also be provided (which is assigned to "comparator").  This
+        * representation is a simple approximation of the original Datum.  It must
+        * be possible to compare datums of this representation with each other
+        * using the supplied alternative comparator, and have any non-zero return
+        * value be a reliable proxy for what a proper comparison would indicate.
+        * Returning zero from the alternative comparator does not indicate
+        * equality, as with a conventional support routine 1, though -- it
+        * indicates that it wasn't possible to determine how the two abbreviated
+        * values compared.  A proper comparison, using "abbrev_full_comparator"/
+        * ApplySortAbbrevFullComparator() is therefore required.  In many cases
+        * this results in most or all comparisons only using the cheap alternative
+        * comparison func, which is typically implemented as code that compiles to
+        * just a few CPU instructions.  CPU cache miss penalties are expensive; to
+        * get good overall performance, sort infrastructure must heavily weigh
+        * cache performance.
+        *
+        * Opclass authors must consider the final cardinality of abbreviated keys
+        * when devising an encoding scheme.  It's possible for a strategy to work
+        * better than an alternative strategy with one usage pattern, while the
+        * reverse might be true for another usage pattern.  All of these factors
+        * must be considered.
          */
+
+       /*
+        * "abbreviate" concerns whether or not the abbreviated key optimization is
+        * applicable in principle (that is, the sortsupport routine needs to know
+        * if its dealing with a key where an abbreviated representation can
+        * usefully be packed together.  Conventionally, this is the leading
+        * attribute key).  Note, however, that in order to determine that
+        * abbreviation is not in play, the core code always checks whether or not
+        * the opclass has set abbrev_converter.  This is a one way, one time
+        * message to the opclass.
+        */
+       bool                    abbreviate;
+
+       /*
+        * Converter to abbreviated format, from original representation.  Core
+        * code uses this callback to convert from a pass-by-reference "original"
+        * Datum to a pass-by-value abbreviated key Datum.  Note that original is
+        * guaranteed NOT NULL, because it doesn't make sense to factor NULLness
+        * into ad-hoc cost model.
+        *
+        * abbrev_converter is tested to see if abbreviation is in play.  Core code
+        * may set it to NULL to indicate abbreviation should not be used (which is
+        * something sortsupport routines need not concern themselves with).
+        * However, sortsupport routines must not set it when it is immediately
+        * established that abbreviation should not proceed (for abbreviation
+        * calls, or platform-specific impediments to using abbreviation).
+        */
+       Datum                   (*abbrev_converter) (Datum original, SortSupport ssup);
+
+       /*
+        * abbrev_abort callback allows clients to verify that the current strategy
+        * is working out, using a sortsupport routine defined ad-hoc cost model.
+        * If there is a lot of duplicate abbreviated keys in practice, it's useful
+        * to be able to abandon the strategy before paying too high a cost in
+        * conversion (perhaps certain opclass-specific adaptations are useful
+        * too).
+        */
+       bool                    (*abbrev_abort) (int memtupcount, SortSupport ssup);
+
+       /*
+        * Full, authoritative comparator for key that an abbreviated
+        * representation was generated for, used when an abbreviated comparison
+        * was inconclusive (by calling ApplySortComparatorFull()), or used to
+        * replace "comparator" when core system ultimately decides against
+        * abbreviation.
+        */
+       int                     (*abbrev_full_comparator) (Datum x, Datum y, SortSupport ssup);
  } SortSupportData;
  
  
@@ -110,6 +199,9 @@ typedef struct SortSupportData
  extern int ApplySortComparator(Datum datum1, bool isNull1,
                                         Datum datum2, bool isNull2,
                                         SortSupport ssup);
+extern int ApplySortAbbrevFullComparator(Datum datum1, bool isNull1,
+                                                       Datum datum2, bool isNull2,
+                                                       SortSupport ssup);
  #endif   /* !PG_USE_INLINE */
  #if defined(PG_USE_INLINE) || defined(SORTSUPPORT_INCLUDE_DEFINITIONS)
  /*
@@ -148,6 +240,44 @@ ApplySortComparator(Datum datum1, bool isNull1,
  
         return compare;
  }
+
+/*
+ * Apply a sort comparator function and return a 3-way comparison using full,
+ * authoritative comparator.  This takes care of handling reverse-sort and
+ * NULLs-ordering properly.
+ */
+STATIC_IF_INLINE int
+ApplySortAbbrevFullComparator(Datum datum1, bool isNull1,
+                                                         Datum datum2, bool isNull2,
+                                                         SortSupport ssup)
+{
+       int                     compare;
+
+       if (isNull1)
+       {
+               if (isNull2)
+                       compare = 0;            /* NULL "=" NULL */
+               else if (ssup->ssup_nulls_first)
+                       compare = -1;           /* NULL "<" NOT_NULL */
+               else
+                       compare = 1;            /* NULL ">" NOT_NULL */
+       }
+       else if (isNull2)
+       {
+               if (ssup->ssup_nulls_first)
+                       compare = 1;            /* NOT_NULL ">" NULL */
+               else
+                       compare = -1;           /* NOT_NULL "<" NULL */
+       }
+       else
+       {
+               compare = (*ssup->abbrev_full_comparator) (datum1, datum2, ssup);
+               if (ssup->ssup_reverse)
+                       compare = -compare;
+       }
+
+       return compare;
+}
  #endif   /*-- PG_USE_INLINE || SORTSUPPORT_INCLUDE_DEFINITIONS */
  
  /* Other functions in utils/sort/sortsupport.c */
author	Robert Haas <rhaas@postgresql.org>
	Mon, 19 Jan 2015 20:20:31 +0000 (15:20 -0500)
committer	Robert Haas <rhaas@postgresql.org>
	Mon, 19 Jan 2015 20:28:27 +0000 (15:28 -0500)
src/backend/access/nbtree/nbtsort.c		patch \| blob \| history
src/backend/commands/analyze.c		patch \| blob \| history
src/backend/executor/nodeAgg.c		patch \| blob \| history
src/backend/executor/nodeMergeAppend.c		patch \| blob \| history
src/backend/executor/nodeMergejoin.c		patch \| blob \| history
src/backend/lib/Makefile		patch \| blob \| history
src/backend/lib/hyperloglog.c	[new file with mode: 0644]	patch \| blob
src/backend/utils/adt/orderedsetaggs.c		patch \| blob \| history
src/backend/utils/adt/varlena.c		patch \| blob \| history
src/backend/utils/sort/tuplesort.c		patch \| blob \| history
src/include/lib/hyperloglog.h	[new file with mode: 0644]	patch \| blob
src/include/pg_config_manual.h		patch \| blob \| history
src/include/utils/sortsupport.h		patch \| blob \| history