Consider secondary factors during nbtree splits.

author Peter Geoghegan <pg@bowt.ie>

Wed, 20 Mar 2019 17:12:19 +0000 (10:12 -0700)

committer Peter Geoghegan <pg@bowt.ie>

Wed, 20 Mar 2019 17:12:19 +0000 (10:12 -0700)
author Peter Geoghegan <pg@bowt.ie>
Wed, 20 Mar 2019 17:12:19 +0000 (10:12 -0700)
committer Peter Geoghegan <pg@bowt.ie>
Wed, 20 Mar 2019 17:12:19 +0000 (10:12 -0700)
diff --git a/src/backend/access/nbtree/Makefile b/src/backend/access/nbtree/Makefile

index bbb21d235c066a83f84c70bd8d0e6f27059c30e6..9aab9cf64ac819cf5f62349e16465dc77d699f34 100644 (file)
--- a/src/backend/access/nbtree/Makefile
+++ b/src/backend/access/nbtree/Makefile
@@ -13,6 +13,6 @@ top_builddir = ../../../..
  include $(top_builddir)/src/Makefile.global
  
  OBJS = nbtcompare.o nbtinsert.o nbtpage.o nbtree.o nbtsearch.o \
-       nbtutils.o nbtsort.o nbtvalidate.o nbtxlog.o
+       nbtsplitloc.o nbtutils.o nbtsort.o nbtvalidate.o nbtxlog.o
  
  include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README

index b93b546d225e1661abc3769bcab70b8e77df93c2..266c1cd4cd9f1d652edbd732877a5ded3c7b4151 100644 (file)
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -143,9 +143,9 @@ Lehman and Yao assume fixed-size keys, but we must deal with
  variable-size keys.  Therefore there is not a fixed maximum number of
  keys per page; we just stuff in as many as will fit.  When we split a
  page, we try to equalize the number of bytes, not items, assigned to
-each of the resulting pages.  Note we must include the incoming item in
-this calculation, otherwise it is possible to find that the incoming
-item doesn't fit on the split page where it needs to go!
+pages (though suffix truncation is also considered).  Note we must include
+the incoming item in this calculation, otherwise it is possible to find
+that the incoming item doesn't fit on the split page where it needs to go!
  
  The Deletion Algorithm
  ----------------------
@@ -649,6 +649,47 @@ variable-length types, such as text.  An opclass support function could
  manufacture the shortest possible key value that still correctly separates
  each half of a leaf page split.
  
+There are sophisticated criteria for choosing a leaf page split point.  The
+general idea is to make suffix truncation effective without unduly
+influencing the balance of space for each half of the page split.  The
+choice of leaf split point can be thought of as a choice among points
+*between* items on the page to be split, at least if you pretend that the
+incoming tuple was placed on the page already (you have to pretend because
+there won't actually be enough space for it on the page).  Choosing the
+split point between two index tuples where the first non-equal attribute
+appears as early as possible results in truncating away as many suffix
+attributes as possible.  Evenly balancing space among each half of the
+split is usually the first concern, but even small adjustments in the
+precise split point can allow truncation to be far more effective.
+
+Suffix truncation is primarily valuable because it makes pivot tuples
+smaller, which delays splits of internal pages, but that isn't the only
+reason why it's effective.  Even truncation that doesn't make pivot tuples
+smaller due to alignment still prevents pivot tuples from being more
+restrictive than truly necessary in how they describe which values belong
+on which pages.
+
+While it's not possible to correctly perform suffix truncation during
+internal page splits, it's still useful to be discriminating when splitting
+an internal page.  Among the split points within an acceptable range of
+the fillfactor-wise optimal split point, we choose the one that implies
+the smallest downlink to be inserted in the parent.  This idea also comes
+from the Prefix B-Tree paper.  This process has much in common with what
+happens at the leaf level to make suffix truncation effective.  The overall
+effect is that suffix truncation tends to produce smaller, more
+discriminating pivot tuples, especially early in the lifetime of the index,
+while biasing internal page splits makes the earlier, smaller pivot tuples
+end up in the root page, delaying root page splits.
+
+Logical duplicates are given special consideration.  The logic for
+selecting a split point goes to great lengths to avoid having duplicates
+span more than one page, and almost always manages to pick a split point
+between two user-key-distinct tuples, accepting a completely lopsided split
+if it must.  When a page that's already full of duplicates must be split,
+the fallback strategy assumes that duplicates are mostly inserted in
+ascending heap TID order.  The page is split in a way that leaves the left
+half of the page mostly full, and the right half of the page mostly empty.
+
  Notes About Data Representation
  -------------------------------
  
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c

index b2ee0adb502633ebd084c25946183bbb2bd2d080..2c98405aac889108e62e4b4d6c1f48659953bdf6 100644 (file)
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -28,26 +28,6 @@
  /* Minimum tree height for application of fastpath optimization */
  #define BTREE_FASTPATH_MIN_LEVEL       2
  
-typedef struct
-{
-       /* context data for _bt_checksplitloc */
-       Size            newitemsz;              /* size of new item to be inserted */
-       int                     fillfactor;             /* needed when splitting rightmost page */
-       bool            is_leaf;                /* T if splitting a leaf page */
-       bool            is_rightmost;   /* T if splitting a rightmost page */
-       OffsetNumber newitemoff;        /* where the new item is to be inserted */
-       int                     leftspace;              /* space available for items on left page */
-       int                     rightspace;             /* space available for items on right page */
-       int                     olddataitemstotal;      /* space taken by old items */
-
-       bool            have_split;             /* found a valid split? */
-
-       /* these fields valid only if have_split is true */
-       bool            newitemonleft;  /* new item on left or right of best split */
-       OffsetNumber firstright;        /* best split point */
-       int                     best_delta;             /* best size delta so far */
-} FindSplitData;
-
  
  static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
  
@@ -73,13 +53,6 @@ static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf,
                   Size newitemsz, IndexTuple newitem, bool newitemonleft);
  static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
                                   BTStack stack, bool is_root, bool is_only);
-static OffsetNumber _bt_findsplitloc(Relation rel, Page page,
-                                OffsetNumber newitemoff,
-                                Size newitemsz,
-                                bool *newitemonleft);
-static void _bt_checksplitloc(FindSplitData *state,
-                                 OffsetNumber firstoldonright, bool newitemonleft,
-                                 int dataitemstoleft, Size firstoldonrightsz);
  static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
                          OffsetNumber itup_off);
  static bool _bt_isequal(TupleDesc itupdesc, BTScanInsert itup_key,
@@ -1003,7 +976,7 @@ _bt_insertonpg(Relation rel,
  
                 /* Choose the split point */
                 firstright = _bt_findsplitloc(rel, page,
-                                                                         newitemoff, itemsz,
+                                                                         newitemoff, itemsz, itup,
                                                                           &newitemonleft);
  
                 /* split the buffer into left and right halves */
@@ -1687,264 +1660,6 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
         return rbuf;
  }
  
-/*
- *     _bt_findsplitloc() -- find an appropriate place to split a page.
- *
- * The idea here is to equalize the free space that will be on each split
- * page, *after accounting for the inserted tuple*.  (If we fail to account
- * for it, we might find ourselves with too little room on the page that
- * it needs to go into!)
- *
- * If the page is the rightmost page on its level, we instead try to arrange
- * to leave the left split page fillfactor% full.  In this way, when we are
- * inserting successively increasing keys (consider sequences, timestamps,
- * etc) we will end up with a tree whose pages are about fillfactor% full,
- * instead of the 50% full result that we'd get without this special case.
- * This is the same as nbtsort.c produces for a newly-created tree.  Note
- * that leaf and nonleaf pages use different fillfactors.
- *
- * We are passed the intended insert position of the new tuple, expressed as
- * the offsetnumber of the tuple it must go in front of.  (This could be
- * maxoff+1 if the tuple is to go at the end.)
- *
- * We return the index of the first existing tuple that should go on the
- * righthand page, plus a boolean indicating whether the new tuple goes on
- * the left or right page.  The bool is necessary to disambiguate the case
- * where firstright == newitemoff.
- */
-static OffsetNumber
-_bt_findsplitloc(Relation rel,
-                                Page page,
-                                OffsetNumber newitemoff,
-                                Size newitemsz,
-                                bool *newitemonleft)
-{
-       BTPageOpaque opaque;
-       OffsetNumber offnum;
-       OffsetNumber maxoff;
-       ItemId          itemid;
-       FindSplitData state;
-       int                     leftspace,
-                               rightspace,
-                               goodenough,
-                               olddataitemstotal,
-                               olddataitemstoleft;
-       bool            goodenoughfound;
-
-       opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
-       /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
-       newitemsz += sizeof(ItemIdData);
-
-       /* Total free space available on a btree page, after fixed overhead */
-       leftspace = rightspace =
-               PageGetPageSize(page) - SizeOfPageHeaderData -
-               MAXALIGN(sizeof(BTPageOpaqueData));
-
-       /* The right page will have the same high key as the old page */
-       if (!P_RIGHTMOST(opaque))
-       {
-               itemid = PageGetItemId(page, P_HIKEY);
-               rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
-                                                        sizeof(ItemIdData));
-       }
-
-       /* Count up total space in data items without actually scanning 'em */
-       olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page);
-
-       state.newitemsz = newitemsz;
-       state.is_leaf = P_ISLEAF(opaque);
-       state.is_rightmost = P_RIGHTMOST(opaque);
-       state.have_split = false;
-       if (state.is_leaf)
-               state.fillfactor = RelationGetFillFactor(rel,
-                                                                                                BTREE_DEFAULT_FILLFACTOR);
-       else
-               state.fillfactor = BTREE_NONLEAF_FILLFACTOR;
-       state.newitemonleft = false;    /* these just to keep compiler quiet */
-       state.firstright = 0;
-       state.best_delta = 0;
-       state.leftspace = leftspace;
-       state.rightspace = rightspace;
-       state.olddataitemstotal = olddataitemstotal;
-       state.newitemoff = newitemoff;
-
-       /*
-        * Finding the best possible split would require checking all the possible
-        * split points, because of the high-key and left-key special cases.
-        * That's probably more work than it's worth; instead, stop as soon as we
-        * find a "good-enough" split, where good-enough is defined as an
-        * imbalance in free space of no more than pagesize/16 (arbitrary...) This
-        * should let us stop near the middle on most pages, instead of plowing to
-        * the end.
-        */
-       goodenough = leftspace / 16;
-
-       /*
-        * Scan through the data items and calculate space usage for a split at
-        * each possible position.
-        */
-       olddataitemstoleft = 0;
-       goodenoughfound = false;
-       maxoff = PageGetMaxOffsetNumber(page);
-
-       for (offnum = P_FIRSTDATAKEY(opaque);
-                offnum <= maxoff;
-                offnum = OffsetNumberNext(offnum))
-       {
-               Size            itemsz;
-
-               itemid = PageGetItemId(page, offnum);
-               itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
-
-               /*
-                * Will the new item go to left or right of split?
-                */
-               if (offnum > newitemoff)
-                       _bt_checksplitloc(&state, offnum, true,
-                                                         olddataitemstoleft, itemsz);
-
-               else if (offnum < newitemoff)
-                       _bt_checksplitloc(&state, offnum, false,
-                                                         olddataitemstoleft, itemsz);
-               else
-               {
-                       /* need to try it both ways! */
-                       _bt_checksplitloc(&state, offnum, true,
-                                                         olddataitemstoleft, itemsz);
-
-                       _bt_checksplitloc(&state, offnum, false,
-                                                         olddataitemstoleft, itemsz);
-               }
-
-               /* Abort scan once we find a good-enough choice */
-               if (state.have_split && state.best_delta <= goodenough)
-               {
-                       goodenoughfound = true;
-                       break;
-               }
-
-               olddataitemstoleft += itemsz;
-       }
-
-       /*
-        * If the new item goes as the last item, check for splitting so that all
-        * the old items go to the left page and the new item goes to the right
-        * page.
-        */
-       if (newitemoff > maxoff && !goodenoughfound)
-               _bt_checksplitloc(&state, newitemoff, false, olddataitemstotal, 0);
-
-       /*
-        * I believe it is not possible to fail to find a feasible split, but just
-        * in case ...
-        */
-       if (!state.have_split)
-               elog(ERROR, "could not find a feasible split point for index \"%s\"",
-                        RelationGetRelationName(rel));
-
-       *newitemonleft = state.newitemonleft;
-       return state.firstright;
-}
-
-/*
- * Subroutine to analyze a particular possible split choice (ie, firstright
- * and newitemonleft settings), and record the best split so far in *state.
- *
- * firstoldonright is the offset of the first item on the original page
- * that goes to the right page, and firstoldonrightsz is the size of that
- * tuple. firstoldonright can be > max offset, which means that all the old
- * items go to the left page and only the new item goes to the right page.
- * In that case, firstoldonrightsz is not used.
- *
- * olddataitemstoleft is the total size of all old items to the left of
- * firstoldonright.
- */
-static void
-_bt_checksplitloc(FindSplitData *state,
-                                 OffsetNumber firstoldonright,
-                                 bool newitemonleft,
-                                 int olddataitemstoleft,
-                                 Size firstoldonrightsz)
-{
-       int                     leftfree,
-                               rightfree;
-       Size            firstrightitemsz;
-       bool            newitemisfirstonright;
-
-       /* Is the new item going to be the first item on the right page? */
-       newitemisfirstonright = (firstoldonright == state->newitemoff
-                                                        && !newitemonleft);
-
-       if (newitemisfirstonright)
-               firstrightitemsz = state->newitemsz;
-       else
-               firstrightitemsz = firstoldonrightsz;
-
-       /* Account for all the old tuples */
-       leftfree = state->leftspace - olddataitemstoleft;
-       rightfree = state->rightspace -
-               (state->olddataitemstotal - olddataitemstoleft);
-
-       /*
-        * The first item on the right page becomes the high key of the left page;
-        * therefore it counts against left space as well as right space. When
-        * index has included attributes, then those attributes of left page high
-        * key will be truncated leaving that page with slightly more free space.
-        * However, that shouldn't affect our ability to find valid split
-        * location, because anyway split location should exists even without high
-        * key truncation.
-        */
-       leftfree -= firstrightitemsz;
-
-       /* account for the new item */
-       if (newitemonleft)
-               leftfree -= (int) state->newitemsz;
-       else
-               rightfree -= (int) state->newitemsz;
-
-       /*
-        * If we are not on the leaf level, we will be able to discard the key
-        * data from the first item that winds up on the right page.
-        */
-       if (!state->is_leaf)
-               rightfree += (int) firstrightitemsz -
-                       (int) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData));
-
-       /*
-        * If feasible split point, remember best delta.
-        */
-       if (leftfree >= 0 && rightfree >= 0)
-       {
-               int                     delta;
-
-               if (state->is_rightmost)
-               {
-                       /*
-                        * If splitting a rightmost page, try to put (100-fillfactor)% of
-                        * free space on left page. See comments for _bt_findsplitloc.
-                        */
-                       delta = (state->fillfactor * leftfree)
-                               - ((100 - state->fillfactor) * rightfree);
-               }
-               else
-               {
-                       /* Otherwise, aim for equal free space on both sides */
-                       delta = leftfree - rightfree;
-               }
-
-               if (delta < 0)
-                       delta = -delta;
-               if (!state->have_split || delta < state->best_delta)
-               {
-                       state->have_split = true;
-                       state->newitemonleft = newitemonleft;
-                       state->firstright = firstoldonright;
-                       state->best_delta = delta;
-               }
-       }
-}
-
  /*
   * _bt_insert_parent() -- Insert downlink into parent after a page split.
   *
diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c

new file mode 100644 (file)

index 0000000..34228b1
--- /dev/null
+++ b/src/backend/access/nbtree/nbtsplitloc.c
@@ -0,0 +1,846 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtsplitloc.c
+ *       Choose split point code for Postgres btree implementation.
+ *
+ * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *       src/backend/access/nbtree/nbtsplitloc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/nbtree.h"
+#include "storage/lmgr.h"
+
+/* limits on split interval (default strategy only) */
+#define MAX_LEAF_INTERVAL                      9
+#define MAX_INTERNAL_INTERVAL          18
+
+typedef enum
+{
+       /* strategy for searching through materialized list of split points */
+       SPLIT_DEFAULT,                          /* give some weight to truncation */
+       SPLIT_MANY_DUPLICATES,          /* find minimally distinguishing point */
+       SPLIT_SINGLE_VALUE                      /* leave left page almost full */
+} FindSplitStrat;
+
+typedef struct
+{
+       /* details of free space left by split */
+       int16           curdelta;               /* current leftfree/rightfree delta */
+       int16           leftfree;               /* space left on left page post-split */
+       int16           rightfree;              /* space left on right page post-split */
+
+       /* split point identifying fields (returned by _bt_findsplitloc) */
+       OffsetNumber firstoldonright;   /* first existing item on new right page */
+       bool            newitemonleft;  /* true if new item goes on left page */
+
+} SplitPoint;
+
+typedef struct
+{
+       /* context data for _bt_recsplitloc */
+       Relation        rel;                    /* index relation */
+       Page            page;                   /* page undergoing split */
+       IndexTuple      newitem;                /* new item (cause of page split) */
+       Size            newitemsz;              /* size of newitem (includes line pointer) */
+       bool            is_leaf;                /* T if splitting a leaf page */
+       bool            is_rightmost;   /* T if splitting rightmost page on level */
+       OffsetNumber newitemoff;        /* where the new item is to be inserted */
+       int                     leftspace;              /* space available for items on left page */
+       int                     rightspace;             /* space available for items on right page */
+       int                     olddataitemstotal;      /* space taken by old items */
+       Size            minfirstrightsz;        /* smallest firstoldonright tuple size */
+
+       /* candidate split point data */
+       int                     maxsplits;              /* allocated size of splits array */
+       int                     nsplits;                /* number of split points recorded so far */
+       SplitPoint *splits;                     /* all candidate split points for page */
+       int                     interval;               /* current range of acceptable split points */
+} FindSplitData;
+
+static void _bt_recsplitloc(FindSplitData *state,
+                               OffsetNumber firstoldonright, bool newitemonleft,
+                               int olddataitemstoleft, Size firstoldonrightsz);
+static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult,
+                                       bool usemult);
+static int     _bt_splitcmp(const void *arg1, const void *arg2);
+static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
+                                bool *newitemonleft);
+static int _bt_strategy(FindSplitData *state, SplitPoint *leftpage,
+                        SplitPoint *rightpage, FindSplitStrat *strategy);
+static void _bt_interval_edges(FindSplitData *state,
+                                  SplitPoint **leftinterval, SplitPoint **rightinterval);
+static inline int _bt_split_penalty(FindSplitData *state, SplitPoint *split);
+static inline IndexTuple _bt_split_lastleft(FindSplitData *state,
+                                  SplitPoint *split);
+static inline IndexTuple _bt_split_firstright(FindSplitData *state,
+                                        SplitPoint *split);
+
+
+/*
+ *     _bt_findsplitloc() -- find an appropriate place to split a page.
+ *
+ * The main goal here is to equalize the free space that will be on each
+ * split page, *after accounting for the inserted tuple*.  (If we fail to
+ * account for it, we might find ourselves with too little room on the page
+ * that it needs to go into!)
+ *
+ * If the page is the rightmost page on its level, we instead try to arrange
+ * to leave the left split page fillfactor% full.  In this way, when we are
+ * inserting successively increasing keys (consider sequences, timestamps,
+ * etc) we will end up with a tree whose pages are about fillfactor% full,
+ * instead of the 50% full result that we'd get without this special case.
+ * This is the same as nbtsort.c produces for a newly-created tree.  Note
+ * that leaf and nonleaf pages use different fillfactors.  Note also that
+ * there are a number of further special cases where fillfactor is not
+ * applied in the standard way.
+ *
+ * We are passed the intended insert position of the new tuple, expressed as
+ * the offsetnumber of the tuple it must go in front of (this could be
+ * maxoff+1 if the tuple is to go at the end).  The new tuple itself is also
+ * passed, since it's needed to give some weight to how effective suffix
+ * truncation will be.  The implementation picks the split point that
+ * maximizes the effectiveness of suffix truncation from a small list of
+ * alternative candidate split points that leave each side of the split with
+ * about the same share of free space.  Suffix truncation is secondary to
+ * equalizing free space, except in cases with large numbers of duplicates.
+ * Note that it is always assumed that caller goes on to perform truncation,
+ * even with pg_upgrade'd indexes where that isn't actually the case
+ * (!heapkeyspace indexes).  See nbtree/README for more information about
+ * suffix truncation.
+ *
+ * We return the index of the first existing tuple that should go on the
+ * righthand page, plus a boolean indicating whether the new tuple goes on
+ * the left or right page.  The bool is necessary to disambiguate the case
+ * where firstright == newitemoff.
+ */
+OffsetNumber
+_bt_findsplitloc(Relation rel,
+                                Page page,
+                                OffsetNumber newitemoff,
+                                Size newitemsz,
+                                IndexTuple newitem,
+                                bool *newitemonleft)
+{
+       BTPageOpaque opaque;
+       int                     leftspace,
+                               rightspace,
+                               olddataitemstotal,
+                               olddataitemstoleft,
+                               perfectpenalty,
+                               leaffillfactor;
+       FindSplitData state;
+       FindSplitStrat strategy;
+       ItemId          itemid;
+       OffsetNumber offnum,
+                               maxoff,
+                               foundfirstright;
+       double          fillfactormult;
+       bool            usemult;
+       SplitPoint      leftpage,
+                               rightpage;
+
+       opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+       maxoff = PageGetMaxOffsetNumber(page);
+
+       /* Total free space available on a btree page, after fixed overhead */
+       leftspace = rightspace =
+               PageGetPageSize(page) - SizeOfPageHeaderData -
+               MAXALIGN(sizeof(BTPageOpaqueData));
+
+       /* The right page will have the same high key as the old page */
+       if (!P_RIGHTMOST(opaque))
+       {
+               itemid = PageGetItemId(page, P_HIKEY);
+               rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
+                                                        sizeof(ItemIdData));
+       }
+
+       /* Count up total space in data items before actually scanning 'em */
+       olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page);
+       leaffillfactor = RelationGetFillFactor(rel, BTREE_DEFAULT_FILLFACTOR);
+
+       /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
+       newitemsz += sizeof(ItemIdData);
+       state.rel = rel;
+       state.page = page;
+       state.newitem = newitem;
+       state.newitemsz = newitemsz;
+       state.is_leaf = P_ISLEAF(opaque);
+       state.is_rightmost = P_RIGHTMOST(opaque);
+       state.leftspace = leftspace;
+       state.rightspace = rightspace;
+       state.olddataitemstotal = olddataitemstotal;
+       state.minfirstrightsz = SIZE_MAX;
+       state.newitemoff = newitemoff;
+
+       /*
+        * maxsplits should never exceed maxoff because there will be at most as
+        * many candidate split points as there are points _between_ tuples, once
+        * you imagine that the new item is already on the original page (the
+        * final number of splits may be slightly lower because not all points
+        * between tuples will be legal).
+        */
+       state.maxsplits = maxoff;
+       state.splits = palloc(sizeof(SplitPoint) * state.maxsplits);
+       state.nsplits = 0;
+
+       /*
+        * Scan through the data items and calculate space usage for a split at
+        * each possible position.  We start at the first data offset rather than
+        * the second data offset to handle the "newitemoff == first data offset"
+        * case (any other split whose firstoldonright is the first data offset
+        * can't be legal, though, and so won't actually end up being recorded in
+        * first loop iteration).
+        */
+       olddataitemstoleft = 0;
+
+       for (offnum = P_FIRSTDATAKEY(opaque);
+                offnum <= maxoff;
+                offnum = OffsetNumberNext(offnum))
+       {
+               Size            itemsz;
+
+               itemid = PageGetItemId(page, offnum);
+               itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
+
+               /*
+                * Will the new item go to left or right of split?
+                */
+               if (offnum > newitemoff)
+                       _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
+               else if (offnum < newitemoff)
+                       _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
+               else
+               {
+                       /* may need to record a split on one or both sides of new item */
+                       _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
+                       _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
+               }
+
+               olddataitemstoleft += itemsz;
+       }
+
+       /*
+        * If the new item goes as the last item, record the split point that
+        * leaves all the old items on the left page, and the new item on the
+        * right page.  This is required because a split that leaves the new item
+        * as the firstoldonright won't have been reached within the loop.
+        */
+       Assert(olddataitemstoleft == olddataitemstotal);
+       if (newitemoff > maxoff)
+               _bt_recsplitloc(&state, newitemoff, false, olddataitemstotal, 0);
+
+       /*
+        * I believe it is not possible to fail to find a feasible split, but just
+        * in case ...
+        */
+       if (state.nsplits == 0)
+               elog(ERROR, "could not find a feasible split point for index \"%s\"",
+                        RelationGetRelationName(rel));
+
+       /*
+        * Start search for a split point among list of legal split points.  Give
+        * primary consideration to equalizing available free space in each half
+        * of the split initially (start with default strategy), while applying
+        * rightmost optimization where appropriate.  Either of the two other
+        * fallback strategies may be required for cases with a large number of
+        * duplicates around the original/space-optimal split point.
+        *
+        * Default strategy gives some weight to suffix truncation in deciding a
+        * split point on leaf pages.  It attempts to select a split point where a
+        * distinguishing attribute appears earlier in the new high key for the
+        * left side of the split, in order to maximize the number of trailing
+        * attributes that can be truncated away.  Only candidate split points
+        * that imply an acceptable balance of free space on each side are
+        * considered.
+        */
+       if (!state.is_leaf)
+       {
+               /* fillfactormult only used on rightmost page */
+               usemult = state.is_rightmost;
+               fillfactormult = BTREE_NONLEAF_FILLFACTOR / 100.0;
+       }
+       else if (state.is_rightmost)
+       {
+               /* Rightmost leaf page --  fillfactormult always used */
+               usemult = true;
+               fillfactormult = leaffillfactor / 100.0;
+       }
+       else
+       {
+               /* Other leaf page.  50:50 page split. */
+               usemult = false;
+               /* fillfactormult not used, but be tidy */
+               fillfactormult = 0.50;
+       }
+
+       /*
+        * Set an initial limit on the split interval/number of candidate split
+        * points as appropriate.  The "Prefix B-Trees" paper refers to this as
+        * sigma l for leaf splits and sigma b for internal ("branch") splits.
+        * It's hard to provide a theoretical justification for the initial size
+        * of the split interval, though it's clear that a small split interval
+        * makes suffix truncation much more effective without noticeably
+        * affecting space utilization over time.
+        */
+       state.interval = Min(Max(1, state.nsplits * 0.05),
+                                                state.is_leaf ? MAX_LEAF_INTERVAL :
+                                                MAX_INTERNAL_INTERVAL);
+
+       /*
+        * Save leftmost and rightmost splits for page before original ordinal
+        * sort order is lost by delta/fillfactormult sort
+        */
+       leftpage = state.splits[0];
+       rightpage = state.splits[state.nsplits - 1];
+
+       /* Give split points a fillfactormult-wise delta, and sort on deltas */
+       _bt_deltasortsplits(&state, fillfactormult, usemult);
+
+       /*
+        * Determine if default strategy/split interval will produce a
+        * sufficiently distinguishing split, or if we should change strategies.
+        * Alternative strategies change the range of split points that are
+        * considered acceptable (split interval), and possibly change
+        * fillfactormult, in order to deal with pages with a large number of
+        * duplicates gracefully.
+        *
+        * Pass low and high splits for the entire page (including even newitem).
+        * These are used when the initial split interval encloses split points
+        * that are full of duplicates, and we need to consider if it's even
+        * possible to avoid appending a heap TID.
+        */
+       perfectpenalty = _bt_strategy(&state, &leftpage, &rightpage, &strategy);
+
+       if (strategy == SPLIT_DEFAULT)
+       {
+               /*
+                * Default strategy worked out (always works out with internal page).
+                * Original split interval still stands.
+                */
+       }
+
+       /*
+        * Many duplicates strategy is used when a heap TID would otherwise be
+        * appended, but the page isn't completely full of logical duplicates.
+        *
+        * The split interval is widened to include all legal candidate split
+        * points.  There may be as few as two distinct values in the whole-page
+        * split interval.  Many duplicates strategy has no hard requirements for
+        * space utilization, though it still keeps the use of space balanced as a
+        * non-binding secondary goal (perfect penalty is set so that the
+        * first/lowest delta split point that avoids appending a heap TID is
+        * used).
+        *
+        * Single value strategy is used when it is impossible to avoid appending
+        * a heap TID.  It arranges to leave the left page very full.  This
+        * maximizes space utilization in cases where tuples with the same
+        * attribute values span many pages.  Newly inserted duplicates will tend
+        * to have higher heap TID values, so we'll end up splitting to the right
+        * consistently.  (Single value strategy is harmless though not
+        * particularly useful with !heapkeyspace indexes.)
+        */
+       else if (strategy == SPLIT_MANY_DUPLICATES)
+       {
+               Assert(state.is_leaf);
+               /* No need to resort splits -- no change in fillfactormult/deltas */
+               state.interval = state.nsplits;
+       }
+       else if (strategy == SPLIT_SINGLE_VALUE)
+       {
+               Assert(state.is_leaf);
+               /* Split near the end of the page */
+               usemult = true;
+               fillfactormult = BTREE_SINGLEVAL_FILLFACTOR / 100.0;
+               /* Resort split points with new delta */
+               _bt_deltasortsplits(&state, fillfactormult, usemult);
+               /* Appending a heap TID is unavoidable, so interval of 1 is fine */
+               state.interval = 1;
+       }
+
+       /*
+        * Search among acceptable split points (using final split interval) for
+        * the entry that has the lowest penalty, and is therefore expected to
+        * maximize fan-out.  Sets *newitemonleft for us.
+        */
+       foundfirstright = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft);
+       pfree(state.splits);
+
+       return foundfirstright;
+}
+
+/*
+ * Subroutine to record a particular point between two tuples (possibly the
+ * new item) on page (i.e. a combination of firstright and newitemonleft
+ * settings) in *state for later analysis.  This is also a convenient point
+ * to check if the split is legal (if it isn't, it won't be recorded).
+ *
+ * firstoldonright is the offset of the first item on the original page that
+ * goes to the right page, and firstoldonrightsz is the size of that tuple.
+ * firstoldonright can be > max offset, which means that all the old items go
+ * to the left page and only the new item goes to the right page.  In that
+ * case, firstoldonrightsz is not used.
+ *
+ * olddataitemstoleft is the total size of all old items to the left of the
+ * split point that is recorded here when legal.  It must not include
+ * newitemsz, since the new item is accounted for here.
+ */
+static void
+_bt_recsplitloc(FindSplitData *state,
+                               OffsetNumber firstoldonright,
+                               bool newitemonleft,
+                               int olddataitemstoleft,
+                               Size firstoldonrightsz)
+{
+       int16           leftfree,
+                               rightfree;
+       Size            firstrightitemsz;
+       bool            newitemisfirstonright;
+
+       /* Is the new item going to be the first item on the right page? */
+       newitemisfirstonright = (firstoldonright == state->newitemoff
+                                                        && !newitemonleft);
+
+       if (newitemisfirstonright)
+               firstrightitemsz = state->newitemsz;
+       else
+               firstrightitemsz = firstoldonrightsz;
+
+       /* Account for all the old tuples */
+       leftfree = state->leftspace - olddataitemstoleft;
+       rightfree = state->rightspace -
+               (state->olddataitemstotal - olddataitemstoleft);
+
+       /*
+        * The first item on the right page becomes the high key of the left page;
+        * therefore it counts against left space as well as right space (we
+        * cannot assume that suffix truncation will make it any smaller).  When
+        * index has included attributes, then those attributes of left page high
+        * key will be truncated leaving that page with slightly more free space.
+        * However, that shouldn't affect our ability to find valid split
+        * location, since we err in the direction of being pessimistic about free
+        * space on the left half.  Besides, even when suffix truncation of
+        * non-TID attributes occurs, the new high key often won't even be a
+        * single MAXALIGN() quantum smaller than the firstright tuple it's based
+        * on.
+        *
+        * If we are on the leaf level, assume that suffix truncation cannot avoid
+        * adding a heap TID to the left half's new high key when splitting at the
+        * leaf level.  In practice the new high key will often be smaller and
+        * will rarely be larger, but conservatively assume the worst case.
+        */
+       if (state->is_leaf)
+               leftfree -= (int16) (firstrightitemsz +
+                                                        MAXALIGN(sizeof(ItemPointerData)));
+       else
+               leftfree -= (int16) firstrightitemsz;
+
+       /* account for the new item */
+       if (newitemonleft)
+               leftfree -= (int16) state->newitemsz;
+       else
+               rightfree -= (int16) state->newitemsz;
+
+       /*
+        * If we are not on the leaf level, we will be able to discard the key
+        * data from the first item that winds up on the right page.
+        */
+       if (!state->is_leaf)
+               rightfree += (int16) firstrightitemsz -
+                       (int16) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData));
+
+       /* Record split only if neither half ends up with negative free space */
+       if (leftfree >= 0 && rightfree >= 0)
+       {
+               Assert(state->nsplits < state->maxsplits);
+
+               /* Track smallest firstright item size among legal splits */
+               state->minfirstrightsz = Min(state->minfirstrightsz, firstrightitemsz);
+
+               state->splits[state->nsplits].curdelta = 0;
+               state->splits[state->nsplits].leftfree = leftfree;
+               state->splits[state->nsplits].rightfree = rightfree;
+               state->splits[state->nsplits].firstoldonright = firstoldonright;
+               state->splits[state->nsplits].newitemonleft = newitemonleft;
+               state->nsplits++;
+       }
+}
+
+/*
+ * Subroutine to assign space deltas to materialized array of candidate split
+ * points based on current fillfactor, and to sort array by ascending delta
+ */
+static void
+_bt_deltasortsplits(FindSplitData *state, double fillfactormult,
+                                       bool usemult)
+{
+       for (int i = 0; i < state->nsplits; i++)
+       {
+               SplitPoint *split = state->splits + i;
+               int16           delta;
+
+               if (usemult)
+                       delta = fillfactormult * split->leftfree -
+                               (1.0 - fillfactormult) * split->rightfree;
+               else
+                       delta = split->leftfree - split->rightfree;
+
+               if (delta < 0)
+                       delta = -delta;
+
+               /* Save absolute delta; it is the sort key used below */
+               split->curdelta = delta;
+       }
+
+       qsort(state->splits, state->nsplits, sizeof(SplitPoint), _bt_splitcmp);
+}
+
+/*
+ * qsort-style comparator used by _bt_deltasortsplits() (ascending curdelta)
+ */
+static int
+_bt_splitcmp(const void *arg1, const void *arg2)
+{
+       SplitPoint *split1 = (SplitPoint *) arg1;
+       SplitPoint *split2 = (SplitPoint *) arg2;
+
+       if (split1->curdelta > split2->curdelta)
+               return 1;
+       if (split1->curdelta < split2->curdelta)
+               return -1;
+
+       return 0;
+}
+
+/*
+ * Subroutine to find the "best" split point among an array of acceptable
+ * candidate split points that split without there being an excessively high
+ * delta between the space left free on the left and right halves.  The "best"
+ * split point is the split point with the lowest penalty among split points
+ * that fall within current/final split interval.  Penalty is an abstract
+ * score, with a definition that varies depending on whether we're splitting a
+ * leaf page or an internal page.  See _bt_split_penalty() for details.
+ *
+ * "perfectpenalty" is assumed to be the lowest possible penalty among
+ * candidate split points.  This allows us to return early without wasting
+ * cycles on calculating the first differing attribute for all candidate
+ * splits when that clearly cannot improve our choice (or when we only want a
+ * minimally distinguishing split point, and don't want to make the split any
+ * more unbalanced than is necessary).
+ *
+ * We return the offset of the first existing tuple that should go on the right
+ * page, and set *newitemonleft to indicate if new item is on left of split.
+ */
+static OffsetNumber
+_bt_bestsplitloc(FindSplitData *state, int perfectpenalty, bool *newitemonleft)
+{
+       int                     bestpenalty,
+                               lowsplit;
+       int                     highsplit = Min(state->interval, state->nsplits);
+
+       /* No point in calculating penalty when there's only one choice */
+       if (state->nsplits == 1)
+       {
+               *newitemonleft = state->splits[0].newitemonleft;
+               return state->splits[0].firstoldonright;
+       }
+
+       bestpenalty = INT_MAX;
+       lowsplit = 0;
+       for (int i = lowsplit; i < highsplit; i++)
+       {
+               int                     penalty;
+
+               penalty = _bt_split_penalty(state, state->splits + i);
+
+               if (penalty <= perfectpenalty)
+               {
+                       bestpenalty = penalty;
+                       lowsplit = i;
+                       break;
+               }
+
+               if (penalty < bestpenalty)
+               {
+                       bestpenalty = penalty;
+                       lowsplit = i;
+               }
+       }
+
+       *newitemonleft = state->splits[lowsplit].newitemonleft;
+       return state->splits[lowsplit].firstoldonright;
+}
+
+/*
+ * Subroutine to decide whether split should use default strategy/initial
+ * split interval, or whether it should finish splitting the page using
+ * alternative strategies (this is only possible with leaf pages).
+ *
+ * leftpage and rightpage are the leftmost and rightmost legal splits on page.
+ * Caller finishes the split according to how *strategy is set here.  Return
+ * value is "perfect penalty", which is passed to _bt_bestsplitloc() as a
+ * final constraint on how far caller is willing to go to avoid appending a
+ * heap TID with the many duplicates strategy (it also saves useless cycles).
+ */
+static int
+_bt_strategy(FindSplitData *state, SplitPoint *leftpage,
+                        SplitPoint *rightpage, FindSplitStrat *strategy)
+{
+       IndexTuple      leftmost,
+                               rightmost;
+       SplitPoint *leftinterval,
+                          *rightinterval;
+       int                     perfectpenalty;
+       int                     indnkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);
+
+       /* Assume that alternative strategy won't be used for now */
+       *strategy = SPLIT_DEFAULT;
+
+       /*
+        * Use smallest observed first right item size for entire page as perfect
+        * penalty on internal pages.  This can save cycles in the common case
+        * where most or all splits (not just splits within interval) have first
+        * right tuples that are the same size.
+        */
+       if (!state->is_leaf)
+               return state->minfirstrightsz;
+
+       /*
+        * Use leftmost and rightmost tuples from leftmost and rightmost splits in
+        * current split interval
+        */
+       _bt_interval_edges(state, &leftinterval, &rightinterval);
+       leftmost = _bt_split_lastleft(state, leftinterval);
+       rightmost = _bt_split_firstright(state, rightinterval);
+
+       /*
+        * If initial split interval can produce a split point that will at least
+        * avoid appending a heap TID in new high key, we're done.  Finish split
+        * with default strategy and initial split interval.
+        */
+       perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost);
+       if (perfectpenalty <= indnkeyatts)
+               return perfectpenalty;
+
+       /*
+        * Work out how caller should finish split when even their "perfect"
+        * penalty for initial/default split interval indicates that the interval
+        * does not contain even a single split that avoids appending a heap TID.
+        *
+        * Use the leftmost split's lastleft tuple and the rightmost split's
+        * firstright tuple to assess every possible split.
+        */
+       leftmost = _bt_split_lastleft(state, leftpage);
+       rightmost = _bt_split_firstright(state, rightpage);
+
+       /*
+        * If page (including new item) has many duplicates but is not entirely
+        * full of duplicates, a many duplicates strategy split will be performed.
+        * If page is entirely full of duplicates, a single value strategy split
+        * will be performed.
+        */
+       perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost);
+       if (perfectpenalty <= indnkeyatts)
+       {
+               *strategy = SPLIT_MANY_DUPLICATES;
+
+               /*
+                * Caller should choose the lowest delta split that avoids appending a
+                * heap TID.  Maximizing the number of attributes that can be
+                * truncated away (returning perfectpenalty when it happens to be less
+                * than the number of key attributes in index) can result in continual
+                * unbalanced page splits.
+                *
+                * Just avoiding appending a heap TID can still make splits very
+                * unbalanced, but this is self-limiting.  When final split has a very
+                * high delta, one side of the split will likely consist of a single
+                * value.  If that page is split once again, then that split will
+                * likely use the single value strategy.
+                */
+               return indnkeyatts;
+       }
+
+       /*
+        * Single value strategy is only appropriate with ever-increasing heap
+        * TIDs; otherwise, original default strategy split should proceed to
+        * avoid pathological performance.  Use page high key to infer if this is
+        * the rightmost page among pages that store the same duplicate value.
+        * This should not prevent insertions of heap TIDs that are slightly out
+        * of order from using single value strategy, since that's expected with
+        * concurrent inserters of the same duplicate value.
+        */
+       else if (state->is_rightmost)
+               *strategy = SPLIT_SINGLE_VALUE;
+       else
+       {
+               ItemId          itemid;
+               IndexTuple      hikey;
+
+               itemid = PageGetItemId(state->page, P_HIKEY);
+               hikey = (IndexTuple) PageGetItem(state->page, itemid);
+               perfectpenalty = _bt_keep_natts_fast(state->rel, hikey,
+                                                                                        state->newitem);
+               if (perfectpenalty <= indnkeyatts)
+                       *strategy = SPLIT_SINGLE_VALUE;
+               else
+               {
+                       /*
+                        * Have caller finish split using default strategy, since page
+                        * does not appear to be the rightmost page for duplicates of the
+                        * value the page is filled with
+                        */
+               }
+       }
+
+       return perfectpenalty;
+}
+
+/*
+ * Subroutine to locate leftmost and rightmost splits for current/default
+ * split interval.  Note that it will be the same split iff there is only one
+ * split in interval.
+ */
+static void
+_bt_interval_edges(FindSplitData *state, SplitPoint **leftinterval,
+                                  SplitPoint **rightinterval)
+{
+       int                     highsplit = Min(state->interval, state->nsplits);
+       SplitPoint *deltaoptimal;
+
+       deltaoptimal = state->splits;
+       *leftinterval = NULL;
+       *rightinterval = NULL;
+
+       /*
+        * Delta is an absolute distance to optimal split point, so both the
+        * leftmost and rightmost split point will usually be at the end of the
+        * array
+        */
+       for (int i = highsplit - 1; i >= 0; i--)
+       {
+               SplitPoint *distant = state->splits + i;
+
+               if (distant->firstoldonright < deltaoptimal->firstoldonright)
+               {
+                       if (*leftinterval == NULL)
+                               *leftinterval = distant;
+               }
+               else if (distant->firstoldonright > deltaoptimal->firstoldonright)
+               {
+                       if (*rightinterval == NULL)
+                               *rightinterval = distant;
+               }
+               else if (!distant->newitemonleft && deltaoptimal->newitemonleft)
+               {
+                       /*
+                        * "incoming tuple will become first on right page" (distant) is
+                        * to the left of "incoming tuple will become last on left page"
+                        * (delta-optimal)
+                        */
+                       Assert(distant->firstoldonright == state->newitemoff);
+                       if (*leftinterval == NULL)
+                               *leftinterval = distant;
+               }
+               else if (distant->newitemonleft && !deltaoptimal->newitemonleft)
+               {
+                       /*
+                        * "incoming tuple will become last on left page" (distant) is to
+                        * the right of "incoming tuple will become first on right page"
+                        * (delta-optimal)
+                        */
+                       Assert(distant->firstoldonright == state->newitemoff);
+                       if (*rightinterval == NULL)
+                               *rightinterval = distant;
+               }
+               else
+               {
+                       /* There were only one or two splits in initial split interval */
+                       Assert(distant == deltaoptimal);
+                       if (*leftinterval == NULL)
+                               *leftinterval = distant;
+                       if (*rightinterval == NULL)
+                               *rightinterval = distant;
+               }
+
+               if (*leftinterval && *rightinterval)
+                       return;
+       }
+
+       Assert(false);
+}
+
+/*
+ * Subroutine to find penalty for caller's candidate split point.
+ *
+ * On leaf pages, penalty is the attribute number that distinguishes each side
+ * of a split.  It's the last attribute that needs to be included in new high
+ * key for left page.  It can be greater than the number of key attributes in
+ * cases where a heap TID will need to be appended during truncation.
+ *
+ * On internal pages, penalty is simply the size of the first item on the
+ * right half of the split (including line pointer overhead), which may be
+ * the new item.  This tuple will become the new high key for the left page.
+ */
+static inline int
+_bt_split_penalty(FindSplitData *state, SplitPoint *split)
+{
+       IndexTuple      lastleftuple;
+       IndexTuple      firstrighttuple;
+
+       if (!state->is_leaf)
+       {
+               ItemId          itemid;
+
+               if (!split->newitemonleft &&
+                       split->firstoldonright == state->newitemoff)
+                       return state->newitemsz;
+
+               itemid = PageGetItemId(state->page, split->firstoldonright);
+
+               return MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
+       }
+
+       lastleftuple = _bt_split_lastleft(state, split);
+       firstrighttuple = _bt_split_firstright(state, split);
+
+       Assert(lastleftuple != firstrighttuple);
+       return _bt_keep_natts_fast(state->rel, lastleftuple, firstrighttuple);
+}
+
+/*
+ * Subroutine to get a lastleft IndexTuple for a split point from page
+ */
+static inline IndexTuple
+_bt_split_lastleft(FindSplitData *state, SplitPoint *split)
+{
+       ItemId          itemid;
+
+       if (split->newitemonleft && split->firstoldonright == state->newitemoff)
+               return state->newitem;
+
+       itemid = PageGetItemId(state->page,
+                                                  OffsetNumberPrev(split->firstoldonright));
+       return (IndexTuple) PageGetItem(state->page, itemid);
+}
+
+/*
+ * Subroutine to get a firstright IndexTuple for a split point from page
+ */
+static inline IndexTuple
+_bt_split_firstright(FindSplitData *state, SplitPoint *split)
+{
+       ItemId          itemid;
+
+       if (!split->newitemonleft && split->firstoldonright == state->newitemoff)
+               return state->newitem;
+
+       itemid = PageGetItemId(state->page, split->firstoldonright);
+       return (IndexTuple) PageGetItem(state->page, itemid);
+}
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c

index 2f9f6e7601591a30039289e8c451bd25e77cd71f..6b59e16c4d5dc383a2cc1bba1d18db45414df807 100644 (file)
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -22,6 +22,7 @@
  #include "access/relscan.h"
  #include "miscadmin.h"
  #include "utils/array.h"
+#include "utils/datum.h"
  #include "utils/lsyscache.h"
  #include "utils/memutils.h"
  #include "utils/rel.h"
@@ -2295,6 +2296,60 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright,
         return keepnatts;
  }
  
+/*
+ * _bt_keep_natts_fast - fast bitwise variant of _bt_keep_natts.
+ *
+ * This is exported so that a candidate split point can have its effect on
+ * suffix truncation inexpensively evaluated ahead of time when finding a
+ * split location.  Datums are compared bitwise to save cycles.  Returns the
+ * first differing key attribute number (nkeyatts + 1 when none differ).
+ *
+ * The approach taken here usually provides the same answer as _bt_keep_natts
+ * will (for the same pair of tuples from a heapkeyspace index), since the
+ * majority of btree opclasses can never indicate that two datums are equal
+ * unless they're bitwise equal (once detoasted).  Similarly, result may
+ * differ from the _bt_keep_natts result when either tuple has TOASTed datums,
+ * though this is barely possible in practice.
+ *
+ * These issues must be acceptable to callers, typically because they're only
+ * concerned about making suffix truncation as effective as possible without
+ * leaving excessive amounts of free space on either side of page split.
+ * Callers can rely on the fact that attributes considered equal here are
+ * definitely also equal according to _bt_keep_natts.
+ */
+int
+_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
+{
+       TupleDesc       itupdesc = RelationGetDescr(rel);
+       int                     keysz = IndexRelationGetNumberOfKeyAttributes(rel);
+       int                     keepnatts;
+
+       keepnatts = 1;
+       for (int attnum = 1; attnum <= keysz; attnum++)
+       {
+               Datum           datum1,
+                                       datum2;
+               bool            isNull1,
+                                       isNull2;
+               Form_pg_attribute att;
+
+               datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
+               datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
+               att = TupleDescAttr(itupdesc, attnum - 1);
+
+               if (isNull1 != isNull2)
+                       break;
+
+               if (!isNull1 &&
+                       !datumIsEqual(datum1, datum2, att->attbyval, att->attlen))
+                       break;
+
+               keepnatts++;
+       }
+
+       return keepnatts;
+}
+
  /*
   *  _bt_check_natts() -- Verify tuple has expected number of attributes.
   *
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h

index f985a056662425ee5eabd32aaa336f0240ae9c1f..e5876982a22ac006b20598587ac57f7e2b586028 100644 (file)
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -160,11 +160,15 @@ typedef struct BTMetaPageData
   * For pages above the leaf level, we use a fixed 70% fillfactor.
   * The fillfactor is applied during index build and when splitting
   * a rightmost page; when splitting non-rightmost pages we try to
- * divide the data equally.
+ * divide the data equally.  When splitting a page that's entirely
+ * filled with a single value (duplicates), the effective leaf-page
+ * fillfactor is 96%, regardless of whether the page is a rightmost
+ * page.
   */
  #define BTREE_MIN_FILLFACTOR           10
  #define BTREE_DEFAULT_FILLFACTOR       90
  #define BTREE_NONLEAF_FILLFACTOR       70
+#define BTREE_SINGLEVAL_FILLFACTOR     96
  
  /*
   *     In general, the btree code tries to localize its knowledge about
@@ -711,6 +715,13 @@ extern bool _bt_doinsert(Relation rel, IndexTuple itup,
  extern Buffer _bt_getstackbuf(Relation rel, BTStack stack);
  extern void _bt_finish_split(Relation rel, Buffer bbuf, BTStack stack);
  
+/*
+ * prototypes for functions in nbtsplitloc.c
+ */
+extern OffsetNumber _bt_findsplitloc(Relation rel, Page page,
+                                OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
+                                bool *newitemonleft);
+
  /*
   * prototypes for functions in nbtpage.c
   */
@@ -777,6 +788,8 @@ extern bool btproperty(Oid index_oid, int attno,
                    bool *res, bool *isnull);
  extern IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft,
                          IndexTuple firstright, BTScanInsert itup_key);
+extern int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft,
+                                       IndexTuple firstright);
  extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page,
                                 OffsetNumber offnum);
  extern void _bt_check_third_page(Relation rel, Relation heap,
author	Peter Geoghegan <pg@bowt.ie>
	Wed, 20 Mar 2019 17:12:19 +0000 (10:12 -0700)
committer	Peter Geoghegan <pg@bowt.ie>
	Wed, 20 Mar 2019 17:12:19 +0000 (10:12 -0700)
src/backend/access/nbtree/Makefile		patch \| blob \| history
src/backend/access/nbtree/README		patch \| blob \| history
src/backend/access/nbtree/nbtinsert.c		patch \| blob \| history
src/backend/access/nbtree/nbtsplitloc.c	[new file with mode: 0644]	patch \| blob
src/backend/access/nbtree/nbtutils.c		patch \| blob \| history
src/include/access/nbtree.h		patch \| blob \| history