]> granicus.if.org Git - postgresql/commitdiff
Compress GIN posting lists, for smaller index size.
authorHeikki Linnakangas <heikki.linnakangas@iki.fi>
Wed, 22 Jan 2014 16:51:48 +0000 (18:51 +0200)
committerHeikki Linnakangas <heikki.linnakangas@iki.fi>
Wed, 22 Jan 2014 17:20:58 +0000 (19:20 +0200)
GIN posting lists are now encoded using varbyte-encoding, which allows them
to fit in much smaller space than the straight ItemPointer array format used
before. The new encoding is used for both the lists stored in-line in entry
tree items, and in posting tree leaf pages.

To maintain backwards-compatibility and keep pg_upgrade working, the code
can still read old-style pages and tuples. Posting tree leaf pages in the
new format are flagged with GIN_COMPRESSED flag, to distinguish old and new
format pages. Likewise, entry tree tuples in the new format have a
GIN_ITUP_COMPRESSED flag set in a bit that was previously unused.

This patch bumps GIN_CURRENT_VERSION from 1 to 2. New indexes created with
version 9.4 will therefore have version number 2 in the metapage, while old
pg_upgraded indexes will have version 1. The code treats them the same, but
it might be come handy in the future, if we want to drop support for the
uncompressed format.

Alexander Korotkov and me. Reviewed by Tomas Vondra and Amit Langote.

13 files changed:
contrib/pgstattuple/expected/pgstattuple.out
src/backend/access/gin/README
src/backend/access/gin/ginbtree.c
src/backend/access/gin/gindatapage.c
src/backend/access/gin/ginentrypage.c
src/backend/access/gin/ginfast.c
src/backend/access/gin/ginget.c
src/backend/access/gin/gininsert.c
src/backend/access/gin/ginpostinglist.c
src/backend/access/gin/ginvacuum.c
src/backend/access/gin/ginxlog.c
src/backend/access/rmgrdesc/gindesc.c
src/include/access/gin_private.h

index 7aab96766539c45a4596c17b8bf40fd24e3984a1..d769f6d49431f9df7fcf25cbbb7e8a1e361c39ec 100644 (file)
@@ -123,6 +123,6 @@ create index test_ginidx on test using gin (b);
 select * from pgstatginindex('test_ginidx');
  version | pending_pages | pending_tuples 
 ---------+---------------+----------------
-       1 |             0 |              0
+       2 |             0 |              0
 (1 row)
 
index 1c1c7b643363b966b45f76da5e8d31a059b64a10..3f0c3e2de863fc2ba81e4f05fe9bff3912178ea1 100644 (file)
@@ -135,15 +135,15 @@ same category of null entry are merged into one index entry just as happens
 with ordinary key entries.
 
 * In a key entry at the btree leaf level, at the next SHORTALIGN boundary,
-there is an array of zero or more ItemPointers, which store the heap tuple
-TIDs for which the indexable items contain this key.  This is called the
-"posting list".  The TIDs in a posting list must appear in sorted order.
-If the list would be too big for the index tuple to fit on an index page,
-the ItemPointers are pushed out to a separate posting page or pages, and
-none appear in the key entry itself.  The separate pages are called a
-"posting tree"; they are organized as a btree of ItemPointer values.
-Note that in either case, the ItemPointers associated with a key can
-easily be read out in sorted order; this is relied on by the scan
+there is a list of item pointers, in compressed format (see Posting List
+Compression section), pointing to the heap tuples for which the indexable
+items contain this key. This is called the "posting list".
+
+If the list would be too big for the index tuple to fit on an index page, the
+ItemPointers are pushed out to a separate posting page or pages, and none
+appear in the key entry itself.  The separate pages are called a "posting
+tree" (see below); Note that in either case, the ItemPointers associated with
+a key can easily be read out in sorted order; this is relied on by the scan
 algorithms.
 
 * The index tuple header fields of a leaf key entry are abused as follows:
@@ -163,6 +163,11 @@ algorithms.
 
 * The posting list can be accessed with GinGetPosting(itup)
 
+* If GinITupIsCompressed(itup), the posting list is stored in compressed
+  format. Otherwise it is just an array of ItemPointers. New tuples are always
+  stored in compressed format, uncompressed items can be present if the
+  database was migrated from 9.3 or earlier version.
+
 2) Posting tree case:
 
 * ItemPointerGetBlockNumber(&itup->t_tid) contains the index block number
@@ -210,6 +215,76 @@ fit on one pending-list page must have those pages to itself, even if this
 results in wasting much of the space on the preceding page and the last
 page for the tuple.)
 
+Posting tree
+------------
+
+If a posting list is too large to store in-line in a key entry, a posting tree
+is created. A posting tree is a B-tree structure, where the ItemPointer is
+used as the key.
+
+Internal posting tree pages use the standard PageHeader and the same "opaque"
+struct as other GIN page, but do not contain regular index tuples. Instead,
+the contents of the page is an array of PostingItem structs. Each PostingItem
+consists of the block number of the child page, and the right bound of that
+child page, as an ItemPointer. The right bound of the page is stored right
+after the page header, before the PostingItem array.
+
+Posting tree leaf pages also use the standard PageHeader and opaque struct,
+and the right bound of the page is stored right after the page header,
+but the page content comprises of 0-32 compressed posting lists, and an
+additional array of regular uncompressed item pointers. The compressed posting
+lists are stored one after each other, between page header and pd_lower. The
+uncompressed array is stored between pd_upper and pd_special. The space
+between pd_lower and pd_upper is unused, which allows full-page images of
+posting tree leaf pages to skip the unused space in middle (buffer_std = true
+in XLogRecData). For historical reasons, this does not apply to internal
+pages, or uncompressed leaf pages migrated from earlier versions.
+
+The item pointers are stored in a number of independent compressed posting
+lists (also called segments), instead of one big one, to make random access
+to a given item pointer faster: to find an item in a compressed list, you
+have to read the list from the beginning, but when the items are split into
+multiple lists, you can first skip over to the list containing the item you're
+looking for, and read only that segment. Also, an update only needs to
+re-encode the affected segment.
+
+The uncompressed items array is used for insertions, to avoid re-encoding
+a compressed list on every update. If there is room on a page, an insertion
+simply inserts the new item to the right place in the uncompressed array.
+When a page becomes full, it is rewritten, merging all the uncompressed items
+are into the compressed lists.  When reading, the uncompressed array and the
+compressed lists are read in tandem, and merged into one stream of sorted
+item pointers.
+
+Posting List Compression
+------------------------
+
+To fit as many item pointers on a page as possible, posting tree leaf pages
+and posting lists stored inline in entry tree leaf tuples use a lightweight
+form of compression. We take advantage of the fact that the item pointers
+are stored in sorted order. Instead of storing the block and offset number of
+each item pointer separately, we store the difference from the previous item.
+That in itself doesn't do much, but it allows us to use so-called varbyte
+encoding to compress them.
+
+Varbyte encoding is a method to encode integers, allowing smaller numbers to
+take less space at the cost of larger numbers. Each integer is represented by
+variable number of bytes. High bit of each byte in varbyte encoding determines
+whether the next byte is still part of this number. Therefore, to read a single
+varbyte encoded number, you have to read bytes until you find a byte with the
+high bit not set.
+
+When encoding, the block and offset number forming the item pointer are
+combined into a single integer. The offset number is stored in the 11 low
+bits (see MaxHeapTuplesPerPageBits in ginpostinglist.c), and the block number
+is stored in the higher bits. That requires 43 bits in total, which
+conveniently fits in at most 6 bytes.
+
+A compressed posting list is passed around and stored on disk in a
+PackedPostingList struct. The first item in the list is stored uncompressed
+as a regular ItemPointerData, followed by the length of the list in bytes,
+followed by the packed items.
+
 Concurrency
 -----------
 
@@ -260,6 +335,36 @@ page-deletions safe; it stamps the deleted pages with an XID and keeps the
 deleted pages around with the right-link intact until all concurrent scans
 have finished.)
 
+Compatibility
+-------------
+
+Compression of TIDs was introduced in 9.4. Some GIN indexes could remain in
+uncompressed format because of pg_upgrade from 9.3 or earlier versions.
+For compatibility, old uncompressed format is also supported. Following
+rules are used to handle it:
+
+* GIN_ITUP_COMPRESSED flag marks index tuples that contain a posting list.
+This flag is stored in high bit of ItemPointerGetBlockNumber(&itup->t_tid).
+Use GinItupIsCompressed(itup) to check the flag.
+
+* Posting tree pages in the new format are marked with the GIN_COMPRESSED flag.
+  Macros GinPageIsCompressed(page) and GinPageSetCompressed(page) are used to
+  check and set this flag.
+
+* All scan operations check format of posting list add use corresponding code
+to read its content.
+
+* When updating an index tuple containing an uncompressed posting list, it
+will be replaced with new index tuple containing a compressed list.
+
+* When updating an uncompressed posting tree leaf page, it's compressed.
+
+* If vacuum finds some dead TIDs in uncompressed posting lists, they are
+converted into compressed posting lists. This assumes that the compressed
+posting list fits in the space occupied by the uncompressed list. IOW, we
+assume that the compressed version of the page, with the dead items removed,
+takes less space than the old uncompressed version.
+
 Limitations
 -----------
 
index 813f96a521bfb7b77fe83cdc864c6eac6e0264ab..c17c90fbc2ef82fc7df3833aeb22b9089d5fceb9 100644 (file)
@@ -325,9 +325,10 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
 {
        Page            page = BufferGetPage(stack->buffer);
        XLogRecData *payloadrdata;
-       bool            fit;
+       GinPlaceToPageRC rc;
        uint16          xlflags = 0;
        Page            childpage = NULL;
+       Page            newlpage = NULL, newrpage = NULL;
 
        if (GinPageIsData(page))
                xlflags |= GIN_INSERT_ISDATA;
@@ -345,16 +346,17 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
        }
 
        /*
-        * Try to put the incoming tuple on the page. If it doesn't fit,
-        * placeToPage method will return false and leave the page unmodified, and
-        * we'll have to split the page.
+        * Try to put the incoming tuple on the page. placeToPage will decide
+        * if the page needs to be split.
         */
-       START_CRIT_SECTION();
-       fit = btree->placeToPage(btree, stack->buffer, stack->off,
-                                                        insertdata, updateblkno,
-                                                        &payloadrdata);
-       if (fit)
+       rc = btree->placeToPage(btree, stack->buffer, stack,
+                                                       insertdata, updateblkno,
+                                                       &payloadrdata, &newlpage, &newrpage);
+       if (rc == UNMODIFIED)
+               return true;
+       else if (rc == INSERTED)
        {
+               /* placeToPage did START_CRIT_SECTION() */
                MarkBufferDirty(stack->buffer);
 
                /* An insert to an internal page finishes the split of the child. */
@@ -373,7 +375,6 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
 
                        xlrec.node = btree->index->rd_node;
                        xlrec.blkno = BufferGetBlockNumber(stack->buffer);
-                       xlrec.offset = stack->off;
                        xlrec.flags = xlflags;
 
                        rdata[0].buffer = InvalidBuffer;
@@ -415,20 +416,16 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
 
                return true;
        }
-       else
+       else if (rc == SPLIT)
        {
                /* Didn't fit, have to split */
                Buffer          rbuffer;
-               Page            newlpage;
                BlockNumber savedRightLink;
-               Page            rpage;
                XLogRecData rdata[2];
                ginxlogSplit data;
                Buffer          lbuffer = InvalidBuffer;
                Page            newrootpg = NULL;
 
-               END_CRIT_SECTION();
-
                rbuffer = GinNewBuffer(btree->index);
 
                /* During index build, count the new page */
@@ -443,12 +440,9 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
                savedRightLink = GinPageGetOpaque(page)->rightlink;
 
                /*
-                * newlpage is a pointer to memory page, it is not associated with a
-                * buffer. stack->buffer is not touched yet.
+                * newlpage and newrpage are pointers to memory pages, not associated
+                * with buffers. stack->buffer is not touched yet.
                 */
-               newlpage = btree->splitPage(btree, stack->buffer, rbuffer, stack->off,
-                                                                       insertdata, updateblkno,
-                                                                       &payloadrdata);
 
                data.node = btree->index->rd_node;
                data.rblkno = BufferGetBlockNumber(rbuffer);
@@ -481,8 +475,6 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
                else
                        rdata[0].next = payloadrdata;
 
-               rpage = BufferGetPage(rbuffer);
-
                if (stack->parent == NULL)
                {
                        /*
@@ -508,7 +500,7 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
                        data.lblkno = BufferGetBlockNumber(lbuffer);
                        data.flags |= GIN_SPLIT_ROOT;
 
-                       GinPageGetOpaque(rpage)->rightlink = InvalidBlockNumber;
+                       GinPageGetOpaque(newrpage)->rightlink = InvalidBlockNumber;
                        GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer);
 
                        /*
@@ -517,12 +509,12 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
                         * than overwriting the original page directly, so that we can still
                         * abort gracefully if this fails.)
                         */
-                       newrootpg = PageGetTempPage(rpage);
-                       GinInitPage(newrootpg, GinPageGetOpaque(newlpage)->flags & ~GIN_LEAF, BLCKSZ);
+                       newrootpg = PageGetTempPage(newrpage);
+                       GinInitPage(newrootpg, GinPageGetOpaque(newlpage)->flags & ~(GIN_LEAF | GIN_COMPRESSED), BLCKSZ);
 
                        btree->fillRoot(btree, newrootpg,
                                                        BufferGetBlockNumber(lbuffer), newlpage,
-                                                       BufferGetBlockNumber(rbuffer), rpage);
+                                                       BufferGetBlockNumber(rbuffer), newrpage);
                }
                else
                {
@@ -530,7 +522,7 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
                        data.rrlink = savedRightLink;
                        data.lblkno = BufferGetBlockNumber(stack->buffer);
 
-                       GinPageGetOpaque(rpage)->rightlink = savedRightLink;
+                       GinPageGetOpaque(newrpage)->rightlink = savedRightLink;
                        GinPageGetOpaque(newlpage)->flags |= GIN_INCOMPLETE_SPLIT;
                        GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer);
                }
@@ -550,16 +542,24 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
                START_CRIT_SECTION();
 
                MarkBufferDirty(rbuffer);
+               MarkBufferDirty(stack->buffer);
 
+               /*
+                * Restore the temporary copies over the real buffers. But don't free
+                * the temporary copies yet, WAL record data points to them.
+                */
                if (stack->parent == NULL)
                {
-                       PageRestoreTempPage(newlpage, BufferGetPage(lbuffer));
                        MarkBufferDirty(lbuffer);
-                       newlpage = newrootpg;
+                       memcpy(BufferGetPage(stack->buffer), newrootpg, BLCKSZ);
+                       memcpy(BufferGetPage(lbuffer), newlpage, BLCKSZ);
+                       memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ);
+               }
+               else
+               {
+                       memcpy(BufferGetPage(stack->buffer), newlpage, BLCKSZ);
+                       memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ);
                }
-
-               PageRestoreTempPage(newlpage, BufferGetPage(stack->buffer));
-               MarkBufferDirty(stack->buffer);
 
                /* write WAL record */
                if (RelationNeedsWAL(btree->index))
@@ -568,7 +568,7 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
 
                        recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT, rdata);
                        PageSetLSN(BufferGetPage(stack->buffer), recptr);
-                       PageSetLSN(rpage, recptr);
+                       PageSetLSN(BufferGetPage(rbuffer), recptr);
                        if (stack->parent == NULL)
                                PageSetLSN(BufferGetPage(lbuffer), recptr);
                }
@@ -582,6 +582,11 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
                if (stack->parent == NULL)
                        UnlockReleaseBuffer(lbuffer);
 
+               pfree(newlpage);
+               pfree(newrpage);
+               if (newrootpg)
+                       pfree(newrootpg);
+
                /*
                 * If we split the root, we're done. Otherwise the split is not
                 * complete until the downlink for the new page has been inserted to
@@ -592,6 +597,8 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
                else
                        return false;
        }
+       else
+               elog(ERROR, "unknown return code from GIN placeToPage method: %d", rc);
 }
 
 /*
index 164d3ba00e608c415fc83f6f192e053f1910259e..8504f4c7afc8b9e47783e2f1953a5f07712c2bd0 100644 (file)
@@ -1,7 +1,7 @@
 /*-------------------------------------------------------------------------
  *
  * gindatapage.c
- *       page utilities routines for the postgres inverted index access method.
+ *       routines for handling GIN posting tree pages.
  *
  *
  * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
 #include "postgres.h"
 
 #include "access/gin_private.h"
+#include "access/heapam_xlog.h"
+#include "lib/ilist.h"
 #include "miscadmin.h"
+#include "utils/memutils.h"
 #include "utils/rel.h"
 
 /*
- * Checks, should we move to right link...
- * Compares inserting itemp pointer with right bound of current page
+ * Size of the posting lists stored on leaf pages, in bytes. The code can
+ * deal with any size, but random access is more efficient when a number of
+ * smaller lists are stored, rather than one big list.
+ */
+#define GinPostingListSegmentMaxSize 256
+
+/*
+ * Existing posting lists smaller than this are recompressed, when inserting
+ * new items to page.
+ */
+#define GinPostingListSegmentMinSize 192
+
+/*
+ * At least this many items fit in a GinPostingListSegmentMaxSize-bytes
+ * long segment. This is used when estimating how much space is required
+ * for N items, at minimum.
+ */
+#define MinTuplesPerSegment ((GinPostingListSegmentMaxSize - 2) / 6)
+
+/*
+ * A working struct for manipulating a posting tree leaf page.
+ */
+typedef struct
+{
+       dlist_head      segments;               /* a list of leafSegmentInfos */
+
+       /*
+        * The following fields represent how the segments are split across
+        * pages, if a page split is required. Filled in by leafRepackItems.
+        */
+       dlist_node *lastleft;           /* last segment on left page */
+       int                     lsize;                  /* total size on left page */
+       int                     rsize;                  /* total size on right page */
+} disassembledLeaf;
+
+typedef struct
+{
+       dlist_node      node;           /* linked list pointers */
+
+       /*
+        * The following fields represent the items in this segment.
+        * If 'items' is not NULL, it contains a palloc'd array of the items
+        * in this segment. If 'seg' is not NULL, it contains the items in an
+        * already-compressed format. It can point to an on-disk page (!modified),
+        * or a palloc'd segment in memory. If both are set, they must represent
+        * the same items.
+        */
+       GinPostingList *seg;
+       ItemPointer items;
+       int                     nitems;                 /* # of items in 'items', if items != NULL */
+
+       bool            modified;               /* is this segment on page already? */
+} leafSegmentInfo;
+
+static ItemPointer dataLeafPageGetUncompressed(Page page, int *nitems);
+static void dataSplitPageInternal(GinBtree btree, Buffer origbuf,
+                                         GinBtreeStack *stack,
+                                         void *insertdata, BlockNumber updateblkno,
+                                         XLogRecData **prdata, Page *newlpage, Page *newrpage);
+
+static disassembledLeaf *disassembleLeaf(Page page);
+static bool leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining);
+static bool addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems);
+
+
+static void dataPlaceToPageLeafRecompress(Buffer buf,
+                                                         disassembledLeaf *leaf,
+                                                         XLogRecData **prdata);
+static void dataPlaceToPageLeafSplit(Buffer buf,
+                                                disassembledLeaf *leaf,
+                                                ItemPointerData lbound, ItemPointerData rbound,
+                                                XLogRecData **prdata, Page lpage, Page rpage);
+
+/*
+ * Read all TIDs from leaf data page to single uncompressed array.
+ */
+ItemPointer
+GinDataLeafPageGetItems(Page page, int *nitems)
+{
+       ItemPointer result;
+
+       if (GinPageIsCompressed(page))
+       {
+               GinPostingList *ptr = GinDataLeafPageGetPostingList(page);
+               Size            len = GinDataLeafPageGetPostingListSize(page);
+
+               result = ginPostingListDecodeAllSegments(ptr, len, nitems);
+       }
+       else
+       {
+               ItemPointer tmp = dataLeafPageGetUncompressed(page, nitems);
+
+               result = palloc((*nitems) * sizeof(ItemPointerData));
+               memcpy(result, tmp, (*nitems) * sizeof(ItemPointerData));
+       }
+
+       return result;
+}
+
+/*
+ * Places all TIDs from leaf data page to bitmap.
+ */
+int
+GinDataLeafPageGetItemsToTbm(Page page, TIDBitmap *tbm)
+{
+       ItemPointer uncompressed;
+       int                     nitems;
+
+       if (GinPageIsCompressed(page))
+       {
+               GinPostingList *segment = GinDataLeafPageGetPostingList(page);
+               Size            len = GinDataLeafPageGetPostingListSize(page);
+
+               nitems = ginPostingListDecodeAllSegmentsToTbm(segment, len, tbm);
+       }
+       else
+       {
+               uncompressed = dataLeafPageGetUncompressed(page, &nitems);
+
+               if (nitems > 0)
+                       tbm_add_tuples(tbm, uncompressed, nitems, false);
+       }
+
+       return nitems;
+}
+
+/*
+ * Get pointer to the uncompressed array of items on a pre-9.4 format
+ * uncompressed leaf page. The number of items in the array is returned in
+ * *nitems.
+ */
+static ItemPointer
+dataLeafPageGetUncompressed(Page page, int *nitems)
+{
+       ItemPointer items;
+
+       Assert(!GinPageIsCompressed(page));
+
+       /*
+        * In the old pre-9.4 page format, the whole page content is used for
+        * uncompressed items, and the number of items is stored in 'maxoff'
+        */
+       items = (ItemPointer) GinDataPageGetData(page);
+       *nitems = GinPageGetOpaque(page)->maxoff;
+
+       return items;
+}
+
+/*
+ * Check if we should follow the right link to find the item we're searching
+ * for.
+ *
+ * Compares inserting item pointer with the right bound of the current page.
  */
 static bool
 dataIsMoveRight(GinBtree btree, Page page)
@@ -34,8 +188,8 @@ dataIsMoveRight(GinBtree btree, Page page)
 }
 
 /*
- * Find correct PostingItem in non-leaf page. It supposed that page
- * correctly chosen and searching value SHOULD be on page
+ * Find correct PostingItem in non-leaf page. It is assumed that this is
+ * the correct page, and the searched value SHOULD be on the page.
  */
 static BlockNumber
 dataLocateItem(GinBtree btree, GinBtreeStack *stack)
@@ -102,63 +256,7 @@ dataLocateItem(GinBtree btree, GinBtreeStack *stack)
 }
 
 /*
- * Searches correct position for value on leaf page.
- * Page should be correctly chosen.
- * Returns true if value found on page.
- */
-static bool
-dataLocateLeafItem(GinBtree btree, GinBtreeStack *stack)
-{
-       Page            page = BufferGetPage(stack->buffer);
-       OffsetNumber low,
-                               high;
-       int                     result;
-
-       Assert(GinPageIsLeaf(page));
-       Assert(GinPageIsData(page));
-
-       if (btree->fullScan)
-       {
-               stack->off = FirstOffsetNumber;
-               return TRUE;
-       }
-
-       low = FirstOffsetNumber;
-       high = GinPageGetOpaque(page)->maxoff;
-
-       if (high < low)
-       {
-               stack->off = FirstOffsetNumber;
-               return false;
-       }
-
-       high++;
-
-       while (high > low)
-       {
-               OffsetNumber mid = low + ((high - low) / 2);
-
-               result = ginCompareItemPointers(&btree->itemptr,
-                                                                               GinDataPageGetItemPointer(page, mid));
-
-               if (result == 0)
-               {
-                       stack->off = mid;
-                       return true;
-               }
-               else if (result > 0)
-                       low = mid + 1;
-               else
-                       high = mid;
-       }
-
-       stack->off = high;
-       return false;
-}
-
-/*
- * Finds links to blkno on non-leaf page, returns
- * offset of PostingItem
+ * Find link to blkno on non-leaf page, returns offset of PostingItem
  */
 static OffsetNumber
 dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber storedOff)
@@ -203,7 +301,7 @@ dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber stor
 }
 
 /*
- * returns blkno of leftmost child
+ * Return blkno of leftmost child
  */
 static BlockNumber
 dataGetLeftMostPage(GinBtree btree, Page page)
@@ -219,36 +317,7 @@ dataGetLeftMostPage(GinBtree btree, Page page)
 }
 
 /*
- * add ItemPointer to a leaf page.
- */
-void
-GinDataPageAddItemPointer(Page page, ItemPointer data, OffsetNumber offset)
-{
-       OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
-       char       *ptr;
-
-       Assert(ItemPointerIsValid(data));
-       Assert(GinPageIsLeaf(page));
-
-       if (offset == InvalidOffsetNumber)
-       {
-               ptr = (char *) GinDataPageGetItemPointer(page, maxoff + 1);
-       }
-       else
-       {
-               ptr = (char *) GinDataPageGetItemPointer(page, offset);
-               if (maxoff + 1 - offset != 0)
-                       memmove(ptr + sizeof(ItemPointerData),
-                                       ptr,
-                                       (maxoff - offset + 1) * sizeof(ItemPointerData));
-       }
-       memcpy(ptr, data, sizeof(ItemPointerData));
-
-       GinPageGetOpaque(page)->maxoff++;
-}
-
-/*
- * add PostingItem to a non-leaf page.
+ * Add PostingItem to a non-leaf page.
  */
 void
 GinDataPageAddPostingItem(Page page, PostingItem *data, OffsetNumber offset)
@@ -266,7 +335,7 @@ GinDataPageAddPostingItem(Page page, PostingItem *data, OffsetNumber offset)
        else
        {
                ptr = (char *) GinDataPageGetPostingItem(page, offset);
-               if (maxoff + 1 - offset != 0)
+               if (offset != maxoff + 1)
                        memmove(ptr + sizeof(PostingItem),
                                        ptr,
                                        (maxoff - offset + 1) * sizeof(PostingItem));
@@ -277,7 +346,7 @@ GinDataPageAddPostingItem(Page page, PostingItem *data, OffsetNumber offset)
 }
 
 /*
- * Deletes posting item from non-leaf page
+ * Delete posting item from non-leaf page
  */
 void
 GinPageDeletePostingItem(Page page, OffsetNumber offset)
@@ -296,269 +365,725 @@ GinPageDeletePostingItem(Page page, OffsetNumber offset)
 }
 
 /*
- * checks space to install new value,
- * item pointer never deletes!
+ * Places keys to leaf data page and fills WAL record.
  */
-static bool
-dataIsEnoughSpace(GinBtree btree, Buffer buf, OffsetNumber off, void *insertdata)
+static GinPlaceToPageRC
+dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack,
+                                       void *insertdata, XLogRecData **prdata,
+                                       Page *newlpage, Page *newrpage)
 {
+       GinBtreeDataLeafInsertData *items = insertdata;
+       ItemPointer newItems = &items->items[items->curitem];
+       int                     maxitems = items->nitem - items->curitem;
        Page            page = BufferGetPage(buf);
+       int                     i;
+       ItemPointerData rbound;
+       ItemPointerData lbound;
+       bool            needsplit;
+       bool            append;
+       int                     segsize;
+       Size            freespace;
+       MemoryContext tmpCxt;
+       MemoryContext oldCxt;
+       disassembledLeaf *leaf;
+       leafSegmentInfo *lastleftinfo;
+       ItemPointerData maxOldItem;
+       ItemPointerData remaining;
 
        Assert(GinPageIsData(page));
 
-       if (GinPageIsLeaf(page))
-       {
-               GinBtreeDataLeafInsertData *items = insertdata;
+       rbound  = *GinDataPageGetRightBound(page);
 
-               if (GinPageRightMost(page) && off > GinPageGetOpaque(page)->maxoff)
+       /*
+        * Count how many of the new items belong to this page.
+        */
+       if (!GinPageRightMost(page))
+       {
+               for (i = 0; i < maxitems; i++)
                {
-                       if ((items->nitem - items->curitem) * sizeof(ItemPointerData) <= GinDataPageGetFreeSpace(page))
-                               return true;
+                       if (ginCompareItemPointers(&newItems[i], &rbound) > 0)
+                       {
+                               /*
+                                * This needs to go to some other location in the tree. (The
+                                * caller should've chosen the insert location so that at least
+                                * the first item goes here.)
+                                */
+                               Assert(i > 0);
+                               break;
+                       }
                }
-               else if (sizeof(ItemPointerData) <= GinDataPageGetFreeSpace(page))
-                       return true;
+               maxitems = i;
        }
-       else if (sizeof(PostingItem) <= GinDataPageGetFreeSpace(page))
-               return true;
 
-       return false;
-}
+       /*
+        * The following operations do quite a lot of small memory allocations,
+        * create a temporary memory context so that we don't need to keep track
+        * of them individually.
+        */
+       tmpCxt = AllocSetContextCreate(CurrentMemoryContext,
+                                                                  "Gin split temporary context",
+                                                                  ALLOCSET_DEFAULT_MINSIZE,
+                                                                  ALLOCSET_DEFAULT_INITSIZE,
+                                                                  ALLOCSET_DEFAULT_MAXSIZE);
+       oldCxt = MemoryContextSwitchTo(tmpCxt);
 
-/*
- * Places keys to page and fills WAL record. In case leaf page and
- * build mode puts all ItemPointers to page.
- *
- * If none of the keys fit, returns false without modifying the page.
- *
- * On insertion to an internal node, in addition to inserting the given item,
- * the downlink of the existing item at 'off' is updated to point to
- * 'updateblkno'.
- */
-static bool
-dataPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off,
-                               void *insertdata, BlockNumber updateblkno,
-                               XLogRecData **prdata)
-{
-       Page            page = BufferGetPage(buf);
-       /* these must be static so they can be returned to caller */
-       static XLogRecData rdata[2];
+       leaf = disassembleLeaf(page);
 
-       /* quick exit if it doesn't fit */
-       if (!dataIsEnoughSpace(btree, buf, off, insertdata))
-               return false;
+       /*
+        * Are we appending to the end of the page? IOW, are all the new items
+        * larger than any of the existing items.
+        */
+       if (!dlist_is_empty(&leaf->segments))
+       {
+               lastleftinfo = dlist_container(leafSegmentInfo, node,
+                                                                          dlist_tail_node(&leaf->segments));
+               if (!lastleftinfo->items)
+                       lastleftinfo->items = ginPostingListDecode(lastleftinfo->seg,
+                                                                                                          &lastleftinfo->nitems);
+               maxOldItem = lastleftinfo->items[lastleftinfo->nitems - 1];
+               if (ginCompareItemPointers(&newItems[0], &maxOldItem) >= 0)
+                       append = true;
+               else
+                       append = false;
+       }
+       else
+       {
+               ItemPointerSetMin(&maxOldItem);
+               append = true;
+       }
 
-       *prdata = rdata;
-       Assert(GinPageIsData(page));
+       /*
+        * If we're appending to the end of the page, we will append as many items
+        * as we can fit (after splitting), and stop when the pages becomes full.
+        * Otherwise we have to limit the number of new items to insert, because
+        * once we start packing we can't just stop when we run out of space,
+        * because we must make sure that all the old items still fit.
+        */
+       if (GinPageIsCompressed(page))
+               freespace = GinDataLeafPageGetFreeSpace(page);
+       else
+               freespace = 0;
+       if (append)
+       {
+               /*
+                * Even when appending, trying to append more items than will fit is
+                * not completely free, because we will merge the new items and old
+                * items into an array below. In the best case, every new item fits in
+                * a single byte, and we can use all the free space on the old page as
+                * well as the new page. For simplicity, ignore segment overhead etc.
+                */
+               maxitems = Min(maxitems, freespace + GinDataLeafMaxContentSize);
+       }
+       else
+       {
+               /*
+                * Calculate a conservative estimate of how many new items we can fit
+                * on the two pages after splitting.
+                *
+                * We can use any remaining free space on the old page to store full
+                * segments, as well as the new page. Each full-sized segment can hold
+                * at least MinTuplesPerSegment items
+                */
+               int                     nnewsegments;
 
-       /* Update existing downlink to point to next page (on internal page) */
-       if (!GinPageIsLeaf(page))
+               nnewsegments = freespace / GinPostingListSegmentMaxSize;
+               nnewsegments += GinDataLeafMaxContentSize / GinPostingListSegmentMaxSize;
+               maxitems = Min(maxitems, nnewsegments * MinTuplesPerSegment);
+       }
+
+       /* Add the new items to the segments */
+       if (!addItemsToLeaf(leaf, newItems, maxitems))
        {
-               PostingItem *pitem = GinDataPageGetPostingItem(page, off);
+                /* all items were duplicates, we have nothing to do */
+               items->curitem += maxitems;
+
+               MemoryContextSwitchTo(oldCxt);
+               MemoryContextDelete(tmpCxt);
 
-               PostingItemSetBlockNumber(pitem, updateblkno);
+               return UNMODIFIED;
        }
 
-       if (GinPageIsLeaf(page))
+       /*
+        * Pack the items back to compressed segments, ready for writing to disk.
+        */
+       needsplit = leafRepackItems(leaf, &remaining);
+
+       /*
+        * Did all the new items fit?
+        *
+        * If we're appending, it's OK if they didn't. But as a sanity check,
+        * verify that all the old items fit.
+        */
+       if (ItemPointerIsValid(&remaining))
+       {
+               if (!append || ItemPointerCompare(&maxOldItem, &remaining) >= 0)
+                       elog(ERROR, "could not split GIN page; all old items didn't fit");
+
+               /* Count how many of the new items did fit. */
+               for (i = 0; i < maxitems; i++)
+               {
+                       if (ginCompareItemPointers(&newItems[i], &remaining) >= 0)
+                               break;
+               }
+               if (i == 0)
+                       elog(ERROR, "could not split GIN page; no new items fit");
+               maxitems = i;
+       }
+
+       if (!needsplit)
        {
-               GinBtreeDataLeafInsertData *items = insertdata;
-               static ginxlogInsertDataLeaf data;
-               uint32          savedPos = items->curitem;
+               /*
+                * Great, all the items fit on a single page. Write the segments to
+                * the page, and WAL-log appropriately.
+                *
+                * Once we start modifying the page, there's no turning back. The
+                * caller is responsible for calling END_CRIT_SECTION() after writing
+                * the WAL record.
+                */
+               START_CRIT_SECTION();
+               dataPlaceToPageLeafRecompress(buf, leaf, prdata);
 
-               if (GinPageRightMost(page) && off > GinPageGetOpaque(page)->maxoff)
+               if (append)
+                       elog(DEBUG2, "appended %d new items to block %u; %d bytes (%d to go)",
+                                maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize,
+                                items->nitem - items->curitem - maxitems);
+               else
+                       elog(DEBUG2, "inserted %d new items to block %u; %d bytes (%d to go)",
+                                maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize,
+                                items->nitem - items->curitem - maxitems);
+       }
+       else
+       {
+               /*
+                * Had to split.
+                *
+                * We already divided the segments between the left and the right
+                * page. The left page was filled as full as possible, and the rest
+                * overflowed to the right page. When building a new index, that's
+                * good, because the table is scanned from beginning to end and there
+                * won't be any more insertions to the left page during the build.
+                * This packs the index as tight as possible. But otherwise, split
+                * 50/50, by moving segments from the left page to the right page
+                * until they're balanced.
+                *
+                * As a further heuristic, when appending items to the end of the
+                * page, split 75/25, one the assumption that subsequent insertions
+                * will probably also go to the end. This packs the index somewhat
+                * tighter when appending to a table, which is very common.
+                */
+               if (!btree->isBuild)
                {
-                       /* usually, create index... */
-                       while (items->curitem < items->nitem)
+                       while (dlist_has_prev(&leaf->segments, leaf->lastleft))
                        {
-                               GinDataPageAddItemPointer(page, items->items + items->curitem, off);
-                               off++;
-                               items->curitem++;
+                               lastleftinfo = dlist_container(leafSegmentInfo, node, leaf->lastleft);
+
+                               segsize = SizeOfGinPostingList(lastleftinfo->seg);
+                               if (append)
+                               {
+                                       if ((leaf->lsize - segsize) - (leaf->lsize - segsize) < BLCKSZ / 4)
+                                               break;
+                               }
+                               else
+                               {
+                                       if ((leaf->lsize - segsize) - (leaf->rsize + segsize) < 0)
+                                               break;
+                               }
+
+                                /* don't consider segments moved to right as unmodified */
+                               lastleftinfo->modified = true;
+                               leaf->lsize -= segsize;
+                               leaf->rsize += segsize;
+                               leaf->lastleft = dlist_prev_node(&leaf->segments, leaf->lastleft);
                        }
-                       data.nitem = items->curitem - savedPos;
                }
+               Assert(leaf->lsize <= GinDataLeafMaxContentSize);
+               Assert(leaf->rsize <= GinDataLeafMaxContentSize);
+
+               /*
+                * Fetch the max item in the left page's last segment; it becomes the
+                * right bound of the page.
+                */
+               lastleftinfo = dlist_container(leafSegmentInfo, node, leaf->lastleft);
+               if (!lastleftinfo->items)
+                       lastleftinfo->items = ginPostingListDecode(lastleftinfo->seg,
+                                                                                                          &lastleftinfo->nitems);
+               lbound = lastleftinfo->items[lastleftinfo->nitems - 1];
+
+               *newlpage = MemoryContextAlloc(oldCxt, BLCKSZ);
+               *newrpage = MemoryContextAlloc(oldCxt, BLCKSZ);
+
+               dataPlaceToPageLeafSplit(buf, leaf, lbound, rbound,
+                                                                prdata, *newlpage, *newrpage);
+
+               Assert(GinPageRightMost(page) ||
+                          ginCompareItemPointers(GinDataPageGetRightBound(*newlpage),
+                                                                         GinDataPageGetRightBound(*newrpage)) < 0);
+
+               if (append)
+                       elog(DEBUG2, "appended %d items to block %u; split %d/%d (%d to go)",
+                                maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, (int) leaf->rsize,
+                                items->nitem - items->curitem - maxitems);
+               else
+                       elog(DEBUG2, "inserted %d items to block %u; split %d/%d (%d to go)",
+                                maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, (int) leaf->rsize,
+                                items->nitem - items->curitem - maxitems);
+       }
+
+       MemoryContextSwitchTo(oldCxt);
+       MemoryContextDelete(tmpCxt);
+
+       items->curitem += maxitems;
+
+       return needsplit ? SPLIT : INSERTED;
+}
+
+/*
+ * Vacuum a posting tree leaf page.
+ */
+void
+ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs)
+{
+       Page            page = BufferGetPage(buffer);
+       disassembledLeaf *leaf;
+       bool            removedsomething = false;
+       dlist_iter      iter;
+
+       leaf = disassembleLeaf(page);
+
+       /* Vacuum each segment. */
+       dlist_foreach(iter, &leaf->segments)
+       {
+               leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur);
+               int                     oldsegsize;
+               ItemPointer cleaned;
+               int                     ncleaned;
+
+               if (!seginfo->items)
+                       seginfo->items = ginPostingListDecode(seginfo->seg,
+                                                                                                 &seginfo->nitems);
+               if (seginfo->seg)
+                       oldsegsize = SizeOfGinPostingList(seginfo->seg);
                else
+                       oldsegsize = GinDataLeafMaxContentSize;
+
+               cleaned = ginVacuumItemPointers(gvs,
+                                                                               seginfo->items,
+                                                                               seginfo->nitems,
+                                                                               &ncleaned);
+               pfree(seginfo->items);
+               seginfo->items = NULL;
+               seginfo->nitems = 0;
+               if (cleaned)
                {
-                       GinDataPageAddItemPointer(page, items->items + items->curitem, off);
-                       items->curitem++;
-                       data.nitem = 1;
+                       if (ncleaned > 0)
+                       {
+                               int                     npacked;
+
+                               seginfo->seg = ginCompressPostingList(cleaned,
+                                                                                                         ncleaned,
+                                                                                                         oldsegsize,
+                                                                                                         &npacked);
+                               /* Removing an item never increases the size of the segment */
+                               if (npacked != ncleaned)
+                                       elog(ERROR, "could not fit vacuumed posting list");
+                       }
+                       else
+                       {
+                               seginfo->seg = NULL;
+                               seginfo->items = NULL;
+                       }
+                       seginfo->nitems = ncleaned;
+                       seginfo->modified = true;
+
+                       removedsomething = true;
                }
+       }
 
-               rdata[0].buffer = buf;
-               rdata[0].buffer_std = false;
-               rdata[0].data = (char *) &data;
-               rdata[0].len = offsetof(ginxlogInsertDataLeaf, items);
-               rdata[0].next = &rdata[1];
+       /*
+        * If we removed any items, reconstruct the page from the pieces.
+        *
+        * We don't try to re-encode the segments here, even though some of them
+        * might be really small, now that we've removed some items from them.
+        * It seems like a waste of effort, as there isn't really any benefit from
+        * larger segments per se; larger segments only help you to pack more
+        * items in the same space. We might as well delay doing that until the
+        * next insertion, which will need to re-encode at least part of the page
+        * anyway.
+        *
+        * Also note if the page was in uncompressed, pre-9.4 format before, it
+        * is now represented as one huge segment that contains all the items.
+        * It might make sense to split that, to speed up random access, but we
+        * don't bother. You'll have to REINDEX anyway if you want the full gain
+        * of the new tighter index format.
+        */
+       if (removedsomething)
+       {
+               XLogRecData *payloadrdata;
 
-               rdata[1].buffer = buf;
-               rdata[1].buffer_std = false;
-               rdata[1].data = (char *) &items->items[savedPos];
-               rdata[1].len = sizeof(ItemPointerData) * data.nitem;
-               rdata[1].next = NULL;
+               START_CRIT_SECTION();
+               dataPlaceToPageLeafRecompress(buffer, leaf, &payloadrdata);
+
+               MarkBufferDirty(buffer);
+
+               if (RelationNeedsWAL(indexrel))
+               {
+                       XLogRecPtr      recptr;
+                       XLogRecData rdata;
+                       ginxlogVacuumDataLeafPage xlrec;
+
+                       xlrec.node = indexrel->rd_node;
+                       xlrec.blkno = BufferGetBlockNumber(buffer);
+
+                       rdata.buffer = InvalidBuffer;
+                       rdata.data = (char *) &xlrec;
+                       rdata.len = offsetof(ginxlogVacuumDataLeafPage, data);
+                       rdata.next = payloadrdata;
+
+                       recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_VACUUM_DATA_LEAF_PAGE, &rdata);
+                       PageSetLSN(page, recptr);
+               }
+
+               END_CRIT_SECTION();
        }
-       else
+}
+
+/*
+ * Assemble a disassembled posting tree leaf page back to a buffer.
+ *
+ * *prdata is filled with WAL information about this operation. The caller
+ * is responsible for inserting to the WAL, along with any other information
+ * about the operation that triggered this recompression.
+ */
+static void
+dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf,
+                                                         XLogRecData **prdata)
+{
+       Page            page = BufferGetPage(buf);
+       char       *ptr;
+       int                     newsize;
+       int                     unmodifiedsize;
+       bool            modified = false;
+       dlist_iter      iter;
+       /*
+        * these must be static so they can be returned to caller (no pallocs
+        * since we're in a critical section!)
+        */
+       static ginxlogRecompressDataLeaf recompress_xlog;
+       static XLogRecData rdata[2];
+
+       ptr = (char *) GinDataLeafPageGetPostingList(page);
+       newsize = 0;
+       unmodifiedsize = 0;
+       modified = false;
+       dlist_foreach(iter, &leaf->segments)
        {
-               PostingItem *pitem = insertdata;
+               leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur);
+               int                     segsize;
+
+               if (seginfo->modified)
+                       modified = true;
+
+               /*
+                * Nothing to do with empty segments, except keep track if they've been
+                * modified
+                */
+               if (seginfo->seg == NULL)
+               {
+                       Assert(seginfo->items == NULL);
+                       continue;
+               }
 
-               GinDataPageAddPostingItem(page, pitem, off);
+               segsize = SizeOfGinPostingList(seginfo->seg);
 
-               rdata[0].buffer = buf;
-               rdata[0].buffer_std = false;
-               rdata[0].data = (char *) pitem;
-               rdata[0].len = sizeof(PostingItem);
-               rdata[0].next = NULL;
+               if (!modified)
+                       unmodifiedsize += segsize;
+               else
+                       memcpy(ptr, seginfo->seg, segsize);
+               ptr += segsize;
+               newsize += segsize;
        }
+       Assert(newsize < GinDataLeafMaxContentSize);
+       GinDataLeafPageSetPostingListSize(page, newsize);
+       GinPageSetCompressed(page);      /* in case it was in pre-9.4 format before */
+
+       /* Put WAL data */
+       recompress_xlog.length = (uint16) newsize;
+       recompress_xlog.unmodifiedsize = unmodifiedsize;
+
+       rdata[0].buffer = InvalidBuffer;
+       rdata[0].data = (char *) &recompress_xlog;
+       rdata[0].len = offsetof(ginxlogRecompressDataLeaf, newdata);
+       rdata[0].next = &rdata[1];
+
+       rdata[1].buffer = buf;
+       rdata[1].buffer_std = TRUE;
+       rdata[1].data = ((char *) GinDataLeafPageGetPostingList(page)) + unmodifiedsize;
+       rdata[1].len = newsize - unmodifiedsize;
+       rdata[1].next = NULL;
 
-       return true;
+       *prdata = rdata;
 }
 
 /*
- * split page and fills WAL record. original buffer(lbuf) leaves untouched,
- * returns shadow page of lbuf filled new data. In leaf page and build mode puts all
- * ItemPointers to pages. Also, in build mode splits data by way to full fulled
- * left page
+ * Like dataPlaceToPageLeafRecompress, but writes the disassembled leaf
+ * segments to two pages instead of one.
+ *
+ * This is different from the non-split cases in that this does not modify
+ * the original page directly, but to temporary in-memory copies of the new
+ * left and right pages.
  */
-static Page
-dataSplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off,
-                         void *insertdata, BlockNumber updateblkno, XLogRecData **prdata)
+static void
+dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf,
+                                                ItemPointerData lbound, ItemPointerData rbound,
+                                                XLogRecData **prdata, Page lpage, Page rpage)
 {
        char       *ptr;
-       OffsetNumber separator;
-       ItemPointer bound;
-       Page            lpage = PageGetTempPageCopy(BufferGetPage(lbuf));
-       bool            isleaf = GinPageIsLeaf(lpage);
-       ItemPointerData oldbound = *GinDataPageGetRightBound(lpage);
-       int                     sizeofitem = GinSizeOfDataPageItem(lpage);
-       OffsetNumber maxoff = GinPageGetOpaque(lpage)->maxoff;
-       Page            rpage = BufferGetPage(rbuf);
-       Size            pageSize = PageGetPageSize(lpage);
-       Size            freeSpace;
-
+       int                     segsize;
+       int                     lsize;
+       int                     rsize;
+       dlist_node *node;
+       dlist_node *firstright;
+       leafSegmentInfo *seginfo;
        /* these must be static so they can be returned to caller */
-       static ginxlogSplitData data;
-       static XLogRecData rdata[2];
-       static char vector[2 * BLCKSZ];
-
-       GinInitPage(rpage, GinPageGetOpaque(lpage)->flags, pageSize);
-       freeSpace = GinDataPageGetFreeSpace(rpage);
+       static ginxlogSplitDataLeaf split_xlog;
+       static XLogRecData rdata[3];
 
-       *prdata = rdata;
+       /* Initialize temporary pages to hold the new left and right pages */
+       GinInitPage(lpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ);
+       GinInitPage(rpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ);
 
-       /* Update existing downlink to point to next page (on internal page) */
-       if (!isleaf)
+       /*
+        * Copy the segments that go to the left page.
+        *
+        * XXX: We should skip copying the unmodified part of the left page, like
+        * we do when recompressing.
+        */
+       lsize = 0;
+       ptr = (char *) GinDataLeafPageGetPostingList(lpage);
+       firstright = dlist_next_node(&leaf->segments, leaf->lastleft);
+       for (node = dlist_head_node(&leaf->segments);
+                node != firstright;
+                node = dlist_next_node(&leaf->segments, node))
        {
-               PostingItem *pitem = GinDataPageGetPostingItem(lpage, off);
+               seginfo = dlist_container(leafSegmentInfo, node, node);
+               segsize = SizeOfGinPostingList(seginfo->seg);
 
-               PostingItemSetBlockNumber(pitem, updateblkno);
+               memcpy(ptr, seginfo->seg, segsize);
+               ptr += segsize;
+               lsize += segsize;
        }
-
-       if (isleaf)
+       Assert(lsize == leaf->lsize);
+       GinDataLeafPageSetPostingListSize(lpage, lsize);
+       *GinDataPageGetRightBound(lpage) = lbound;
+
+       /* Copy the segments that go to the right page */
+       ptr = (char *) GinDataLeafPageGetPostingList(rpage);
+       rsize = 0;
+       for (node = firstright;
+                ;
+                node = dlist_next_node(&leaf->segments, node))
        {
-               memcpy(vector,
-                          GinDataPageGetItemPointer(lpage, FirstOffsetNumber),
-                          maxoff * sizeof(ItemPointerData));
+               seginfo = dlist_container(leafSegmentInfo, node, node);
+               segsize = SizeOfGinPostingList(seginfo->seg);
+
+               memcpy(ptr, seginfo->seg, segsize);
+               ptr += segsize;
+               rsize += segsize;
+
+               if (!dlist_has_next(&leaf->segments, node))
+                       break;
        }
-       else
+       Assert(rsize == leaf->rsize);
+       GinDataLeafPageSetPostingListSize(rpage, rsize);
+       *GinDataPageGetRightBound(rpage) = rbound;
+
+       /* Create WAL record */
+       split_xlog.lsize = lsize;
+       split_xlog.rsize = rsize;
+       split_xlog.lrightbound = lbound;
+       split_xlog.rrightbound = rbound;
+
+       rdata[0].buffer = InvalidBuffer;
+       rdata[0].data = (char *) &split_xlog;
+       rdata[0].len = sizeof(ginxlogSplitDataLeaf);
+       rdata[0].next = &rdata[1];
+
+       rdata[1].buffer = InvalidBuffer;
+       rdata[1].data = (char *) GinDataLeafPageGetPostingList(lpage);
+       rdata[1].len = lsize;
+       rdata[1].next = &rdata[2];
+
+       rdata[2].buffer = InvalidBuffer;
+       rdata[2].data = (char *) GinDataLeafPageGetPostingList(rpage);
+       rdata[2].len = rsize;
+       rdata[2].next = NULL;
+
+       *prdata = rdata;
+}
+
+/*
+ * Place a PostingItem to page, and fill a WAL record.
+ *
+ * If the item doesn't fit, returns false without modifying the page.
+ *
+ * In addition to inserting the given item, the downlink of the existing item
+ * at 'off' is updated to point to 'updateblkno'.
+ */
+static GinPlaceToPageRC
+dataPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack,
+                                               void *insertdata, BlockNumber updateblkno,
+                                               XLogRecData **prdata, Page *newlpage, Page *newrpage)
+{
+       Page            page = BufferGetPage(buf);
+       OffsetNumber off = stack->off;
+       PostingItem *pitem;
+       /* these must be static so they can be returned to caller */
+       static XLogRecData rdata;
+       static ginxlogInsertDataInternal data;
+
+       /* split if we have to */
+       if (GinNonLeafDataPageGetFreeSpace(page) < sizeof(PostingItem))
        {
-               memcpy(vector,
-                          GinDataPageGetPostingItem(lpage, FirstOffsetNumber),
-                          maxoff * sizeof(PostingItem));
+               dataSplitPageInternal(btree, buf, stack, insertdata, updateblkno,
+                                                         prdata, newlpage, newrpage);
+               return SPLIT;
        }
 
-       if (isleaf && GinPageRightMost(lpage) && off > GinPageGetOpaque(lpage)->maxoff)
-       {
-               /* append new items to the end */
-               GinBtreeDataLeafInsertData *items = insertdata;
+       *prdata = &rdata;
+       Assert(GinPageIsData(page));
 
-               while (items->curitem < items->nitem &&
-                          maxoff * sizeof(ItemPointerData) < 2 * (freeSpace - sizeof(ItemPointerData)))
-               {
-                       memcpy(vector + maxoff * sizeof(ItemPointerData),
-                                  items->items + items->curitem,
-                                  sizeof(ItemPointerData));
-                       maxoff++;
-                       items->curitem++;
-               }
-       }
+       START_CRIT_SECTION();
+
+       /* Update existing downlink to point to next page (on internal page) */
+       pitem = GinDataPageGetPostingItem(page, off);
+       PostingItemSetBlockNumber(pitem, updateblkno);
+
+       /* Add new item */
+       pitem = (PostingItem *) insertdata;
+       GinDataPageAddPostingItem(page, pitem, off);
+
+       data.offset = off;
+       data.newitem = *pitem;
+
+       rdata.buffer = buf;
+       rdata.buffer_std = false;
+       rdata.data = (char *) &data;
+       rdata.len = sizeof(ginxlogInsertDataInternal);
+       rdata.next = NULL;
+
+       return INSERTED;
+}
+
+/*
+ * Places an item (or items) to a posting tree. Calls relevant function of
+ * internal of leaf page because they are handled very differently.
+ */
+static GinPlaceToPageRC
+dataPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack,
+                               void *insertdata, BlockNumber updateblkno,
+                               XLogRecData **prdata,
+                               Page *newlpage, Page *newrpage)
+{
+       Page            page = BufferGetPage(buf);
+
+       Assert(GinPageIsData(page));
+
+       if (GinPageIsLeaf(page))
+               return dataPlaceToPageLeaf(btree, buf, stack, insertdata,
+                                                                  prdata, newlpage, newrpage);
        else
-       {
-               ptr = vector + (off - 1) * sizeofitem;
-               if (maxoff + 1 - off != 0)
-                       memmove(ptr + sizeofitem, ptr, (maxoff - off + 1) * sizeofitem);
-               if (isleaf)
-               {
-                       GinBtreeDataLeafInsertData *items = insertdata;
+               return dataPlaceToPageInternal(btree, buf, stack,
+                                                                          insertdata, updateblkno,
+                                                                          prdata, newlpage, newrpage);
+}
 
-                       memcpy(ptr, items->items + items->curitem, sizeofitem);
-                       items->curitem++;
-               }
-               else
-               {
-                       PostingItem *pitem = insertdata;
+/*
+ * Split page and fill WAL record. Returns a new temp buffer filled with data
+ * that should go to the left page. The original buffer is left untouched.
+ */
+static void
+dataSplitPageInternal(GinBtree btree, Buffer origbuf,
+                                         GinBtreeStack *stack,
+                                         void *insertdata, BlockNumber updateblkno,
+                                         XLogRecData **prdata, Page *newlpage, Page *newrpage)
+{
+       Page            oldpage = BufferGetPage(origbuf);
+       OffsetNumber off = stack->off;
+       int                     nitems = GinPageGetOpaque(oldpage)->maxoff;
+       Size            pageSize = PageGetPageSize(oldpage);
+       ItemPointerData oldbound = *GinDataPageGetRightBound(oldpage);
+       ItemPointer     bound;
+       Page            lpage;
+       Page            rpage;
+       OffsetNumber separator;
 
-                       memcpy(ptr, pitem, sizeofitem);
-               }
+       /* these must be static so they can be returned to caller */
+       static ginxlogSplitDataInternal data;
+       static XLogRecData rdata[4];
+       static PostingItem allitems[(BLCKSZ / sizeof(PostingItem)) + 1];
 
-               maxoff++;
-       }
+       lpage = PageGetTempPage(oldpage);
+       rpage = PageGetTempPage(oldpage);
+       GinInitPage(lpage, GinPageGetOpaque(oldpage)->flags, pageSize);
+       GinInitPage(rpage, GinPageGetOpaque(oldpage)->flags, pageSize);
+
+       *prdata = rdata;
 
        /*
-        * we assume that during index creation the table scanned from beginning
-        * to end, so ItemPointers are in monotonically increasing order.
+        * First construct a new list of PostingItems, which includes all the
+        * old items, and the new item.
         */
-       if (btree->isBuild && GinPageRightMost(lpage))
-               separator = freeSpace / sizeofitem;
-       else
-               separator = maxoff / 2;
+       memcpy(allitems, GinDataPageGetPostingItem(oldpage, FirstOffsetNumber),
+                  (off - 1) * sizeof(PostingItem));
 
-       GinInitPage(rpage, GinPageGetOpaque(lpage)->flags, pageSize);
-       GinInitPage(lpage, GinPageGetOpaque(rpage)->flags, pageSize);
+       allitems[off - 1] = *((PostingItem *) insertdata);
+       memcpy(&allitems[off], GinDataPageGetPostingItem(oldpage, off),
+                  (nitems - (off - 1)) * sizeof(PostingItem));
+       nitems++;
 
-       if (isleaf)
-               memcpy(GinDataPageGetItemPointer(lpage, FirstOffsetNumber),
-                          vector, separator * sizeof(ItemPointerData));
-       else
-               memcpy(GinDataPageGetPostingItem(lpage, FirstOffsetNumber),
-                          vector, separator * sizeof(PostingItem));
+       /* Update existing downlink to point to next page */
+       PostingItemSetBlockNumber(&allitems[off], updateblkno);
 
-       GinPageGetOpaque(lpage)->maxoff = separator;
-       if (isleaf)
-               memcpy(GinDataPageGetItemPointer(rpage, FirstOffsetNumber),
-                          vector + separator * sizeof(ItemPointerData),
-                          (maxoff - separator) * sizeof(ItemPointerData));
+       /*
+        * When creating a new index, fit as many tuples as possible on the left
+        * page, on the assumption that the table is scanned from beginning to
+        * end. This packs the index as tight as possible.
+        */
+       if (btree->isBuild && GinPageRightMost(oldpage))
+               separator = GinNonLeafDataPageGetFreeSpace(rpage) / sizeof(PostingItem);
        else
-               memcpy(GinDataPageGetPostingItem(rpage, FirstOffsetNumber),
-                          vector + separator * sizeof(PostingItem),
-                          (maxoff - separator) * sizeof(PostingItem));
+               separator = nitems / 2;
 
-       GinPageGetOpaque(rpage)->maxoff = maxoff - separator;
+       memcpy(GinDataPageGetPostingItem(lpage, FirstOffsetNumber), allitems, separator * sizeof(PostingItem));
+       GinPageGetOpaque(lpage)->maxoff = separator;
+       memcpy(GinDataPageGetPostingItem(rpage, FirstOffsetNumber),
+                &allitems[separator], (nitems - separator) * sizeof(PostingItem));
+       GinPageGetOpaque(rpage)->maxoff = nitems - separator;
 
        /* set up right bound for left page */
        bound = GinDataPageGetRightBound(lpage);
-       if (GinPageIsLeaf(lpage))
-               *bound = *GinDataPageGetItemPointer(lpage,
-                                                                                       GinPageGetOpaque(lpage)->maxoff);
-       else
-               *bound = GinDataPageGetPostingItem(lpage,
-                                                                          GinPageGetOpaque(lpage)->maxoff)->key;
+       *bound = GinDataPageGetPostingItem(lpage,
+                                                                 GinPageGetOpaque(lpage)->maxoff)->key;
 
        /* set up right bound for right page */
-       bound = GinDataPageGetRightBound(rpage);
-       *bound = oldbound;
+       *GinDataPageGetRightBound(rpage) = oldbound;
 
        data.separator = separator;
-       data.nitem = maxoff;
+       data.nitem = nitems;
        data.rightbound = oldbound;
 
        rdata[0].buffer = InvalidBuffer;
        rdata[0].data = (char *) &data;
-       rdata[0].len = sizeof(ginxlogSplitData);
+       rdata[0].len = sizeof(ginxlogSplitDataInternal);
        rdata[0].next = &rdata[1];
 
        rdata[1].buffer = InvalidBuffer;
-       rdata[1].data = vector;
-       rdata[1].len = maxoff * sizeofitem;
+       rdata[1].data = (char *) allitems;
+       rdata[1].len = nitems * sizeof(PostingItem);
        rdata[1].next = NULL;
 
-       return lpage;
+       *newlpage = lpage;
+       *newrpage = rpage;
 }
 
 /*
@@ -595,6 +1120,318 @@ ginDataFillRoot(GinBtree btree, Page root, BlockNumber lblkno, Page lpage, Block
        GinDataPageAddPostingItem(root, &ri, InvalidOffsetNumber);
 }
 
+
+/*** Functions to work with disassembled leaf pages ***/
+
+/*
+ * Disassemble page into a disassembledLeaf struct.
+ */
+static disassembledLeaf *
+disassembleLeaf(Page page)
+{
+       disassembledLeaf *leaf;
+       GinPostingList *seg;
+       Pointer         segbegin;
+       Pointer         segend;
+
+       leaf = palloc0(sizeof(disassembledLeaf));
+       dlist_init(&leaf->segments);
+
+       if (GinPageIsCompressed(page))
+       {
+               /*
+                * Create a leafSegment entry for each segment.
+                */
+               seg = GinDataLeafPageGetPostingList(page);
+               segbegin = (Pointer) seg;
+               segend = segbegin + GinDataLeafPageGetPostingListSize(page);
+               while ((Pointer) seg < segend)
+               {
+                       leafSegmentInfo *seginfo = palloc(sizeof(leafSegmentInfo));
+
+                       seginfo->seg = seg;
+                       seginfo->items = NULL;
+                       seginfo->nitems = 0;
+                       seginfo->modified = false;
+                       dlist_push_tail(&leaf->segments, &seginfo->node);
+
+                       seg = GinNextPostingListSegment(seg);
+               }
+       }
+       else
+       {
+               /*
+                * A pre-9.4 format uncompressed page is represented by a single
+                * segment, with an array of items.
+                */
+               ItemPointer uncompressed;
+               int                     nuncompressed;
+               leafSegmentInfo *seginfo;
+
+               uncompressed = dataLeafPageGetUncompressed(page, &nuncompressed);
+
+               seginfo = palloc(sizeof(leafSegmentInfo));
+
+               seginfo->seg = NULL;
+               seginfo->items = palloc(nuncompressed * sizeof(ItemPointerData));
+               memcpy(seginfo->items, uncompressed, nuncompressed * sizeof(ItemPointerData));
+               seginfo->nitems = nuncompressed;
+               /* make sure we rewrite this to disk */
+               seginfo->modified = true;
+
+               dlist_push_tail(&leaf->segments, &seginfo->node);
+       }
+
+       return leaf;
+}
+
+/*
+ * Distribute newItems to the segments.
+ *
+ * Any segments that acquire new items are decoded, and the new items are
+ * merged with the old items.
+ *
+ * Returns true if any new items were added. False means they were all
+ * duplicates of existing items on the page.
+ */
+static bool
+addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems)
+{
+       dlist_iter      iter;
+       ItemPointer nextnew = newItems;
+       int                     newleft = nNewItems;
+       bool            modified = false;
+
+       /*
+        * If the page is completely empty, just construct one new segment to
+        * hold all the new items.
+        */
+       if (dlist_is_empty(&leaf->segments))
+       {
+               /* create a new segment for the new entries */
+               leafSegmentInfo *seginfo = palloc(sizeof(leafSegmentInfo));
+
+               seginfo->seg = NULL;
+               seginfo->items = newItems;
+               seginfo->nitems = nNewItems;
+               seginfo->modified = true;
+               dlist_push_tail(&leaf->segments, &seginfo->node);
+               return true;
+       }
+
+       dlist_foreach(iter, &leaf->segments)
+       {
+               leafSegmentInfo *cur = (leafSegmentInfo *)  dlist_container(leafSegmentInfo, node, iter.cur);
+               int                     nthis;
+               ItemPointer     tmpitems;
+               int                     ntmpitems;
+
+               /*
+                * How many of the new items fall into this segment?
+                */
+               if (!dlist_has_next(&leaf->segments, iter.cur))
+                       nthis = newleft;
+               else
+               {
+                       leafSegmentInfo *next;
+                       ItemPointerData next_first;
+
+                       next = (leafSegmentInfo *) dlist_container(leafSegmentInfo, node,
+                                                                  dlist_next_node(&leaf->segments, iter.cur));
+                       if (next->items)
+                               next_first = next->items[0];
+                       else
+                       {
+                               Assert(next->seg != NULL);
+                               next_first = next->seg->first;
+                       }
+
+                       nthis = 0;
+                       while (nthis < newleft && ginCompareItemPointers(&nextnew[nthis], &next_first) < 0)
+                               nthis++;
+               }
+               if (nthis == 0)
+                       continue;
+
+               /* Merge the new items with the existing items. */
+               if (!cur->items)
+                       cur->items = ginPostingListDecode(cur->seg, &cur->nitems);
+
+               tmpitems = palloc((cur->nitems + nthis) * sizeof(ItemPointerData));
+               ntmpitems = ginMergeItemPointers(tmpitems,
+                                                                                cur->items, cur->nitems,
+                                                                                nextnew, nthis);
+               if (ntmpitems != cur->nitems)
+               {
+                       cur->items = tmpitems;
+                       cur->nitems = ntmpitems;
+                       cur->seg = NULL;
+                       modified = cur->modified = true;
+               }
+
+               nextnew += nthis;
+               newleft -= nthis;
+               if (newleft == 0)
+                       break;
+       }
+
+       return modified;
+}
+
+/*
+ * Recompresses all segments that have been modified.
+ *
+ * If not all the items fit on two pages (ie. after split), we store as
+ * many items as fit, and set *remaining to the first item that didn't fit.
+ * If all items fit, *remaining is set to invalid.
+ *
+ * Returns true if the page has to be split.
+ *
+ * XXX: Actually, this re-encodes all segments after the first one that was
+ * modified, to make sure the new segments are all more or less of equal
+ * length. That's unnecessarily aggressive; if we've only added a single item
+ * to one segment, for example, we could re-encode just that single segment,
+ * as long as it's still smaller than, say, 2x the normal segment size.
+ */
+static bool
+leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining)
+{
+       dlist_iter      iter;
+       ItemPointer allitems;
+       int                     nallitems;
+       int                     pgused = 0;
+       bool            needsplit;
+       int                     totalpacked;
+       dlist_mutable_iter miter;
+       dlist_head      recode_list;
+       int                     nrecode;
+       bool            recoding;
+
+       ItemPointerSetInvalid(remaining);
+
+       /*
+        * Find the first segment that needs to be re-coded. Move all segments
+        * that need recoding to separate list, and count the total number of
+        * items in them. Also, add up the number of bytes in unmodified segments
+        * (pgused).
+        */
+       dlist_init(&recode_list);
+       recoding = false;
+       nrecode = 0;
+       pgused = 0;
+       dlist_foreach_modify(miter, &leaf->segments)
+       {
+               leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, miter.cur);
+
+               /* If the segment was modified, re-encode it */
+               if (seginfo->modified || seginfo->seg == NULL)
+                       recoding = true;
+               /*
+                * Also re-encode abnormally small or large segments. (Vacuum can
+                * leave behind small segments, and conversion from pre-9.4 format
+                * can leave behind large segments).
+                */
+               else if (SizeOfGinPostingList(seginfo->seg) < GinPostingListSegmentMinSize)
+                       recoding = true;
+               else if (SizeOfGinPostingList(seginfo->seg) > GinPostingListSegmentMaxSize)
+                       recoding = true;
+
+               if (recoding)
+               {
+                       if (!seginfo->items)
+                               seginfo->items = ginPostingListDecode(seginfo->seg,
+                                                                                                         &seginfo->nitems);
+                       nrecode += seginfo->nitems;
+
+                       dlist_delete(miter.cur);
+                       dlist_push_tail(&recode_list, &seginfo->node);
+               }
+               else
+                       pgused += SizeOfGinPostingList(seginfo->seg);
+       }
+
+       if (nrecode == 0)
+               return false; /* nothing to do */
+
+       /*
+        * Construct one big array of the items that need to be re-encoded.
+        */
+       allitems = palloc(nrecode * sizeof(ItemPointerData));
+       nallitems = 0;
+       dlist_foreach(iter, &recode_list)
+       {
+               leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur);
+               memcpy(&allitems[nallitems], seginfo->items, seginfo->nitems * sizeof(ItemPointerData));
+               nallitems += seginfo->nitems;
+       }
+       Assert(nallitems == nrecode);
+
+       /*
+        * Start packing the items into segments. Stop when we have consumed
+        * enough space to fill both pages, or we run out of items.
+        */
+       totalpacked = 0;
+       needsplit = false;
+       while (totalpacked < nallitems)
+       {
+               leafSegmentInfo *seginfo;
+               int                     npacked;
+               GinPostingList *seg;
+               int                     segsize;
+
+               seg = ginCompressPostingList(&allitems[totalpacked],
+                                                                        nallitems - totalpacked,
+                                                                        GinPostingListSegmentMaxSize,
+                                                                        &npacked);
+               segsize = SizeOfGinPostingList(seg);
+               if (pgused + segsize > GinDataLeafMaxContentSize)
+               {
+                       if (!needsplit)
+                       {
+                               /* switch to right page */
+                               Assert(pgused > 0);
+                               leaf->lastleft = dlist_tail_node(&leaf->segments);
+                               needsplit = true;
+                               leaf->lsize = pgused;
+                               pgused = 0;
+                       }
+                       else
+                       {
+                               /* filled both pages */
+                               *remaining = allitems[totalpacked];
+                               break;
+                       }
+               }
+
+               seginfo = palloc(sizeof(leafSegmentInfo));
+               seginfo->seg = seg;
+               seginfo->items = &allitems[totalpacked];
+               seginfo->nitems = npacked;
+               seginfo->modified = true;
+
+               dlist_push_tail(&leaf->segments, &seginfo->node);
+
+               pgused += segsize;
+               totalpacked += npacked;
+       }
+
+       if (!needsplit)
+       {
+               leaf->lsize = pgused;
+               leaf->rsize = 0;
+       }
+       else
+               leaf->rsize = pgused;
+
+       Assert(leaf->lsize <= GinDataLeafMaxContentSize);
+       Assert(leaf->rsize <= GinDataLeafMaxContentSize);
+
+       return needsplit;
+}
+
+
+/*** Functions that are exported to the rest of the GIN code ***/
+
 /*
  * Creates new posting tree containing the given TIDs. Returns the page
  * number of the root of the new posting tree.
@@ -608,10 +1445,9 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems,
        BlockNumber blkno;
        Buffer          buffer;
        Page            page;
+       Pointer         ptr;
        int                     nrootitems;
-
-       /* Calculate how many TIDs will fit on first page. */
-       nrootitems = Min(nitems, GinMaxLeafDataItems);
+       int                     rootsize;
 
        /*
         * Create the root page.
@@ -622,12 +1458,41 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems,
 
        START_CRIT_SECTION();
 
-       GinInitBuffer(buffer, GIN_DATA | GIN_LEAF);
-       memcpy(GinDataPageGetData(page), items, sizeof(ItemPointerData) * nrootitems);
-       GinPageGetOpaque(page)->maxoff = nrootitems;
+       GinInitPage(page, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ);
+       GinPageGetOpaque(page)->rightlink = InvalidBlockNumber;
 
+       /*
+        * Write as many of the items to the root page as fit. In segments
+        * of max GinPostingListSegmentMaxSize bytes each.
+        */
+       nrootitems = 0;
+       rootsize = 0;
+       ptr = (Pointer) GinDataLeafPageGetPostingList(page);
+       while (nrootitems < nitems)
+       {
+               GinPostingList *segment;
+               int                     npacked;
+               int                     segsize;
+
+               segment = ginCompressPostingList(&items[nrootitems],
+                                                                                nitems - nrootitems,
+                                                                                GinPostingListSegmentMaxSize,
+                                                                                &npacked);
+               segsize = SizeOfGinPostingList(segment);
+               if (rootsize + segsize > GinDataLeafMaxContentSize)
+                       break;
+
+               memcpy(ptr, segment, segsize);
+               ptr += segsize;
+               rootsize += segsize;
+               nrootitems += npacked;
+               pfree(segment);
+       }
+       GinDataLeafPageSetPostingListSize(page, rootsize);
        MarkBufferDirty(buffer);
 
+       elog(DEBUG2, "created GIN posting tree with %d items", nrootitems);
+
        if (RelationNeedsWAL(index))
        {
                XLogRecPtr      recptr;
@@ -636,7 +1501,7 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems,
 
                data.node = index->rd_node;
                data.blkno = blkno;
-               data.nitem = nrootitems;
+               data.size = rootsize;
 
                rdata[0].buffer = InvalidBuffer;
                rdata[0].data = (char *) &data;
@@ -644,8 +1509,8 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems,
                rdata[0].next = &rdata[1];
 
                rdata[1].buffer = InvalidBuffer;
-               rdata[1].data = (char *) items;
-               rdata[1].len = sizeof(ItemPointerData) * nrootitems;
+               rdata[1].data = (char *) GinDataLeafPageGetPostingList(page);
+               rdata[1].len = rootsize;
                rdata[1].next = NULL;
 
                recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_PTREE, rdata);
@@ -685,10 +1550,9 @@ ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno)
        btree->findChildPage = dataLocateItem;
        btree->getLeftMostChild = dataGetLeftMostPage;
        btree->isMoveRight = dataIsMoveRight;
-       btree->findItem = dataLocateLeafItem;
+       btree->findItem = NULL;
        btree->findChildPtr = dataFindChildPtr;
        btree->placeToPage = dataPlaceToPage;
-       btree->splitPage = dataSplitPage;
        btree->fillRoot = ginDataFillRoot;
        btree->prepareDownlink = dataPrepareDownlink;
 
@@ -721,17 +1585,7 @@ ginInsertItemPointers(Relation index, BlockNumber rootBlkno,
                btree.itemptr = insertdata.items[insertdata.curitem];
                stack = ginFindLeafPage(&btree, false);
 
-               if (btree.findItem(&btree, stack))
-               {
-                       /*
-                        * Current item already exists in index.
-                        */
-                       insertdata.curitem++;
-                       LockBuffer(stack->buffer, GIN_UNLOCK);
-                       freeGinBtreeStack(stack);
-               }
-               else
-                       ginInsertValue(&btree, stack, &insertdata, buildStats);
+               ginInsertValue(&btree, stack, &insertdata, buildStats);
        }
 }
 
index e60c708b01ed3a1839fc3867faf5ad050aef9b2d..6f4e727b3e96c8d6d7463fc3558549a5335f1b7a 100644 (file)
@@ -1,7 +1,7 @@
 /*-------------------------------------------------------------------------
  *
  * ginentrypage.c
- *       page utilities routines for the postgres inverted index access method.
+ *       routines for handling GIN entry tree pages.
  *
  *
  * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
 #include "postgres.h"
 
 #include "access/gin_private.h"
+#include "miscadmin.h"
 #include "utils/rel.h"
 
+static void entrySplitPage(GinBtree btree, Buffer origbuf,
+                          GinBtreeStack *stack,
+                          void *insertPayload,
+                          BlockNumber updateblkno, XLogRecData **prdata,
+                          Page *newlpage, Page *newrpage);
+
 /*
  * Form a tuple for entry tree.
  *
  * format that is being built here.  We build on the assumption that we
  * are making a leaf-level key entry containing a posting list of nipd items.
  * If the caller is actually trying to make a posting-tree entry, non-leaf
- * entry, or pending-list entry, it should pass nipd = 0 and then overwrite
- * the t_tid fields as necessary.  In any case, ipd can be NULL to skip
- * copying any itempointers into the posting list; the caller is responsible
- * for filling the posting list afterwards, if ipd = NULL and nipd > 0.
+ * entry, or pending-list entry, it should pass dataSize = 0 and then overwrite
+ * the t_tid fields as necessary.  In any case, 'data' can be NULL to skip
+ * filling in the posting list; the caller is responsible for filling it
+ * afterwards if data = NULL and nipd > 0.
  */
 IndexTuple
 GinFormTuple(GinState *ginstate,
                         OffsetNumber attnum, Datum key, GinNullCategory category,
-                        ItemPointerData *ipd, uint32 nipd,
+                        Pointer data, Size dataSize, int nipd,
                         bool errorTooBig)
 {
        Datum           datums[2];
@@ -80,27 +87,25 @@ GinFormTuple(GinState *ginstate,
                newsize = Max(newsize, minsize);
        }
 
-       newsize = SHORTALIGN(newsize);
-
        GinSetPostingOffset(itup, newsize);
-
        GinSetNPosting(itup, nipd);
 
        /*
         * Add space needed for posting list, if any.  Then check that the tuple
         * won't be too big to store.
         */
-       newsize += sizeof(ItemPointerData) * nipd;
+       newsize += dataSize;
+
        newsize = MAXALIGN(newsize);
-       if (newsize > Min(INDEX_SIZE_MASK, GinMaxItemSize))
+
+       if (newsize > GinMaxItemSize)
        {
                if (errorTooBig)
                        ereport(ERROR,
                                        (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                        errmsg("index row size %lu exceeds maximum %lu for index \"%s\"",
                                   (unsigned long) newsize,
-                                  (unsigned long) Min(INDEX_SIZE_MASK,
-                                                                          GinMaxItemSize),
+                                  (unsigned long) GinMaxItemSize,
                                   RelationGetRelationName(ginstate->index))));
                pfree(itup);
                return NULL;
@@ -119,12 +124,20 @@ GinFormTuple(GinState *ginstate,
                 */
                memset((char *) itup + IndexTupleSize(itup),
                           0, newsize - IndexTupleSize(itup));
-
                /* set new size in tuple header */
                itup->t_info &= ~INDEX_SIZE_MASK;
                itup->t_info |= newsize;
        }
 
+       /*
+        * Copy in the posting list, if provided
+        */
+       if (data)
+       {
+               char *ptr = GinGetPosting(itup);
+               memcpy(ptr, data, dataSize);
+       }
+
        /*
         * Insert category byte, if needed
         */
@@ -133,37 +146,45 @@ GinFormTuple(GinState *ginstate,
                Assert(IndexTupleHasNulls(itup));
                GinSetNullCategory(itup, ginstate, category);
        }
-
-       /*
-        * Copy in the posting list, if provided
-        */
-       if (ipd)
-               memcpy(GinGetPosting(itup), ipd, sizeof(ItemPointerData) * nipd);
-
        return itup;
 }
 
 /*
- * Sometimes we reduce the number of posting list items in a tuple after
- * having built it with GinFormTuple.  This function adjusts the size
- * fields to match.
+ * Read item pointers from leaf entry tuple.
+ *
+ * Returns a palloc'd array of ItemPointers. The number of items is returned
+ * in *nitems.
  */
-void
-GinShortenTuple(IndexTuple itup, uint32 nipd)
+ItemPointer
+ginReadTuple(GinState *ginstate, OffsetNumber attnum, IndexTuple itup,
+                        int *nitems)
 {
-       uint32          newsize;
-
-       Assert(nipd <= GinGetNPosting(itup));
+       Pointer         ptr = GinGetPosting(itup);
+       int                     nipd = GinGetNPosting(itup);
+       ItemPointer     ipd;
+       int                     ndecoded;
 
-       newsize = GinGetPostingOffset(itup) + sizeof(ItemPointerData) * nipd;
-       newsize = MAXALIGN(newsize);
-
-       Assert(newsize <= (itup->t_info & INDEX_SIZE_MASK));
-
-       itup->t_info &= ~INDEX_SIZE_MASK;
-       itup->t_info |= newsize;
-
-       GinSetNPosting(itup, nipd);
+       if (GinItupIsCompressed(itup))
+       {
+               if (nipd > 0)
+               {
+                       ipd = ginPostingListDecode((GinPostingList *) ptr, &ndecoded);
+                       if (nipd != ndecoded)
+                               elog(ERROR, "number of items mismatch in GIN entry tuple, %d in tuple header, %d decoded",
+                                        nipd, ndecoded);
+               }
+               else
+               {
+                       ipd = palloc(0);
+               }
+       }
+       else
+       {
+               ipd = (ItemPointer) palloc(sizeof(ItemPointerData) * nipd);
+               memcpy(ipd, ptr, sizeof(ItemPointerData) * nipd);
+       }
+       *nitems = nipd;
+       return ipd;
 }
 
 /*
@@ -492,13 +513,14 @@ entryPreparePage(GinBtree btree, Page page, OffsetNumber off,
  * the downlink of the existing item at 'off' is updated to point to
  * 'updateblkno'.
  */
-static bool
-entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off,
+static GinPlaceToPageRC
+entryPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack,
                                 void *insertPayload, BlockNumber updateblkno,
-                                XLogRecData **prdata)
+                                XLogRecData **prdata, Page *newlpage, Page *newrpage)
 {
        GinBtreeEntryInsertData *insertData = insertPayload;
        Page            page = BufferGetPage(buf);
+       OffsetNumber off = stack->off;
        OffsetNumber placed;
        int                     cnt = 0;
 
@@ -508,7 +530,13 @@ entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off,
 
        /* quick exit if it doesn't fit */
        if (!entryIsEnoughSpace(btree, buf, off, insertData))
-               return false;
+       {
+               entrySplitPage(btree, buf, stack, insertPayload, updateblkno,
+                                          prdata, newlpage, newrpage);
+               return SPLIT;
+       }
+
+       START_CRIT_SECTION();
 
        *prdata = rdata;
        entryPreparePage(btree, page, off, insertData, updateblkno);
@@ -522,6 +550,7 @@ entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off,
                         RelationGetRelationName(btree->index));
 
        data.isDelete = insertData->isDelete;
+       data.offset = off;
 
        rdata[cnt].buffer = buf;
        rdata[cnt].buffer_std = false;
@@ -536,21 +565,24 @@ entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off,
        rdata[cnt].len = IndexTupleSize(insertData->entry);
        rdata[cnt].next = NULL;
 
-       return true;
+       return INSERTED;
 }
 
 /*
  * Place tuple and split page, original buffer(lbuf) leaves untouched,
- * returns shadow page of lbuf filled new data.
+ * returns shadow pages filled with new data.
  * Tuples are distributed between pages by equal size on its, not
  * an equal number!
  */
-static Page
-entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off,
+static void
+entrySplitPage(GinBtree btree, Buffer origbuf,
+                          GinBtreeStack *stack,
                           void *insertPayload,
-                          BlockNumber updateblkno, XLogRecData **prdata)
+                          BlockNumber updateblkno, XLogRecData **prdata,
+                          Page *newlpage, Page *newrpage)
 {
        GinBtreeEntryInsertData *insertData = insertPayload;
+       OffsetNumber off = stack->off;
        OffsetNumber i,
                                maxoff,
                                separator = InvalidOffsetNumber;
@@ -561,8 +593,8 @@ entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off,
        char       *ptr;
        IndexTuple      itup;
        Page            page;
-       Page            lpage = PageGetTempPageCopy(BufferGetPage(lbuf));
-       Page            rpage = BufferGetPage(rbuf);
+       Page            lpage = PageGetTempPageCopy(BufferGetPage(origbuf));
+       Page            rpage = PageGetTempPageCopy(BufferGetPage(origbuf));
        Size            pageSize = PageGetPageSize(lpage);
 
        /* these must be static so they can be returned to caller */
@@ -651,7 +683,8 @@ entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off,
        rdata[1].len = tupstoresize;
        rdata[1].next = NULL;
 
-       return lpage;
+       *newlpage = lpage;
+       *newrpage = rpage;
 }
 
 /*
@@ -719,7 +752,6 @@ ginPrepareEntryScan(GinBtree btree, OffsetNumber attnum,
        btree->findItem = entryLocateLeafEntry;
        btree->findChildPtr = entryFindChildPtr;
        btree->placeToPage = entryPlaceToPage;
-       btree->splitPage = entrySplitPage;
        btree->fillRoot = ginEntryFillRoot;
        btree->prepareDownlink = entryPrepareDownlink;
 
index 35e1d7114b76e143508cae4dd1f0c6a4f0acb859..4a65046547ef88bdb5563aef2925be6c8f3fa15a 100644 (file)
@@ -487,7 +487,7 @@ ginHeapTupleFastCollect(GinState *ginstate,
                IndexTuple      itup;
 
                itup = GinFormTuple(ginstate, attnum, entries[i], categories[i],
-                                                       NULL, 0, true);
+                                                       NULL, 0, 0, true);
                itup->t_tid = *ht_ctid;
                collector->tuples[collector->ntuples++] = itup;
                collector->sumsize += IndexTupleSize(itup);
index 3d99bef76a3a67e1277ffee18bdb144c96b97185..4bdbd4525d53205aa1367cf480764e832048d9e7 100644 (file)
@@ -71,24 +71,20 @@ callConsistentFn(GinState *ginstate, GinScanKey key)
  * Tries to refind previously taken ItemPointer on a posting page.
  */
 static bool
-findItemInPostingPage(Page page, ItemPointer item, OffsetNumber *off)
+needToStepRight(Page page, ItemPointer item)
 {
-       OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
-       int                     res;
-
        if (GinPageGetOpaque(page)->flags & GIN_DELETED)
                /* page was deleted by concurrent vacuum */
-               return false;
+               return true;
 
-       /*
-        * scan page to find equal or first greater value
-        */
-       for (*off = FirstOffsetNumber; *off <= maxoff; (*off)++)
+       if (ginCompareItemPointers(item, GinDataPageGetRightBound(page)) > 0
+                       && !GinPageRightMost(page))
        {
-               res = ginCompareItemPointers(item, GinDataPageGetItemPointer(page, *off));
-
-               if (res <= 0)
-                       return true;
+               /*
+                * the item we're looking is > the right bound of the page, so it
+                * can't be on this page.
+                */
+               return true;
        }
 
        return false;
@@ -143,14 +139,10 @@ scanPostingTree(Relation index, GinScanEntry scanEntry,
        for (;;)
        {
                page = BufferGetPage(buffer);
-
-               if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0 &&
-                       GinPageGetOpaque(page)->maxoff >= FirstOffsetNumber)
+               if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0)
                {
-                       tbm_add_tuples(scanEntry->matchBitmap,
-                                                  GinDataPageGetItemPointer(page, FirstOffsetNumber),
-                                                  GinPageGetOpaque(page)->maxoff, false);
-                       scanEntry->predictNumberResult += GinPageGetOpaque(page)->maxoff;
+                       int n = GinDataLeafPageGetItemsToTbm(page, scanEntry->matchBitmap);
+                       scanEntry->predictNumberResult += n;
                }
 
                if (GinPageRightMost(page))
@@ -335,8 +327,11 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
                }
                else
                {
-                       tbm_add_tuples(scanEntry->matchBitmap,
-                                                  GinGetPosting(itup), GinGetNPosting(itup), false);
+                       ItemPointer ipd;
+                       int                     nipd;
+
+                       ipd = ginReadTuple(btree->ginstate, scanEntry->attnum, itup, &nipd);
+                       tbm_add_tuples(scanEntry->matchBitmap, ipd, nipd, false);
                        scanEntry->predictNumberResult += GinGetNPosting(itup);
                }
 
@@ -450,16 +445,14 @@ restartScanEntry:
                        IncrBufferRefCount(entry->buffer);
 
                        page = BufferGetPage(entry->buffer);
-                       entry->predictNumberResult = stack->predictNumber * GinPageGetOpaque(page)->maxoff;
 
                        /*
-                        * Keep page content in memory to prevent durable page locking
+                        * Copy page content to memory to avoid keeping it locked for
+                        * a long time.
                         */
-                       entry->list = (ItemPointerData *) palloc(BLCKSZ);
-                       entry->nlist = GinPageGetOpaque(page)->maxoff;
-                       memcpy(entry->list,
-                                  GinDataPageGetItemPointer(page, FirstOffsetNumber),
-                                  GinPageGetOpaque(page)->maxoff * sizeof(ItemPointerData));
+                       entry->list = GinDataLeafPageGetItems(page, &entry->nlist);
+
+                       entry->predictNumberResult = stack->predictNumber * entry->nlist;
 
                        LockBuffer(entry->buffer, GIN_UNLOCK);
                        freeGinBtreeStack(stack);
@@ -467,9 +460,10 @@ restartScanEntry:
                }
                else if (GinGetNPosting(itup) > 0)
                {
-                       entry->nlist = GinGetNPosting(itup);
-                       entry->list = (ItemPointerData *) palloc(sizeof(ItemPointerData) * entry->nlist);
-                       memcpy(entry->list, GinGetPosting(itup), sizeof(ItemPointerData) * entry->nlist);
+                       entry->list = ginReadTuple(ginstate, entry->attnum, itup,
+                               &entry->nlist);
+                       entry->predictNumberResult = entry->nlist;
+
                        entry->isFinished = FALSE;
                }
        }
@@ -532,6 +526,7 @@ static void
 entryGetNextItem(GinState *ginstate, GinScanEntry entry)
 {
        Page            page;
+       int                     i;
 
        for (;;)
        {
@@ -564,35 +559,47 @@ entryGetNextItem(GinState *ginstate, GinScanEntry entry)
                        page = BufferGetPage(entry->buffer);
 
                        entry->offset = InvalidOffsetNumber;
-                       if (!ItemPointerIsValid(&entry->curItem) ||
-                               findItemInPostingPage(page, &entry->curItem, &entry->offset))
+                       if (entry->list)
                        {
-                               /*
-                                * Found position equal to or greater than stored
-                                */
-                               entry->nlist = GinPageGetOpaque(page)->maxoff;
-                               memcpy(entry->list,
-                                          GinDataPageGetItemPointer(page, FirstOffsetNumber),
-                                          GinPageGetOpaque(page)->maxoff * sizeof(ItemPointerData));
+                               pfree(entry->list);
+                               entry->list = NULL;
+                       }
 
-                               LockBuffer(entry->buffer, GIN_UNLOCK);
+                       /*
+                        * If the page was concurrently split, we have to re-find the
+                        * item we were stopped on. If the page was split more than once,
+                        * the item might not be on this page, but somewhere to the right.
+                        * Keep following the right-links until we re-find the correct
+                        * page.
+                        */
+                       if (ItemPointerIsValid(&entry->curItem) &&
+                               needToStepRight(page, &entry->curItem))
+                       {
+                               continue;
+                       }
+
+                       entry->list = GinDataLeafPageGetItems(page, &entry->nlist);
 
-                               if (!ItemPointerIsValid(&entry->curItem) ||
-                                       ginCompareItemPointers(&entry->curItem,
-                                                                          entry->list + entry->offset - 1) == 0)
+                       /* re-find the item we were stopped on. */
+                       if (ItemPointerIsValid(&entry->curItem))
+                       {
+                               for (i = 0; i < entry->nlist; i++)
                                {
-                                       /*
-                                        * First pages are deleted or empty, or we found exact
-                                        * position, so break inner loop and continue outer one.
-                                        */
-                                       break;
+                                       if (ginCompareItemPointers(&entry->curItem,
+                                                                                          &entry->list[i]) < 0)
+                                       {
+                                               LockBuffer(entry->buffer, GIN_UNLOCK);
+                                               entry->offset = i + 1;
+                                               entry->curItem = entry->list[entry->offset - 1];
+                                               return;
+                                       }
                                }
-
-                               /*
-                                * Find greater than entry->curItem position, store it.
-                                */
+                       }
+                       else
+                       {
+                               LockBuffer(entry->buffer, GIN_UNLOCK);
+                               entry->offset = 1; /* scan all items on the page. */
                                entry->curItem = entry->list[entry->offset - 1];
-
                                return;
                        }
                }
index 5e6f627a76be2b9e5a7adab9649084866d48346a..cd9c7c120b334528e7ecbad76b384b893c85f66c 100644 (file)
@@ -53,31 +53,42 @@ addItemPointersToLeafTuple(GinState *ginstate,
        Datum           key;
        GinNullCategory category;
        IndexTuple      res;
+       ItemPointerData *newItems,
+                          *oldItems;
+       int                     oldNPosting,
+                               newNPosting;
+       GinPostingList *compressedList;
 
        Assert(!GinIsPostingTree(old));
 
        attnum = gintuple_get_attrnum(ginstate, old);
        key = gintuple_get_key(ginstate, old, &category);
 
-       /* try to build tuple with room for all the items */
-       res = GinFormTuple(ginstate, attnum, key, category,
-                                          NULL, nitem + GinGetNPosting(old),
-                                          false);
+       /* merge the old and new posting lists */
+       oldItems = ginReadTuple(ginstate, attnum, old, &oldNPosting);
 
-       if (res)
+       newNPosting = oldNPosting + nitem;
+       newItems = (ItemPointerData *) palloc(sizeof(ItemPointerData) * newNPosting);
+
+       newNPosting = ginMergeItemPointers(newItems,
+                                                                          items, nitem,
+                                                                          oldItems, oldNPosting);
+
+       /* Compress the posting list, and try to a build tuple with room for it */
+       res = NULL;
+       compressedList = ginCompressPostingList(newItems, newNPosting, GinMaxItemSize,
+                                                                                       NULL);
+       pfree(newItems);
+       if (compressedList)
        {
-               /* good, small enough */
-               uint32          newnitem;
-
-               /* fill in the posting list with union of old and new TIDs */
-               newnitem = ginMergeItemPointers(GinGetPosting(res),
-                                                                               GinGetPosting(old),
-                                                                               GinGetNPosting(old),
-                                                                               items, nitem);
-               /* merge might have eliminated some duplicate items */
-               GinShortenTuple(res, newnitem);
+               res = GinFormTuple(ginstate, attnum, key, category,
+                                                  (char *) compressedList,
+                                                  SizeOfGinPostingList(compressedList),
+                                                  newNPosting,
+                                                  false);
+               pfree(compressedList);
        }
-       else
+       if (!res)
        {
                /* posting list would be too big, convert to posting tree */
                BlockNumber postingRoot;
@@ -88,8 +99,8 @@ addItemPointersToLeafTuple(GinState *ginstate,
                 * already be in order with no duplicates.
                 */
                postingRoot = createPostingTree(ginstate->index,
-                                                                               GinGetPosting(old),
-                                                                               GinGetNPosting(old),
+                                                                               oldItems,
+                                                                               oldNPosting,
                                                                                buildStats);
 
                /* Now insert the TIDs-to-be-added into the posting tree */
@@ -98,9 +109,10 @@ addItemPointersToLeafTuple(GinState *ginstate,
                                                          buildStats);
 
                /* And build a new posting-tree-only result tuple */
-               res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, true);
+               res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, 0, true);
                GinSetPostingTree(res, postingRoot);
        }
+       pfree(oldItems);
 
        return res;
 }
@@ -119,12 +131,19 @@ buildFreshLeafTuple(GinState *ginstate,
                                        ItemPointerData *items, uint32 nitem,
                                        GinStatsData *buildStats)
 {
-       IndexTuple      res;
+       IndexTuple      res = NULL;
+       GinPostingList *compressedList;
 
        /* try to build a posting list tuple with all the items */
-       res = GinFormTuple(ginstate, attnum, key, category,
-                                          items, nitem, false);
-
+       compressedList = ginCompressPostingList(items, nitem, GinMaxItemSize, NULL);
+       if (compressedList)
+       {
+               res = GinFormTuple(ginstate, attnum, key, category,
+                                                  (char *) compressedList,
+                                                  SizeOfGinPostingList(compressedList),
+                                                  nitem, false);
+               pfree(compressedList);
+       }
        if (!res)
        {
                /* posting list would be too big, build posting tree */
@@ -134,7 +153,7 @@ buildFreshLeafTuple(GinState *ginstate,
                 * Build posting-tree-only result tuple.  We do this first so as to
                 * fail quickly if the key is too big.
                 */
-               res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, true);
+               res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, 0, true);
 
                /*
                 * Initialize a new posting tree with the TIDs.
index 840fa7724c9ec2e62c3967f1720fc6d540867fc7..f532a3dbf032269c5d7dc512686b5a5228c79d62 100644 (file)
 
 #include "access/gin_private.h"
 
+#ifdef USE_ASSERT_CHECKING
+#define CHECK_ENCODING_ROUNDTRIP
+#endif
+
+/*
+ * For encoding purposes, item pointers are represented as 64-bit unsigned
+ * integers. The lowest 11 bits represent the offset number, and the next
+ * lowest 32 bits are the block number. That leaves 17 bits unused, ie.
+ * only 43 low bits are used.
+ *
+ * These 43-bit integers are encoded using varbyte encoding. In each byte,
+ * the 7 low bits contain data, while the highest bit is a continuation bit.
+ * When the continuation bit is set, the next byte is part of the same
+ * integer, otherwise this is the last byte of this integer.  43 bits fit
+ * conveniently in at most 6 bytes when varbyte encoded (the 6th byte does
+ * not need a continuation bit, because we know the max size to be 43 bits):
+ *
+ * 0XXXXXXX
+ * 1XXXXXXX 0XXXXYYY
+ * 1XXXXXXX 1XXXXYYY 0YYYYYYY
+ * 1XXXXXXX 1XXXXYYY 1YYYYYYY 0YYYYYYY
+ * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY
+ * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY YYYYYYYY
+ *
+ * X = bits used for offset number
+ * Y = bits used for block number
+ *
+ * The bytes are in stored in little-endian order.
+ *
+ * An important property of this encoding is that removing an item from list
+ * never increases the size of the resulting compressed posting list. Proof:
+ *
+ * Removing number is actually replacement of two numbers with their sum. We
+ * have to prove that varbyte encoding of a sum can't be longer than varbyte
+ * encoding of its summands. Sum of two numbers is at most one bit wider than
+ * than the larger of the summands. Widening a number by one bit enlarges its
+ * length in varbyte encoding by at most one byte. Therefore, varbyte encoding
+ * of sum is at most one byte longer than varbyte encoding of larger summand.
+ * Lesser summand is at least one byte, so the sum cannot take more space than
+ * the summands, Q.E.D.
+ *
+ * This property greatly simplifies VACUUM, which can assume that posting
+ * lists always fit on the same page after vacuuming. Note that even though
+ * that holds for removing items from a posting list, you must also be
+ * careful to not cause expansion e.g when merging uncompressed items on the
+ * page into the compressed lists, when vacuuming.
+ */
+
+/*
+ * How many bits do you need to encode offset number? OffsetNumber is a 16-bit
+ * integer, but you can't fit that many items on a page. 11 ought to be more
+ * than enough. It's tempting to derive this from MaxHeapTuplesPerPage, and
+ * use the minimum number of bits, but that would require changing the on-disk
+ * format if MaxHeapTuplesPerPage changes. Better to leave some slack.
+ */
+#define MaxHeapTuplesPerPageBits               11
+
+static inline uint64
+itemptr_to_uint64(const ItemPointer iptr)
+{
+       uint64          val;
+
+       Assert(ItemPointerIsValid(iptr));
+       Assert(iptr->ip_posid < (1 << MaxHeapTuplesPerPageBits));
+
+       val = iptr->ip_blkid.bi_hi;
+       val <<= 16;
+       val |= iptr->ip_blkid.bi_lo;
+       val <<= MaxHeapTuplesPerPageBits;
+       val |= iptr->ip_posid;
+
+       return val;
+}
+
+static inline void
+uint64_to_itemptr(uint64 val, ItemPointer iptr)
+{
+       iptr->ip_posid = val & ((1 << MaxHeapTuplesPerPageBits) - 1);
+       val = val >> MaxHeapTuplesPerPageBits;
+       iptr->ip_blkid.bi_lo = val & 0xFFFF;
+       val = val >> 16;
+       iptr->ip_blkid.bi_hi = val & 0xFFFF;
+
+       Assert(ItemPointerIsValid(iptr));
+}
+
+/*
+ * Varbyte-encode 'val' into *ptr. *ptr is incremented to next integer.
+ */
+static void
+encode_varbyte(uint64 val, unsigned char **ptr)
+{
+       unsigned char *p = *ptr;
+
+       while (val > 0x7F)
+       {
+               *(p++) = 0x80 | (val & 0x7F);
+               val >>= 7;
+       }
+       *(p++) = (unsigned char) val;
+
+       *ptr = p;
+}
+
+/*
+ * Decode varbyte-encoded integer at *ptr. *ptr is incremented to next integer.
+ */
+static uint64
+decode_varbyte(unsigned char **ptr)
+{
+       uint64 val;
+       unsigned char *p = *ptr;
+       uint64 c;
+
+       c = *(p++);
+       val = c & 0x7F;
+       if (c & 0x80)
+       {
+               c = *(p++);
+               val |= (c & 0x7F) << 7;
+               if (c & 0x80)
+               {
+                       c = *(p++);
+                       val |= (c & 0x7F) << 14;
+                       if (c & 0x80)
+                       {
+                               c = *(p++);
+                               val |= (c & 0x7F) << 21;
+                               if (c & 0x80)
+                               {
+                                       c = *(p++);
+                                       val |= (c & 0x7F) << 28;
+                                       if (c & 0x80)
+                                       {
+                                               c = *(p++);
+                                               val |= (c & 0x7F) << 35;
+                                               if (c & 0x80)
+                                               {
+                                                       /* last byte, no continuation bit */
+                                                       c = *(p++);
+                                                       val |= c << 42;
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+
+       *ptr = p;
+
+       return val;
+}
+
+/*
+ * Encode a posting list.
+ *
+ * The encoded list is returned in a palloc'd struct, which will be at most
+ * 'maxsize' bytes in size.  The number items in the returned segment is
+ * returned in *nwritten. If it's not equal to nipd, not all the items fit
+ * in 'maxsize', and only the first *nwritten were encoded.
+ */
+GinPostingList *
+ginCompressPostingList(const ItemPointer ipd, int nipd, int maxsize,
+                                          int *nwritten)
+{
+       uint64          prev;
+       int                     totalpacked = 0;
+       int                     maxbytes;
+       GinPostingList *result;
+       unsigned char *ptr;
+       unsigned char *endptr;
+
+       result = palloc(maxsize);
+
+       maxbytes = maxsize - offsetof(GinPostingList, bytes);
+
+       /* Store the first special item */
+       result->first = ipd[0];
+
+       prev = itemptr_to_uint64(&result->first);
+
+       ptr = result->bytes;
+       endptr = result->bytes + maxbytes;
+       for (totalpacked = 1; totalpacked < nipd; totalpacked++)
+       {
+               uint64          val = itemptr_to_uint64(&ipd[totalpacked]);
+               uint64          delta = val - prev;
+
+               Assert (val > prev);
+
+               if (endptr - ptr >= 6)
+                       encode_varbyte(delta, &ptr);
+               else
+               {
+                       /*
+                        * There are less than 6 bytes left. Have to check if the next
+                        * item fits in that space before writing it out.
+                        */
+                       unsigned char buf[6];
+                       unsigned char *p = buf;
+
+                       encode_varbyte(delta, &p);
+                       if (p - buf > (endptr - ptr))
+                               break; /* output is full */
+
+                       memcpy(ptr, buf, p - buf);
+                       ptr += (p - buf);
+               }
+               prev = val;
+       }
+       result->nbytes = ptr - result->bytes;
+
+       if (nwritten)
+               *nwritten = totalpacked;
+
+       Assert(SizeOfGinPostingList(result) <= maxsize);
+
+       /*
+        * Check that the encoded segment decodes back to the original items.
+        */
+#if defined (CHECK_ENCODING_ROUNDTRIP)
+       if (assert_enabled)
+       {
+               int                     ndecoded;
+               ItemPointer tmp = ginPostingListDecode(result, &ndecoded);
+               int                     i;
+
+               Assert(ndecoded == totalpacked);
+               for (i = 0; i < ndecoded; i++)
+                       Assert(memcmp(&tmp[i], &ipd[i], sizeof(ItemPointerData)) == 0);
+               pfree(tmp);
+       }
+#endif
+
+       return result;
+}
+
+/*
+ * Decode a compressed posting list into an array of item pointers.
+ * The number of items is returned in *ndecoded.
+ */
+ItemPointer
+ginPostingListDecode(GinPostingList *plist, int *ndecoded)
+{
+       return ginPostingListDecodeAllSegments(plist,
+                                                                                  SizeOfGinPostingList(plist),
+                                                                                  ndecoded);
+}
+
+/*
+ * Decode multiple posting list segments into an array of item pointers.
+ * The number of items is returned in *ndecoded_out. The segments are stored
+ * one after each other, with total size 'len' bytes.
+ */
+ItemPointer
+ginPostingListDecodeAllSegments(GinPostingList *segment, int len, int *ndecoded_out)
+{
+       ItemPointer     result;
+       int                     nallocated;
+       uint64          val;
+       char       *endseg = ((char *) segment) + len;
+       int                     ndecoded;
+       unsigned char *ptr;
+       unsigned char *endptr;
+
+       /*
+        * Guess an initial size of the array.
+        */
+       nallocated = segment->nbytes * 2 + 1;
+       result = palloc(nallocated * sizeof(ItemPointerData));
+
+       ndecoded = 0;
+       while ((char *) segment < endseg)
+       {
+               /* enlarge output array if needed */
+               if (ndecoded >= nallocated)
+               {
+                       nallocated *= 2;
+                       result = repalloc(result, nallocated * sizeof(ItemPointerData));
+               }
+
+               /* copy the first item */
+               result[ndecoded] = segment->first;
+               ndecoded++;
+               Assert(OffsetNumberIsValid(ItemPointerGetOffsetNumber(&segment->first)));
+
+               val = itemptr_to_uint64(&segment->first);
+               ptr = segment->bytes;
+               endptr = segment->bytes + segment->nbytes;
+               while (ptr < endptr)
+               {
+                       /* enlarge output array if needed */
+                       if (ndecoded >= nallocated)
+                       {
+                               nallocated *= 2;
+                               result = repalloc(result, nallocated * sizeof(ItemPointerData));
+                       }
+
+                       val += decode_varbyte(&ptr);
+
+                       uint64_to_itemptr(val, &result[ndecoded]);
+                       ndecoded++;
+               }
+               segment = GinNextPostingListSegment(segment);
+       }
+
+       if (ndecoded_out)
+               *ndecoded_out = ndecoded;
+       return result;
+}
+
+/*
+ * Add all item pointers from a bunch of posting lists to a TIDBitmap.
+ */
+int
+ginPostingListDecodeAllSegmentsToTbm(GinPostingList *ptr, int len,
+                                                                        TIDBitmap *tbm)
+{
+       int                     ndecoded;
+       ItemPointer     items;
+
+       items = ginPostingListDecodeAllSegments(ptr, len, &ndecoded);
+       tbm_add_tuples(tbm, items, ndecoded, false);
+       pfree(items);
+
+       return ndecoded;
+}
+
 /*
  * Merge two ordered arrays of itempointers, eliminating any duplicates.
  * Returns the number of items in the result.
  * Caller is responsible that there is enough space at *dst.
+ *
+ * It's OK if 'dst' overlaps with the *beginning* of one of the arguments.
  */
-uint32
+int
 ginMergeItemPointers(ItemPointerData *dst,
                                         ItemPointerData *a, uint32 na,
                                         ItemPointerData *b, uint32 nb)
@@ -29,28 +359,50 @@ ginMergeItemPointers(ItemPointerData *dst,
        ItemPointerData *dptr = dst;
        ItemPointerData *aptr = a,
                           *bptr = b;
+       int                     result;
 
-       while (aptr - a < na && bptr - b < nb)
+       /*
+        * If the argument arrays don't overlap, we can just append them to
+        * each other.
+        */
+       if (na == 0 || nb == 0 || ginCompareItemPointers(&a[na - 1], &b[0]) < 0)
        {
-               int                     cmp = ginCompareItemPointers(aptr, bptr);
-
-               if (cmp > 0)
-                       *dptr++ = *bptr++;
-               else if (cmp == 0)
+               memmove(dst, a, na * sizeof(ItemPointerData));
+               memmove(&dst[na], b, nb * sizeof(ItemPointerData));
+               result = na + nb;
+       }
+       else if (ginCompareItemPointers(&b[nb - 1], &a[0]) < 0)
+       {
+               memmove(dst, b, nb * sizeof(ItemPointerData));
+               memmove(&dst[nb], a, na * sizeof(ItemPointerData));
+               result = na + nb;
+       }
+       else
+       {
+               while (aptr - a < na && bptr - b < nb)
                {
-                       /* we want only one copy of the identical items */
-                       *dptr++ = *bptr++;
-                       aptr++;
+                       int                     cmp = ginCompareItemPointers(aptr, bptr);
+
+                       if (cmp > 0)
+                               *dptr++ = *bptr++;
+                       else if (cmp == 0)
+                       {
+                               /* only keep one copy of the identical items */
+                               *dptr++ = *bptr++;
+                               aptr++;
+                       }
+                       else
+                               *dptr++ = *aptr++;
                }
-               else
+
+               while (aptr - a < na)
                        *dptr++ = *aptr++;
-       }
 
-       while (aptr - a < na)
-               *dptr++ = *aptr++;
+               while (bptr - b < nb)
+                       *dptr++ = *bptr++;
 
-       while (bptr - b < nb)
-               *dptr++ = *bptr++;
+               result = dptr - dst;
+       }
 
-       return dptr - dst;
+       return result;
 }
index 4212effbe46742ac04e5aee7101cb41b58d2e015..6bf65c32935472aa0934c49cb5c3cda6e68fd56b 100644 (file)
@@ -20,8 +20,9 @@
 #include "postmaster/autovacuum.h"
 #include "storage/indexfsm.h"
 #include "storage/lmgr.h"
+#include "utils/memutils.h"
 
-typedef struct
+typedef struct GinVacuumState
 {
        Relation        index;
        IndexBulkDeleteResult *result;
@@ -29,56 +30,58 @@ typedef struct
        void       *callback_state;
        GinState        ginstate;
        BufferAccessStrategy strategy;
+       MemoryContext tmpCxt;
 } GinVacuumState;
 
-
 /*
- * Vacuums a list of item pointers. The original size of the list is 'nitem',
- * returns the number of items remaining afterwards.
+ * Vacuums an uncompressed posting list. The size of the must can be specified
+ * in number of items (nitems).
  *
- * If *cleaned == NULL on entry, the original array is left unmodified; if
- * any items are removed, a palloc'd copy of the result is stored in *cleaned.
- * Otherwise *cleaned should point to the original array, in which case it's
- * modified directly.
+ * If none of the items need to be removed, returns NULL. Otherwise returns
+ * a new palloc'd array with the remaining items. The number of remaining
+ * items is returned in *nremaining.
  */
-static int
-ginVacuumPostingList(GinVacuumState *gvs, ItemPointerData *items, int nitem,
-                                        ItemPointerData **cleaned)
+ItemPointer
+ginVacuumItemPointers(GinVacuumState *gvs, ItemPointerData *items,
+                                         int nitem, int *nremaining)
 {
        int                     i,
-                               j = 0;
-
-       Assert(*cleaned == NULL || *cleaned == items);
+                               remaining = 0;
+       ItemPointer     tmpitems = NULL;
 
        /*
-        * just scan over ItemPointer array
+        * Iterate over TIDs array
         */
        for (i = 0; i < nitem; i++)
        {
                if (gvs->callback(items + i, gvs->callback_state))
                {
                        gvs->result->tuples_removed += 1;
-                       if (!*cleaned)
+                       if (!tmpitems)
                        {
-                               *cleaned = (ItemPointerData *) palloc(sizeof(ItemPointerData) * nitem);
-                               if (i != 0)
-                                       memcpy(*cleaned, items, sizeof(ItemPointerData) * i);
+                               /*
+                                * First TID to be deleted: allocate memory to hold the
+                                * remaining items.
+                                */
+                               tmpitems = palloc(sizeof(ItemPointerData) * nitem);
+                               memcpy(tmpitems, items, sizeof(ItemPointerData) * i);
                        }
                }
                else
                {
                        gvs->result->num_index_tuples += 1;
-                       if (i != j)
-                               (*cleaned)[j] = items[i];
-                       j++;
+                       if (tmpitems)
+                               tmpitems[remaining] = items[i];
+                       remaining++;
                }
        }
 
-       return j;
+       *nremaining = remaining;
+       return tmpitems;
 }
 
 /*
- * fills WAL record for vacuum leaf page
+ * Create a WAL record for vacuuming entry tree leaf page.
  */
 static void
 xlogVacuumPage(Relation index, Buffer buffer)
@@ -86,65 +89,64 @@ xlogVacuumPage(Relation index, Buffer buffer)
        Page            page = BufferGetPage(buffer);
        XLogRecPtr      recptr;
        XLogRecData rdata[3];
-       ginxlogVacuumPage data;
-       char       *backup;
-       char            itups[BLCKSZ];
-       uint32          len = 0;
+       ginxlogVacuumPage xlrec;
+       uint16          lower;
+       uint16          upper;
 
+       /* This is only used for entry tree leaf pages. */
+       Assert(!GinPageIsData(page));
        Assert(GinPageIsLeaf(page));
 
        if (!RelationNeedsWAL(index))
                return;
 
-       data.node = index->rd_node;
-       data.blkno = BufferGetBlockNumber(buffer);
+       xlrec.node = index->rd_node;
+       xlrec.blkno = BufferGetBlockNumber(buffer);
+
+       /* Assume we can omit data between pd_lower and pd_upper */
+       lower = ((PageHeader) page)->pd_lower;
+       upper = ((PageHeader) page)->pd_upper;
+
+       Assert(lower < BLCKSZ);
+       Assert(upper < BLCKSZ);
 
-       if (GinPageIsData(page))
+       if (lower >= SizeOfPageHeaderData &&
+               upper > lower &&
+               upper <= BLCKSZ)
        {
-               backup = GinDataPageGetData(page);
-               data.nitem = GinPageGetOpaque(page)->maxoff;
-               if (data.nitem)
-                       len = MAXALIGN(sizeof(ItemPointerData) * data.nitem);
+               xlrec.hole_offset = lower;
+               xlrec.hole_length = upper - lower;
        }
        else
        {
-               char       *ptr;
-               OffsetNumber i;
-
-               ptr = backup = itups;
-               for (i = FirstOffsetNumber; i <= PageGetMaxOffsetNumber(page); i++)
-               {
-                       IndexTuple      itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
-
-                       memcpy(ptr, itup, IndexTupleSize(itup));
-                       ptr += MAXALIGN(IndexTupleSize(itup));
-               }
-
-               data.nitem = PageGetMaxOffsetNumber(page);
-               len = ptr - backup;
+               /* No "hole" to compress out */
+               xlrec.hole_offset = 0;
+               xlrec.hole_length = 0;
        }
 
-       rdata[0].buffer = buffer;
-       rdata[0].buffer_std = (GinPageIsData(page)) ? FALSE : TRUE;
-       rdata[0].len = 0;
-       rdata[0].data = NULL;
-       rdata[0].next = rdata + 1;
+       rdata[0].data = (char *) &xlrec;
+       rdata[0].len = sizeof(ginxlogVacuumPage);
+       rdata[0].buffer = InvalidBuffer;
+       rdata[0].next = &rdata[1];
 
-       rdata[1].buffer = InvalidBuffer;
-       rdata[1].len = sizeof(ginxlogVacuumPage);
-       rdata[1].data = (char *) &data;
-
-       if (len == 0)
+       if (xlrec.hole_length == 0)
        {
+               rdata[1].data = (char *) page;
+               rdata[1].len = BLCKSZ;
+               rdata[1].buffer = InvalidBuffer;
                rdata[1].next = NULL;
        }
        else
        {
-               rdata[1].next = rdata + 2;
-
+               /* must skip the hole */
+               rdata[1].data = (char *) page;
+               rdata[1].len = xlrec.hole_offset;
+               rdata[1].buffer = InvalidBuffer;
+               rdata[1].next = &rdata[2];
+
+               rdata[2].data = (char *) page + (xlrec.hole_offset + xlrec.hole_length);
+               rdata[2].len = BLCKSZ - (xlrec.hole_offset + xlrec.hole_length);
                rdata[2].buffer = InvalidBuffer;
-               rdata[2].len = len;
-               rdata[2].data = backup;
                rdata[2].next = NULL;
        }
 
@@ -158,6 +160,7 @@ ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
        Buffer          buffer;
        Page            page;
        bool            hasVoidPage = FALSE;
+       MemoryContext oldCxt;
 
        buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
                                                                RBM_NORMAL, gvs->strategy);
@@ -169,7 +172,6 @@ ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
         * again). New scan can't start but previously started ones work
         * concurrently.
         */
-
        if (isRoot)
                LockBufferForCleanup(buffer);
        else
@@ -179,32 +181,14 @@ ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
 
        if (GinPageIsLeaf(page))
        {
-               OffsetNumber newMaxOff,
-                                       oldMaxOff = GinPageGetOpaque(page)->maxoff;
-               ItemPointerData *cleaned = NULL;
-
-               newMaxOff = ginVacuumPostingList(gvs,
-                               (ItemPointer) GinDataPageGetData(page), oldMaxOff, &cleaned);
-
-               /* saves changes about deleted tuple ... */
-               if (oldMaxOff != newMaxOff)
-               {
-                       START_CRIT_SECTION();
-
-                       if (newMaxOff > 0)
-                               memcpy(GinDataPageGetData(page), cleaned, sizeof(ItemPointerData) * newMaxOff);
-                       pfree(cleaned);
-                       GinPageGetOpaque(page)->maxoff = newMaxOff;
+               oldCxt = MemoryContextSwitchTo(gvs->tmpCxt);
+               ginVacuumPostingTreeLeaf(gvs->index, buffer, gvs);
+               MemoryContextSwitchTo(oldCxt);
+               MemoryContextReset(gvs->tmpCxt);
 
-                       MarkBufferDirty(buffer);
-                       xlogVacuumPage(gvs->index, buffer);
-
-                       END_CRIT_SECTION();
-
-                       /* if root is a leaf page, we don't desire further processing */
-                       if (!isRoot && GinPageGetOpaque(page)->maxoff < FirstOffsetNumber)
-                               hasVoidPage = TRUE;
-               }
+               /* if root is a leaf page, we don't desire further processing */
+               if (!isRoot && !hasVoidPage && GinDataLeafPageIsEmpty(page))
+                       hasVoidPage = TRUE;
        }
        else
        {
@@ -224,7 +208,7 @@ ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
        }
 
        /*
-        * if we have root and theres void pages in tree, then we don't release
+        * if we have root and there are empty pages in tree, then we don't release
         * lock to go further processing and guarantee that tree is unused
         */
        if (!(isRoot && hasVoidPage))
@@ -391,6 +375,7 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
        Buffer          buffer;
        Page            page;
        bool            meDelete = FALSE;
+       bool            isempty;
 
        if (isRoot)
        {
@@ -429,7 +414,12 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
                }
        }
 
-       if (GinPageGetOpaque(page)->maxoff < FirstOffsetNumber)
+       if (GinPageIsLeaf(page))
+               isempty = GinDataLeafPageIsEmpty(page);
+       else
+               isempty = GinPageGetOpaque(page)->maxoff < FirstOffsetNumber;
+
+       if (isempty)
        {
                /* we never delete the left- or rightmost branch */
                if (me->leftBlkno != InvalidBlockNumber && !GinPageRightMost(page))
@@ -513,22 +503,47 @@ ginVacuumEntryPage(GinVacuumState *gvs, Buffer buffer, BlockNumber *roots, uint3
                }
                else if (GinGetNPosting(itup) > 0)
                {
+                       int                     nitems;
+                       ItemPointer uncompressed;
+
                        /*
-                        * if we already created a temporary page, make changes in place
+                        * Vacuum posting list with proper function for compressed and
+                        * uncompressed format.
                         */
-                       ItemPointerData *cleaned = (tmppage == origpage) ? NULL : GinGetPosting(itup);
-                       int                     newN;
-
-                       newN = ginVacuumPostingList(gvs, GinGetPosting(itup), GinGetNPosting(itup), &cleaned);
+                       if (GinItupIsCompressed(itup))
+                               uncompressed = ginPostingListDecode((GinPostingList *) GinGetPosting(itup), &nitems);
+                       else
+                       {
+                               uncompressed = (ItemPointer) GinGetPosting(itup);
+                               nitems = GinGetNPosting(itup);
+                       }
 
-                       if (GinGetNPosting(itup) != newN)
+                       uncompressed = ginVacuumItemPointers(gvs, uncompressed, nitems,
+                                                                                                &nitems);
+                       if (uncompressed)
                        {
+                               /*
+                                * Some ItemPointers were deleted, recreate tuple.
+                                */
                                OffsetNumber attnum;
                                Datum           key;
                                GinNullCategory category;
+                               GinPostingList *plist;
+                               int                     plistsize;
+
+                               if (nitems > 0)
+                               {
+                                       plist = ginCompressPostingList(uncompressed, nitems, GinMaxItemSize, NULL);
+                                       plistsize = SizeOfGinPostingList(plist);
+                               }
+                               else
+                               {
+                                       plist = NULL;
+                                       plistsize = 0;
+                               }
 
                                /*
-                                * Some ItemPointers were deleted, recreate tuple.
+                                * if we already created a temporary page, make changes in place
                                 */
                                if (tmppage == origpage)
                                {
@@ -538,15 +553,6 @@ ginVacuumEntryPage(GinVacuumState *gvs, Buffer buffer, BlockNumber *roots, uint3
                                         */
                                        tmppage = PageGetTempPageCopy(origpage);
 
-                                       if (newN > 0)
-                                       {
-                                               Size            pos = ((char *) GinGetPosting(itup)) - ((char *) origpage);
-
-                                               memcpy(tmppage + pos, cleaned, sizeof(ItemPointerData) * newN);
-                                       }
-
-                                       pfree(cleaned);
-
                                        /* set itup pointer to new page */
                                        itup = (IndexTuple) PageGetItem(tmppage, PageGetItemId(tmppage, i));
                                }
@@ -554,7 +560,10 @@ ginVacuumEntryPage(GinVacuumState *gvs, Buffer buffer, BlockNumber *roots, uint3
                                attnum = gintuple_get_attrnum(&gvs->ginstate, itup);
                                key = gintuple_get_key(&gvs->ginstate, itup, &category);
                                itup = GinFormTuple(&gvs->ginstate, attnum, key, category,
-                                                                       GinGetPosting(itup), newN, true);
+                                                                       (char *) plist, plistsize,
+                                                                       nitems, true);
+                               if (plist)
+                                       pfree(plist);
                                PageIndexTupleDelete(tmppage, i);
 
                                if (PageAddItem(tmppage, (Item) itup, IndexTupleSize(itup), i, false, false) != i)
@@ -583,6 +592,11 @@ ginbulkdelete(PG_FUNCTION_ARGS)
        BlockNumber rootOfPostingTree[BLCKSZ / (sizeof(IndexTupleData) + sizeof(ItemId))];
        uint32          nRoot;
 
+       gvs.tmpCxt = AllocSetContextCreate(CurrentMemoryContext,
+                                                                          "Gin vacuum temporary context",
+                                                                          ALLOCSET_DEFAULT_MINSIZE,
+                                                                          ALLOCSET_DEFAULT_INITSIZE,
+                                                                          ALLOCSET_DEFAULT_MAXSIZE);
        gvs.index = index;
        gvs.callback = callback;
        gvs.callback_state = callback_state;
@@ -683,6 +697,8 @@ ginbulkdelete(PG_FUNCTION_ARGS)
                LockBuffer(buffer, GIN_EXCLUSIVE);
        }
 
+       MemoryContextDelete(gvs.tmpCxt);
+
        PG_RETURN_POINTER(gvs.result);
 }
 
index c13e01a3c6a55ce63cd9ca7ab999fee6a9355968..9a978ce2bdb1bdb8a24b4797e30ef384539d837b 100644 (file)
@@ -78,7 +78,7 @@ static void
 ginRedoCreatePTree(XLogRecPtr lsn, XLogRecord *record)
 {
        ginxlogCreatePostingTree *data = (ginxlogCreatePostingTree *) XLogRecGetData(record);
-       ItemPointerData *items = (ItemPointerData *) (XLogRecGetData(record) + sizeof(ginxlogCreatePostingTree));
+       char       *ptr;
        Buffer          buffer;
        Page            page;
 
@@ -89,9 +89,14 @@ ginRedoCreatePTree(XLogRecPtr lsn, XLogRecord *record)
        Assert(BufferIsValid(buffer));
        page = (Page) BufferGetPage(buffer);
 
-       GinInitBuffer(buffer, GIN_DATA | GIN_LEAF);
-       memcpy(GinDataPageGetData(page), items, sizeof(ItemPointerData) * data->nitem);
-       GinPageGetOpaque(page)->maxoff = data->nitem;
+       GinInitBuffer(buffer, GIN_DATA | GIN_LEAF | GIN_COMPRESSED);
+
+       ptr = XLogRecGetData(record) + sizeof(ginxlogCreatePostingTree);
+
+       /* Place page data */
+       memcpy(GinDataLeafPageGetPostingList(page), ptr, data->size);
+
+       GinDataLeafPageSetPostingListSize(page, data->size);
 
        PageSetLSN(page, lsn);
 
@@ -100,11 +105,11 @@ ginRedoCreatePTree(XLogRecPtr lsn, XLogRecord *record)
 }
 
 static void
-ginRedoInsertEntry(Buffer buffer, OffsetNumber offset, BlockNumber rightblkno,
-                                  void *rdata)
+ginRedoInsertEntry(Buffer buffer, bool isLeaf, BlockNumber rightblkno, void *rdata)
 {
        Page            page = BufferGetPage(buffer);
        ginxlogInsertEntry *data = (ginxlogInsertEntry *) rdata;
+       OffsetNumber offset = data->offset;
        IndexTuple      itup;
 
        if (rightblkno != InvalidBlockNumber)
@@ -138,30 +143,43 @@ ginRedoInsertEntry(Buffer buffer, OffsetNumber offset, BlockNumber rightblkno,
 }
 
 static void
-ginRedoInsertData(Buffer buffer, OffsetNumber offset, BlockNumber rightblkno,
-                                 void *rdata)
+ginRedoRecompress(Page page, ginxlogRecompressDataLeaf *data)
+{
+       Pointer         segment;
+
+       /* Copy the new data to the right place */
+       segment = ((Pointer) GinDataLeafPageGetPostingList(page))
+               + data->unmodifiedsize;
+       memcpy(segment, data->newdata, data->length - data->unmodifiedsize);
+       GinDataLeafPageSetPostingListSize(page, data->length);
+       GinPageSetCompressed(page);
+}
+
+static void
+ginRedoInsertData(Buffer buffer, bool isLeaf, BlockNumber rightblkno, void *rdata)
 {
        Page            page = BufferGetPage(buffer);
 
-       if (GinPageIsLeaf(page))
+       if (isLeaf)
        {
-               ginxlogInsertDataLeaf *data = (ginxlogInsertDataLeaf *) rdata;
-               ItemPointerData *items = data->items;
-               OffsetNumber i;
+               ginxlogRecompressDataLeaf *data = (ginxlogRecompressDataLeaf *) rdata;
 
-               for (i = 0; i < data->nitem; i++)
-                       GinDataPageAddItemPointer(page, &items[i], offset + i);
+               Assert(GinPageIsLeaf(page));
+
+               ginRedoRecompress(page, data);
        }
        else
        {
-               PostingItem *pitem = (PostingItem *) rdata;
+               ginxlogInsertDataInternal *data = (ginxlogInsertDataInternal *) rdata;
                PostingItem *oldpitem;
 
+               Assert(!GinPageIsLeaf(page));
+
                /* update link to right page after split */
-               oldpitem = GinDataPageGetPostingItem(page, offset);
+               oldpitem = GinDataPageGetPostingItem(page, data->offset);
                PostingItemSetBlockNumber(oldpitem, rightblkno);
 
-               GinDataPageAddPostingItem(page, pitem, offset);
+               GinDataPageAddPostingItem(page, &data->newitem, data->offset);
        }
 }
 
@@ -213,12 +231,12 @@ ginRedoInsert(XLogRecPtr lsn, XLogRecord *record)
                if (data->flags & GIN_INSERT_ISDATA)
                {
                        Assert(GinPageIsData(page));
-                       ginRedoInsertData(buffer, data->offset, rightChildBlkno, payload);
+                       ginRedoInsertData(buffer, isLeaf, rightChildBlkno, payload);
                }
                else
                {
                        Assert(!GinPageIsData(page));
-                       ginRedoInsertEntry(buffer, data->offset, rightChildBlkno, payload);
+                       ginRedoInsertEntry(buffer, isLeaf, rightChildBlkno, payload);
                }
 
                PageSetLSN(page, lsn);
@@ -253,38 +271,42 @@ ginRedoSplitEntry(Page lpage, Page rpage, void *rdata)
 static void
 ginRedoSplitData(Page lpage, Page rpage, void *rdata)
 {
-       ginxlogSplitData *data = (ginxlogSplitData *) rdata;
        bool            isleaf = GinPageIsLeaf(lpage);
-       char       *ptr = (char *) rdata + sizeof(ginxlogSplitData);
-       OffsetNumber i;
-       ItemPointer bound;
 
        if (isleaf)
        {
-               ItemPointer items = (ItemPointer) ptr;
-               for (i = 0; i < data->separator; i++)
-                       GinDataPageAddItemPointer(lpage, &items[i], InvalidOffsetNumber);
-               for (i = data->separator; i < data->nitem; i++)
-                       GinDataPageAddItemPointer(rpage, &items[i], InvalidOffsetNumber);
+               ginxlogSplitDataLeaf *data = (ginxlogSplitDataLeaf *) rdata;
+               Pointer         lptr = (Pointer) rdata + sizeof(ginxlogSplitDataLeaf);
+               Pointer         rptr = lptr + data->lsize;
+
+               Assert(data->lsize > 0 && data->lsize <= GinDataLeafMaxContentSize);
+               Assert(data->rsize > 0 && data->rsize <= GinDataLeafMaxContentSize);
+
+               memcpy(GinDataLeafPageGetPostingList(lpage), lptr, data->lsize);
+               memcpy(GinDataLeafPageGetPostingList(rpage), rptr, data->rsize);
+
+               GinDataLeafPageSetPostingListSize(lpage, data->lsize);
+               GinDataLeafPageSetPostingListSize(rpage, data->rsize);
+               *GinDataPageGetRightBound(lpage) = data->lrightbound;
+               *GinDataPageGetRightBound(rpage) = data->rrightbound;
        }
        else
        {
-               PostingItem *items = (PostingItem *) ptr;
+               ginxlogSplitDataInternal *data = (ginxlogSplitDataInternal *) rdata;
+               PostingItem *items = (PostingItem *) ((char *) rdata + sizeof(ginxlogSplitDataInternal));
+               OffsetNumber i;
+               OffsetNumber maxoff;
+
                for (i = 0; i < data->separator; i++)
                        GinDataPageAddPostingItem(lpage, &items[i], InvalidOffsetNumber);
                for (i = data->separator; i < data->nitem; i++)
                        GinDataPageAddPostingItem(rpage, &items[i], InvalidOffsetNumber);
-       }
-
-       /* set up right key */
-       bound = GinDataPageGetRightBound(lpage);
-       if (isleaf)
-               *bound = *GinDataPageGetItemPointer(lpage, GinPageGetOpaque(lpage)->maxoff);
-       else
-               *bound = GinDataPageGetPostingItem(lpage, GinPageGetOpaque(lpage)->maxoff)->key;
 
-       bound = GinDataPageGetRightBound(rpage);
-       *bound = data->rightbound;
+               /* set up right key */
+               maxoff = GinPageGetOpaque(lpage)->maxoff;
+               *GinDataPageGetRightBound(lpage) = GinDataPageGetPostingItem(lpage, maxoff)->key;
+               *GinDataPageGetRightBound(rpage) = data->rightbound;
+       }
 }
 
 static void
@@ -317,9 +339,10 @@ ginRedoSplit(XLogRecPtr lsn, XLogRecord *record)
 
        if (isLeaf)
                flags |= GIN_LEAF;
-
        if (isData)
                flags |= GIN_DATA;
+       if (isLeaf && isData)
+               flags |= GIN_COMPRESSED;
 
        lbuffer = XLogReadBuffer(data->node, data->lblkno, true);
        Assert(BufferIsValid(lbuffer));
@@ -352,7 +375,7 @@ ginRedoSplit(XLogRecPtr lsn, XLogRecord *record)
                Buffer          rootBuf = XLogReadBuffer(data->node, rootBlkno, true);
                Page            rootPage = BufferGetPage(rootBuf);
 
-               GinInitBuffer(rootBuf, flags & ~GIN_LEAF);
+               GinInitBuffer(rootBuf, flags & ~GIN_LEAF & ~GIN_COMPRESSED);
 
                if (isData)
                {
@@ -383,13 +406,20 @@ ginRedoSplit(XLogRecPtr lsn, XLogRecord *record)
        UnlockReleaseBuffer(lbuffer);
 }
 
+/*
+ * This is functionally the same as heap_xlog_newpage.
+ */
 static void
 ginRedoVacuumPage(XLogRecPtr lsn, XLogRecord *record)
 {
-       ginxlogVacuumPage *data = (ginxlogVacuumPage *) XLogRecGetData(record);
+       ginxlogVacuumPage *xlrec = (ginxlogVacuumPage *) XLogRecGetData(record);
+       char       *blk = ((char *) xlrec) + sizeof(ginxlogVacuumPage);
        Buffer          buffer;
        Page            page;
 
+       Assert(xlrec->hole_offset < BLCKSZ);
+       Assert(xlrec->hole_length < BLCKSZ);
+
        /* If we have a full-page image, restore it and we're done */
        if (record->xl_info & XLR_BKP_BLOCK(0))
        {
@@ -397,41 +427,56 @@ ginRedoVacuumPage(XLogRecPtr lsn, XLogRecord *record)
                return;
        }
 
-       buffer = XLogReadBuffer(data->node, data->blkno, false);
+       buffer = XLogReadBuffer(xlrec->node, xlrec->blkno, true);
        if (!BufferIsValid(buffer))
                return;
        page = (Page) BufferGetPage(buffer);
 
-       if (lsn > PageGetLSN(page))
+       if (xlrec->hole_length == 0)
        {
-               if (GinPageIsData(page))
-               {
-                       memcpy(GinDataPageGetData(page),
-                                  XLogRecGetData(record) + sizeof(ginxlogVacuumPage),
-                                  data->nitem * GinSizeOfDataPageItem(page));
-                       GinPageGetOpaque(page)->maxoff = data->nitem;
-               }
-               else
-               {
-                       OffsetNumber i,
-                                          *tod;
-                       IndexTuple      itup = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogVacuumPage));
+               memcpy((char *) page, blk, BLCKSZ);
+       }
+       else
+       {
+               memcpy((char *) page, blk, xlrec->hole_offset);
+               /* must zero-fill the hole */
+               MemSet((char *) page + xlrec->hole_offset, 0, xlrec->hole_length);
+               memcpy((char *) page + (xlrec->hole_offset + xlrec->hole_length),
+                          blk + xlrec->hole_offset,
+                          BLCKSZ - (xlrec->hole_offset + xlrec->hole_length));
+       }
 
-                       tod = (OffsetNumber *) palloc(sizeof(OffsetNumber) * PageGetMaxOffsetNumber(page));
-                       for (i = FirstOffsetNumber; i <= PageGetMaxOffsetNumber(page); i++)
-                               tod[i - 1] = i;
+       PageSetLSN(page, lsn);
 
-                       PageIndexMultiDelete(page, tod, PageGetMaxOffsetNumber(page));
+       MarkBufferDirty(buffer);
+       UnlockReleaseBuffer(buffer);
+}
 
-                       for (i = 0; i < data->nitem; i++)
-                       {
-                               if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
-                                       elog(ERROR, "failed to add item to index page in %u/%u/%u",
-                                                data->node.spcNode, data->node.dbNode, data->node.relNode);
-                               itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup)));
-                       }
-               }
+static void
+ginRedoVacuumDataLeafPage(XLogRecPtr lsn, XLogRecord *record)
+{
+       ginxlogVacuumDataLeafPage *xlrec = (ginxlogVacuumDataLeafPage *) XLogRecGetData(record);
+       Buffer          buffer;
+       Page            page;
+
+       /* If we have a full-page image, restore it and we're done */
+       if (record->xl_info & XLR_BKP_BLOCK(0))
+       {
+               (void) RestoreBackupBlock(lsn, record, 0, false, false);
+               return;
+       }
 
+       buffer = XLogReadBuffer(xlrec->node, xlrec->blkno, false);
+       if (!BufferIsValid(buffer))
+               return;
+       page = (Page) BufferGetPage(buffer);
+
+       Assert(GinPageIsLeaf(page));
+       Assert(GinPageIsData(page));
+
+       if (lsn > PageGetLSN(page))
+       {
+               ginRedoRecompress(page, &xlrec->data);
                PageSetLSN(page, lsn);
                MarkBufferDirty(buffer);
        }
@@ -747,6 +792,9 @@ gin_redo(XLogRecPtr lsn, XLogRecord *record)
                case XLOG_GIN_VACUUM_PAGE:
                        ginRedoVacuumPage(lsn, record);
                        break;
+               case XLOG_GIN_VACUUM_DATA_LEAF_PAGE:
+                       ginRedoVacuumDataLeafPage(lsn, record);
+                       break;
                case XLOG_GIN_DELETE_PAGE:
                        ginRedoDeletePage(lsn, record);
                        break;
index f7b08ac16812d5ac736635332fd422c729a8d917..1983bcc94c7ba8504cfd2b852ff83c33c9d0afc4 100644 (file)
@@ -47,8 +47,7 @@ gin_desc(StringInfo buf, uint8 xl_info, char *rec)
 
                                appendStringInfoString(buf, "Insert item, ");
                                desc_node(buf, xlrec->node, xlrec->blkno);
-                               appendStringInfo(buf, " offset: %u isdata: %c isleaf: %c",
-                                                                xlrec->offset,
+                               appendStringInfo(buf, " isdata: %c isleaf: %c",
                                                                 (xlrec->flags & GIN_INSERT_ISDATA) ? 'T' : 'F',
                                                                 (xlrec->flags & GIN_INSERT_ISLEAF) ? 'T' : 'F');
                                if (!(xlrec->flags & GIN_INSERT_ISLEAF))
@@ -67,24 +66,50 @@ gin_desc(StringInfo buf, uint8 xl_info, char *rec)
                                        appendStringInfo(buf, " isdelete: %c",
                                                                         (((ginxlogInsertEntry *) payload)->isDelete) ? 'T' : 'F');
                                else if (xlrec->flags & GIN_INSERT_ISLEAF)
-                                       appendStringInfo(buf, " nitem: %u",
-                                                                        (((ginxlogInsertDataLeaf *) payload)->nitem));
+                               {
+                                       ginxlogRecompressDataLeaf *insertData =
+                                               (ginxlogRecompressDataLeaf *) payload;
+
+                                       appendStringInfo(buf, " unmodified: %u length: %u (compressed)",
+                                                                        insertData->unmodifiedsize,
+                                                                        insertData->length);
+                               }
                                else
+                               {
+                                       ginxlogInsertDataInternal *insertData = (ginxlogInsertDataInternal *) payload;
                                        appendStringInfo(buf, " pitem: %u-%u/%u",
-                                                                        PostingItemGetBlockNumber((PostingItem *) payload),
-                                                                        ItemPointerGetBlockNumber(&((PostingItem *) payload)->key),
-                                                                        ItemPointerGetOffsetNumber(&((PostingItem *) payload)->key));
+                                                                        PostingItemGetBlockNumber(&insertData->newitem),
+                                                                        ItemPointerGetBlockNumber(&insertData->newitem.key),
+                                                                        ItemPointerGetOffsetNumber(&insertData->newitem.key));
+                               }
                        }
                        break;
                case XLOG_GIN_SPLIT:
-                       appendStringInfoString(buf, "Page split, ");
-                       desc_node(buf, ((ginxlogSplit *) rec)->node, ((ginxlogSplit *) rec)->lblkno);
-                       appendStringInfo(buf, " isrootsplit: %c", (((ginxlogSplit *) rec)->flags & GIN_SPLIT_ROOT) ? 'T' : 'F');
+                       {
+                               ginxlogSplit *xlrec = (ginxlogSplit *) rec;
+
+                               appendStringInfoString(buf, "Page split, ");
+                               desc_node(buf, ((ginxlogSplit *) rec)->node, ((ginxlogSplit *) rec)->lblkno);
+                               appendStringInfo(buf, " isrootsplit: %c", (((ginxlogSplit *) rec)->flags & GIN_SPLIT_ROOT) ? 'T' : 'F');
+                               appendStringInfo(buf, " isdata: %c isleaf: %c",
+                                                                (xlrec->flags & GIN_INSERT_ISDATA) ? 'T' : 'F',
+                                                                (xlrec->flags & GIN_INSERT_ISLEAF) ? 'T' : 'F');
+                       }
                        break;
                case XLOG_GIN_VACUUM_PAGE:
                        appendStringInfoString(buf, "Vacuum page, ");
                        desc_node(buf, ((ginxlogVacuumPage *) rec)->node, ((ginxlogVacuumPage *) rec)->blkno);
                        break;
+               case XLOG_GIN_VACUUM_DATA_LEAF_PAGE:
+                       {
+                               ginxlogVacuumDataLeafPage *xlrec = (ginxlogVacuumDataLeafPage *) rec;
+                               appendStringInfoString(buf, "Vacuum data leaf page, ");
+                               desc_node(buf, xlrec->node, xlrec->blkno);
+                               appendStringInfo(buf, " unmodified: %u length: %u",
+                                                        xlrec->data.unmodifiedsize,
+                                                        xlrec->data.length);
+                       }
+                       break;
                case XLOG_GIN_DELETE_PAGE:
                        appendStringInfoString(buf, "Delete page, ");
                        desc_node(buf, ((ginxlogDeletePage *) rec)->node, ((ginxlogDeletePage *) rec)->blkno);
index 5f0fe80e807e0f7ed70ae534015e2bb9421102bb..3f92c376d72d2d467255e7c1979e6f59ce5e6f5e 100644 (file)
 typedef struct GinPageOpaqueData
 {
        BlockNumber rightlink;          /* next page if any */
-       OffsetNumber maxoff;            /* number entries on GIN_DATA page: number of
-                                                                * heap ItemPointers on GIN_DATA|GIN_LEAF page
-                                                                * or number of PostingItems on GIN_DATA &
-                                                                * ~GIN_LEAF page. On GIN_LIST page, number of
-                                                                * heap tuples. */
+       OffsetNumber maxoff;            /* number of PostingItems on GIN_DATA & ~GIN_LEAF page.
+                                                                * On GIN_LIST page, number of heap tuples. */
        uint16          flags;                  /* see bit definitions below */
 } GinPageOpaqueData;
 
@@ -49,6 +46,7 @@ typedef GinPageOpaqueData *GinPageOpaque;
 #define GIN_LIST                 (1 << 4)
 #define GIN_LIST_FULLROW  (1 << 5)             /* makes sense only on GIN_LIST page */
 #define GIN_INCOMPLETE_SPLIT (1 << 6)  /* page was split, but parent not updated */
+#define GIN_COMPRESSED   (1 << 7)
 
 /* Page numbers of fixed-location pages */
 #define GIN_METAPAGE_BLKNO     (0)
@@ -88,7 +86,12 @@ typedef struct GinMetaPageData
         * GIN version number (ideally this should have been at the front, but too
         * late now.  Don't move it!)
         *
-        * Currently 1 (for indexes initialized in 9.1 or later)
+        * Currently 2 (for indexes initialized in 9.4 or later)
+        *
+        * Version 1 (indexes initialized in version 9.1, 9.2 or 9.3), is
+        * compatible, but may contain uncompressed posting tree (leaf) pages and
+        * posting lists. They will be converted to compressed format when
+        * modified.
         *
         * Version 0 (indexes initialized in 9.0 or before) is compatible but may
         * be missing null entries, including both null keys and placeholders.
@@ -97,7 +100,7 @@ typedef struct GinMetaPageData
        int32           ginVersion;
 } GinMetaPageData;
 
-#define GIN_CURRENT_VERSION            1
+#define GIN_CURRENT_VERSION            2
 
 #define GinPageGetMeta(p) \
        ((GinMetaPageData *) PageGetContents(p))
@@ -116,6 +119,8 @@ typedef struct GinMetaPageData
 #define GinPageSetList(page)   ( GinPageGetOpaque(page)->flags |= GIN_LIST )
 #define GinPageHasFullRow(page)    ( GinPageGetOpaque(page)->flags & GIN_LIST_FULLROW )
 #define GinPageSetFullRow(page)   ( GinPageGetOpaque(page)->flags |= GIN_LIST_FULLROW )
+#define GinPageIsCompressed(page)    ( GinPageGetOpaque(page)->flags & GIN_COMPRESSED )
+#define GinPageSetCompressed(page)   ( GinPageGetOpaque(page)->flags |= GIN_COMPRESSED )
 
 #define GinPageIsDeleted(page) ( GinPageGetOpaque(page)->flags & GIN_DELETED)
 #define GinPageSetDeleted(page)    ( GinPageGetOpaque(page)->flags |= GIN_DELETED)
@@ -213,13 +218,16 @@ typedef signed char GinNullCategory;
 #define GinSetPostingTree(itup, blkno) ( GinSetNPosting((itup),GIN_TREE_POSTING), ItemPointerSetBlockNumber(&(itup)->t_tid, blkno) )
 #define GinGetPostingTree(itup) GinItemPointerGetBlockNumber(&(itup)->t_tid)
 
-#define GinGetPostingOffset(itup)      GinItemPointerGetBlockNumber(&(itup)->t_tid)
-#define GinSetPostingOffset(itup,n) ItemPointerSetBlockNumber(&(itup)->t_tid,n)
-#define GinGetPosting(itup)                    ((ItemPointer) ((char*)(itup) + GinGetPostingOffset(itup)))
+#define GIN_ITUP_COMPRESSED            (1 << 31)
+#define GinGetPostingOffset(itup)      (GinItemPointerGetBlockNumber(&(itup)->t_tid) & (~GIN_ITUP_COMPRESSED))
+#define GinSetPostingOffset(itup,n) ItemPointerSetBlockNumber(&(itup)->t_tid,(n)|GIN_ITUP_COMPRESSED)
+#define GinGetPosting(itup)                    ((Pointer) ((char*)(itup) + GinGetPostingOffset(itup)))
+#define GinItupIsCompressed(itup)      (GinItemPointerGetBlockNumber(&(itup)->t_tid) & GIN_ITUP_COMPRESSED)
 
 #define GinMaxItemSize \
-       MAXALIGN_DOWN(((BLCKSZ - SizeOfPageHeaderData - \
-               MAXALIGN(sizeof(GinPageOpaqueData))) / 3 - sizeof(ItemIdData)))
+       Min(INDEX_SIZE_MASK, \
+               MAXALIGN_DOWN(((BLCKSZ - SizeOfPageHeaderData -                                 \
+                                               MAXALIGN(sizeof(GinPageOpaqueData))) / 6 - sizeof(ItemIdData))))
 
 /*
  * Access macros for non-leaf entry tuples
@@ -230,30 +238,59 @@ typedef signed char GinNullCategory;
 
 /*
  * Data (posting tree) pages
+ *
+ * Posting tree pages don't store regular tuples. Non-leaf pages contain
+ * PostingItems, which are pairs of ItemPointers and child block numbers.
+ * Leaf pages contain GinPostingLists and an uncompressed array of item
+ * pointers.
+ *
+ * In a leaf page, the compressed posting lists are stored after the regular
+ * page header, one after each other. Although we don't store regular tuples,
+ * pd_lower is used to indicate the end of the posting lists. After that, free
+ * space follows.  This layout is compatible with the "standard" heap and
+ * index page layout described in bufpage.h, so that we can e.g set buffer_std
+ * when writing WAL records.
+ *
+ * In the special space is the GinPageOpaque struct.
  */
+#define GinDataLeafPageGetPostingList(page) \
+       (GinPostingList *) ((PageGetContents(page) + MAXALIGN(sizeof(ItemPointerData))))
+#define GinDataLeafPageGetPostingListSize(page) \
+       (((PageHeader) page)->pd_lower - MAXALIGN(SizeOfPageHeaderData) - MAXALIGN(sizeof(ItemPointerData)))
+#define GinDataLeafPageSetPostingListSize(page, size) \
+       { \
+               Assert(size <= GinDataLeafMaxContentSize); \
+               ((PageHeader) page)->pd_lower = (size) + MAXALIGN(SizeOfPageHeaderData) + MAXALIGN(sizeof(ItemPointerData)); \
+       }
+
+#define GinDataLeafPageIsEmpty(page) \
+       (GinPageIsCompressed(page) ? (GinDataLeafPageGetPostingListSize(page) == 0) : (GinPageGetOpaque(page)->maxoff < FirstOffsetNumber))
+
+#define GinDataLeafPageGetFreeSpace(page) PageGetExactFreeSpace(page)
+
 #define GinDataPageGetRightBound(page) ((ItemPointer) PageGetContents(page))
+/*
+ * Pointer to the data portion of a posting tree page. For internal pages,
+ * that's the beginning of the array of PostingItems. For compressed leaf
+ * pages, the first compressed posting list. For uncompressed (pre-9.4) leaf
+ * pages, it's the beginning of the ItemPointer array.
+ */
 #define GinDataPageGetData(page)       \
        (PageGetContents(page) + MAXALIGN(sizeof(ItemPointerData)))
 /* non-leaf pages contain PostingItems */
 #define GinDataPageGetPostingItem(page, i)     \
        ((PostingItem *) (GinDataPageGetData(page) + ((i)-1) * sizeof(PostingItem)))
-/* leaf pages contain ItemPointers */
-#define GinDataPageGetItemPointer(page, i)     \
-       ((ItemPointer) (GinDataPageGetData(page) + ((i)-1) * sizeof(ItemPointerData)))
-#define GinSizeOfDataPageItem(page) \
-       (GinPageIsLeaf(page) ? sizeof(ItemPointerData) : sizeof(PostingItem))
 
-#define GinDataPageGetFreeSpace(page)  \
+#define GinNonLeafDataPageGetFreeSpace(page)   \
        (BLCKSZ - MAXALIGN(SizeOfPageHeaderData) \
         - MAXALIGN(sizeof(ItemPointerData)) \
-        - GinPageGetOpaque(page)->maxoff * GinSizeOfDataPageItem(page) \
+        - GinPageGetOpaque(page)->maxoff * sizeof(PostingItem) \
         - MAXALIGN(sizeof(GinPageOpaqueData)))
 
-#define GinMaxLeafDataItems \
-       ((BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - \
-         MAXALIGN(sizeof(ItemPointerData)) - \
-         MAXALIGN(sizeof(GinPageOpaqueData))) \
-        / sizeof(ItemPointerData))
+#define GinDataLeafMaxContentSize      \
+       (BLCKSZ - MAXALIGN(SizeOfPageHeaderData) \
+        - MAXALIGN(sizeof(ItemPointerData)) \
+        - MAXALIGN(sizeof(GinPageOpaqueData)))
 
 /*
  * List pages
@@ -318,6 +355,23 @@ typedef struct GinState
        Oid                     supportCollation[INDEX_MAX_KEYS];
 } GinState;
 
+
+/*
+ * A compressed posting list.
+ *
+ * Note: This requires 2-byte alignment.
+ */
+typedef struct
+{
+       ItemPointerData first;  /* first item in this posting list (unpacked) */
+       uint16          nbytes;         /* number of bytes that follow */
+       unsigned char bytes[1]; /* varbyte encoded items (variable length) */
+} GinPostingList;
+
+#define SizeOfGinPostingList(plist) (offsetof(GinPostingList, bytes) + SHORTALIGN((plist)->nbytes) )
+#define GinNextPostingListSegment(cur) ((GinPostingList *) (((char *) (cur)) + SizeOfGinPostingList((cur))))
+
+
 /* XLog stuff */
 
 #define XLOG_GIN_CREATE_INDEX  0x00
@@ -328,18 +382,21 @@ typedef struct ginxlogCreatePostingTree
 {
        RelFileNode node;
        BlockNumber blkno;
-       uint32          nitem;
-       /* follows list of heap's ItemPointer */
+       uint32          size;
+       /* A compressed posting list follows */
 } ginxlogCreatePostingTree;
 
 #define XLOG_GIN_INSERT  0x20
 
-typedef struct ginxlogInsert
+/*
+ * The format of the insertion record varies depending on the page type.
+ * ginxlogInsert is the common part between all variants.
+ */
+typedef struct
 {
        RelFileNode node;
        BlockNumber blkno;
        uint16          flags;          /* GIN_SPLIT_ISLEAF and/or GIN_SPLIT_ISDATA */
-       OffsetNumber offset;
 
        /*
         * FOLLOWS:
@@ -358,17 +415,25 @@ typedef struct ginxlogInsert
 
 typedef struct
 {
+       OffsetNumber offset;
        bool            isDelete;
        IndexTupleData tuple;   /* variable length */
 } ginxlogInsertEntry;
 
 typedef struct
 {
-       OffsetNumber nitem;
-       ItemPointerData items[1]; /* variable length */
-} ginxlogInsertDataLeaf;
+       uint16          length;
+       uint16          unmodifiedsize;
 
-/* In an insert to an internal data page, the payload is a PostingItem */
+       /* compressed segments, variable length */
+       char            newdata[1];
+} ginxlogRecompressDataLeaf;
+
+typedef struct
+{
+       OffsetNumber offset;
+       PostingItem newitem;
+} ginxlogInsertDataInternal;
 
 
 #define XLOG_GIN_SPLIT 0x30
@@ -401,25 +466,58 @@ typedef struct
        /* FOLLOWS: IndexTuples */
 } ginxlogSplitEntry;
 
+typedef struct
+{
+       uint16          lsize;
+       uint16          rsize;
+       ItemPointerData lrightbound;    /* new right bound of left page */
+       ItemPointerData rrightbound;    /* new right bound of right page */
+
+       /* FOLLOWS: new compressed posting lists of left and right page */
+       char            newdata[1];
+} ginxlogSplitDataLeaf;
+
 typedef struct
 {
        OffsetNumber separator;
        OffsetNumber nitem;
        ItemPointerData rightbound;
 
-       /* FOLLOWS: array of ItemPointers (for leaf) or PostingItems (non-leaf) */
-} ginxlogSplitData;
+       /* FOLLOWS: array of PostingItems */
+} ginxlogSplitDataInternal;
 
+/*
+ * Vacuum simply WAL-logs the whole page, when anything is modified. This
+ * functionally identical heap_newpage records, but is kept separate for
+ * debugging purposes. (When inspecting the WAL stream, it's easier to see
+ * what's going on when GIN vacuum records are marked as such, not as heap
+ * records.) This is currently only used for entry tree leaf pages.
+ */
 #define XLOG_GIN_VACUUM_PAGE   0x40
 
 typedef struct ginxlogVacuumPage
 {
        RelFileNode node;
        BlockNumber blkno;
-       OffsetNumber nitem;
-       /* follows content of page */
+       uint16          hole_offset;    /* number of bytes before "hole" */
+       uint16          hole_length;    /* number of bytes in "hole" */
+       /* entire page contents (minus the hole) follow at end of record */
 } ginxlogVacuumPage;
 
+/*
+ * Vacuuming posting tree leaf page is WAL-logged like recompression caused
+ * by insertion.
+ */
+#define XLOG_GIN_VACUUM_DATA_LEAF_PAGE 0x90
+
+typedef struct ginxlogVacuumDataLeafPage
+{
+       RelFileNode node;
+       BlockNumber blkno;
+
+       ginxlogRecompressDataLeaf data;
+} ginxlogVacuumDataLeafPage;
+
 #define XLOG_GIN_DELETE_PAGE   0x50
 
 typedef struct ginxlogDeletePage
@@ -506,6 +604,7 @@ typedef struct GinBtreeStack
        BlockNumber blkno;
        Buffer          buffer;
        OffsetNumber off;
+       ItemPointerData iptr;
        /* predictNumber contains predicted number of pages on current level */
        uint32          predictNumber;
        struct GinBtreeStack *parent;
@@ -513,6 +612,14 @@ typedef struct GinBtreeStack
 
 typedef struct GinBtreeData *GinBtree;
 
+/* Return codes for GinBtreeData.placeToPage method */
+typedef enum
+{
+       UNMODIFIED,
+       INSERTED,
+       SPLIT
+} GinPlaceToPageRC;
+
 typedef struct GinBtreeData
 {
        /* search methods */
@@ -523,8 +630,7 @@ typedef struct GinBtreeData
 
        /* insert methods */
        OffsetNumber (*findChildPtr) (GinBtree, Page, BlockNumber, OffsetNumber);
-       bool            (*placeToPage) (GinBtree, Buffer, OffsetNumber, void *, BlockNumber, XLogRecData **);
-       Page            (*splitPage) (GinBtree, Buffer, Buffer, OffsetNumber, void *, BlockNumber, XLogRecData **);
+       GinPlaceToPageRC (*placeToPage) (GinBtree, Buffer, GinBtreeStack *, void *, BlockNumber, XLogRecData **, Page *, Page *);
        void       *(*prepareDownlink) (GinBtree, Buffer);
        void            (*fillRoot) (GinBtree, Page, BlockNumber, Page, BlockNumber, Page);
 
@@ -577,14 +683,17 @@ extern void ginInsertValue(GinBtree btree, GinBtreeStack *stack,
 /* ginentrypage.c */
 extern IndexTuple GinFormTuple(GinState *ginstate,
                         OffsetNumber attnum, Datum key, GinNullCategory category,
-                        ItemPointerData *ipd, uint32 nipd, bool errorTooBig);
-extern void GinShortenTuple(IndexTuple itup, uint32 nipd);
+                        Pointer data, Size dataSize, int nipd, bool errorTooBig);
 extern void ginPrepareEntryScan(GinBtree btree, OffsetNumber attnum,
                                        Datum key, GinNullCategory category,
                                        GinState *ginstate);
 extern void ginEntryFillRoot(GinBtree btree, Page root, BlockNumber lblkno, Page lpage, BlockNumber rblkno, Page rpage);
+extern ItemPointer ginReadTuple(GinState *ginstate, OffsetNumber attnum,
+                        IndexTuple itup, int *nitems);
 
 /* gindatapage.c */
+extern ItemPointer GinDataLeafPageGetItems(Page page, int *nitems);
+extern int GinDataLeafPageGetItemsToTbm(Page page, TIDBitmap *tbm);
 extern BlockNumber createPostingTree(Relation index,
                                  ItemPointerData *items, uint32 nitems,
                                  GinStatsData *buildStats);
@@ -598,6 +707,15 @@ extern GinBtreeStack *ginScanBeginPostingTree(Relation index, BlockNumber rootBl
 extern void ginDataFillRoot(GinBtree btree, Page root, BlockNumber lblkno, Page lpage, BlockNumber rblkno, Page rpage);
 extern void ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno);
 
+/*
+ * This is declared in ginvacuum.c, but is passed between ginVacuumItemPointers
+ * and ginVacuumPostingTreeLeaf and as an opaque struct, so we need a forward
+ * declaration for it.
+ */
+typedef struct GinVacuumState GinVacuumState;
+
+extern void ginVacuumPostingTreeLeaf(Relation rel, Buffer buf, GinVacuumState *gvs);
+
 /* ginscan.c */
 
 /*
@@ -679,7 +797,7 @@ typedef struct GinScanEntryData
 
        /* used for Posting list and one page in Posting tree */
        ItemPointerData *list;
-       uint32          nlist;
+       int                     nlist;
        OffsetNumber offset;
 
        bool            isFinished;
@@ -717,6 +835,8 @@ extern Datum gingetbitmap(PG_FUNCTION_ARGS);
 /* ginvacuum.c */
 extern Datum ginbulkdelete(PG_FUNCTION_ARGS);
 extern Datum ginvacuumcleanup(PG_FUNCTION_ARGS);
+extern ItemPointer ginVacuumItemPointers(GinVacuumState *gvs,
+                                         ItemPointerData *items, int nitem, int *nremaining);
 
 /* ginbulk.c */
 typedef struct GinEntryAccumulator
@@ -770,11 +890,17 @@ extern void ginInsertCleanup(GinState *ginstate,
                                 bool vac_delay, IndexBulkDeleteResult *stats);
 
 /* ginpostinglist.c */
-extern uint32 ginMergeItemPointers(ItemPointerData *dst,
+
+extern GinPostingList *ginCompressPostingList(const ItemPointer ptrs, int nptrs,
+                                                                  int maxsize, int *nwritten);
+extern int ginPostingListDecodeAllSegmentsToTbm(GinPostingList *ptr, int totalsize, TIDBitmap *tbm);
+
+extern ItemPointer ginPostingListDecodeAllSegments(GinPostingList *ptr, int len, int *ndecoded);
+extern ItemPointer ginPostingListDecode(GinPostingList *ptr, int *ndecoded);
+extern int ginMergeItemPointers(ItemPointerData *dst,
                                         ItemPointerData *a, uint32 na,
                                         ItemPointerData *b, uint32 nb);
 
-
 /*
  * Merging the results of several gin scans compares item pointers a lot,
  * so we want this to be inlined. But if the compiler doesn't support that,