]> granicus.if.org Git - postgresql/blobdiff - src/backend/access/gist/gistbuild.c
Report progress of CREATE INDEX operations
[postgresql] / src / backend / access / gist / gistbuild.c
index be1b202d39d7163d17b6dad934cde796afbc9106..6024671989e93cf94bcf095f7d5a71a6bd499958 100644 (file)
@@ -4,7 +4,7 @@
  *       build algorithm for GiST indexes implementation.
  *
  *
- * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
 
 #include "access/genam.h"
 #include "access/gist_private.h"
+#include "access/gistxlog.h"
+#include "access/tableam.h"
+#include "access/xloginsert.h"
 #include "catalog/index.h"
 #include "miscadmin.h"
-#include "optimizer/cost.h"
+#include "optimizer/optimizer.h"
 #include "storage/bufmgr.h"
 #include "storage/smgr.h"
 #include "utils/memutils.h"
@@ -48,23 +51,32 @@ typedef enum
                                                                 * before switching to the buffering build
                                                                 * mode */
        GIST_BUFFERING_ACTIVE           /* in buffering build mode */
-}      GistBufferingMode;
+} GistBufferingMode;
 
 /* Working state for gistbuild and its callback */
 typedef struct
 {
        Relation        indexrel;
+       Relation        heaprel;
        GISTSTATE  *giststate;
-       GISTBuildBuffers *gfbb;
 
        int64           indtuples;              /* number of tuples indexed */
        int64           indtuplesSize;  /* total size of all indexed tuples */
 
        Size            freespace;              /* amount of free space to leave on pages */
 
+       /*
+        * Extra data structures used during a buffering build. 'gfbb' contains
+        * information related to managing the build buffers. 'parentMap' is a
+        * lookup table of the parent of each internal page.
+        */
+       GISTBuildBuffers *gfbb;
+       HTAB       *parentMap;
+
        GistBufferingMode bufferingMode;
 } GISTBuildState;
 
+/* prototypes for private functions */
 static void gistInitBuffering(GISTBuildState *buildstate);
 static int     calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep);
 static void gistBuildCallback(Relation index,
@@ -76,30 +88,33 @@ static void gistBuildCallback(Relation index,
 static void gistBufferingBuildInsert(GISTBuildState *buildstate,
                                                 IndexTuple itup);
 static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
-                               GISTBufferingInsertStack *startparent);
-static void gistbufferinginserttuples(GISTBuildState *buildstate,
-                                                 Buffer buffer,
+                               BlockNumber startblkno, int startlevel);
+static BlockNumber gistbufferinginserttuples(GISTBuildState *buildstate,
+                                                 Buffer buffer, int level,
                                                  IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
-                                                 GISTBufferingInsertStack *path);
-static void gistBufferingFindCorrectParent(GISTBuildState *buildstate,
-                                                          GISTBufferingInsertStack *child);
+                                                 BlockNumber parentblk, OffsetNumber downlinkoffnum);
+static Buffer gistBufferingFindCorrectParent(GISTBuildState *buildstate,
+                                                          BlockNumber childblkno, int level,
+                                                          BlockNumber *parentblk,
+                                                          OffsetNumber *downlinkoffnum);
 static void gistProcessEmptyingQueue(GISTBuildState *buildstate);
 static void gistEmptyAllBuffers(GISTBuildState *buildstate);
-static void gistFreeUnreferencedPath(GISTBufferingInsertStack *path);
 static int     gistGetMaxLevel(Relation index);
 
+static void gistInitParentMap(GISTBuildState *buildstate);
+static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child,
+                                  BlockNumber parent);
+static void gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parent);
+static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child);
 
 /*
  * Main entry point to GiST index build. Initially calls insert over and over,
  * but switches to more efficient buffering build algorithm after a certain
  * number of tuples (unless buffering mode is disabled).
  */
-Datum
-gistbuild(PG_FUNCTION_ARGS)
+IndexBuildResult *
+gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
 {
-       Relation        heap = (Relation) PG_GETARG_POINTER(0);
-       Relation        index = (Relation) PG_GETARG_POINTER(1);
-       IndexInfo  *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
        IndexBuildResult *result;
        double          reltuples;
        GISTBuildState buildstate;
@@ -109,6 +124,7 @@ gistbuild(PG_FUNCTION_ARGS)
        int                     fillfactor;
 
        buildstate.indexrel = index;
+       buildstate.heaprel = heap;
        if (index->rd_options)
        {
                /* Get buffering mode from the options string */
@@ -168,19 +184,15 @@ gistbuild(PG_FUNCTION_ARGS)
        if (RelationNeedsWAL(index))
        {
                XLogRecPtr      recptr;
-               XLogRecData rdata;
 
-               rdata.data = (char *) &(index->rd_node);
-               rdata.len = sizeof(RelFileNode);
-               rdata.buffer = InvalidBuffer;
-               rdata.next = NULL;
+               XLogBeginInsert();
+               XLogRegisterBuffer(0, buffer, REGBUF_WILL_INIT);
 
-               recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX, &rdata);
+               recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX);
                PageSetLSN(page, recptr);
-               PageSetTLI(page, ThisTimeLineID);
        }
        else
-               PageSetLSN(page, GetXLogRecPtrForTemp());
+               PageSetLSN(page, gistGetFakeLSN(heap));
 
        UnlockReleaseBuffer(buffer);
 
@@ -193,8 +205,9 @@ gistbuild(PG_FUNCTION_ARGS)
        /*
         * Do the heap scan.
         */
-       reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
-                                                                  gistBuildCallback, (void *) &buildstate);
+       reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
+                                                                          gistBuildCallback,
+                                                                          (void *) &buildstate, NULL);
 
        /*
         * If buffering was used, flush out all the tuples that are still in the
@@ -204,6 +217,7 @@ gistbuild(PG_FUNCTION_ARGS)
        {
                elog(DEBUG1, "all tuples processed, emptying buffers");
                gistEmptyAllBuffers(&buildstate);
+               gistFreeBuildBuffers(buildstate.gfbb);
        }
 
        /* okay, all heap tuples are indexed */
@@ -220,7 +234,7 @@ gistbuild(PG_FUNCTION_ARGS)
        result->heap_tuples = reltuples;
        result->index_tuples = (double) buildstate.indtuples;
 
-       PG_RETURN_POINTER(result);
+       return result;
 }
 
 /*
@@ -228,7 +242,7 @@ gistbuild(PG_FUNCTION_ARGS)
  * and "auto" values.
  */
 void
-gistValidateBufferingOption(char *value)
+gistValidateBufferingOption(const char *value)
 {
        if (value == NULL ||
                (strcmp(value, "on") != 0 &&
@@ -238,7 +252,7 @@ gistValidateBufferingOption(char *value)
                ereport(ERROR,
                                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                 errmsg("invalid value for \"buffering\" option"),
-                          errdetail("Valid values are \"on\", \"off\" and \"auto\".")));
+                                errdetail("Valid values are \"on\", \"off\", and \"auto\".")));
        }
 }
 
@@ -285,10 +299,10 @@ gistInitBuffering(GISTBuildState *buildstate)
        itupMinSize = (Size) MAXALIGN(sizeof(IndexTupleData));
        for (i = 0; i < index->rd_att->natts; i++)
        {
-               if (index->rd_att->attrs[i]->attlen < 0)
+               if (TupleDescAttr(index->rd_att, i)->attlen < 0)
                        itupMinSize += VARHDRSZ;
                else
-                       itupMinSize += index->rd_att->attrs[i]->attlen;
+                       itupMinSize += TupleDescAttr(index->rd_att, i)->attlen;
        }
 
        /* Calculate average and maximal number of index tuples which fit to page */
@@ -313,8 +327,8 @@ gistInitBuffering(GISTBuildState *buildstate)
         * calculating levelStep is very close to Arge et al's formula. For a
         * large B, our formula gives a value that is 2x higher.
         *
-        * The average size of a subtree of depth n can be calculated as a
-        * geometric series:
+        * The average size (in pages) of a subtree of depth n can be calculated
+        * as a geometric series:
         *
         * B^0 + B^1 + B^2 + ... + B^n = (1 - B^(n + 1)) / (1 - B)
         *
@@ -343,14 +357,28 @@ gistInitBuffering(GISTBuildState *buildstate)
         * the hash table.
         */
        levelStep = 1;
-       while (
-       /* subtree must fit in cache (with safety factor of 4) */
-                  (1 - pow(avgIndexTuplesPerPage, (double) (levelStep + 1))) / (1 - avgIndexTuplesPerPage) < effective_cache_size / 4
-                  &&
-       /* each node in the lowest level of a subtree has one page in memory */
-                  (pow(maxIndexTuplesPerPage, (double) levelStep) < (maintenance_work_mem * 1024) / BLCKSZ)
-               )
+       for (;;)
        {
+               double          subtreesize;
+               double          maxlowestlevelpages;
+
+               /* size of an average subtree at this levelStep (in pages). */
+               subtreesize =
+                       (1 - pow(avgIndexTuplesPerPage, (double) (levelStep + 1))) /
+                       (1 - avgIndexTuplesPerPage);
+
+               /* max number of pages at the lowest level of a subtree */
+               maxlowestlevelpages = pow(maxIndexTuplesPerPage, (double) levelStep);
+
+               /* subtree must fit in cache (with safety factor of 4) */
+               if (subtreesize > effective_cache_size / 4)
+                       break;
+
+               /* each node in the lowest level of a subtree has one page in memory */
+               if (maxlowestlevelpages > ((double) maintenance_work_mem * 1024) / BLCKSZ)
+                       break;
+
+               /* Good, we can handle this levelStep. See if we can go one higher. */
                levelStep++;
        }
 
@@ -382,6 +410,8 @@ gistInitBuffering(GISTBuildState *buildstate)
        buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep,
                                                                                        gistGetMaxLevel(index));
 
+       gistInitParentMap(buildstate);
+
        buildstate->bufferingMode = GIST_BUFFERING_ACTIVE;
 
        elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d",
@@ -426,7 +456,7 @@ calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep)
 }
 
 /*
- * Per-tuple callback from IndexBuildHeapScan.
+ * Per-tuple callback for table_index_build_scan.
  */
 static void
 gistBuildCallback(Relation index,
@@ -458,7 +488,7 @@ gistBuildCallback(Relation index,
                 * locked, we call gistdoinsert directly.
                 */
                gistdoinsert(index, itup, buildstate->freespace,
-                                        buildstate->giststate);
+                                        buildstate->giststate, buildstate->heaprel);
        }
 
        /* Update tuple count and total size. */
@@ -504,7 +534,7 @@ static void
 gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup)
 {
        /* Insert the tuple to buffers. */
-       gistProcessItup(buildstate, itup, NULL);
+       gistProcessItup(buildstate, itup, 0, buildstate->gfbb->rootlevel);
 
        /* If we filled up (half of a) buffer, process buffer emptying. */
        gistProcessEmptyingQueue(buildstate);
@@ -518,30 +548,28 @@ gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup)
  */
 static bool
 gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
-                               GISTBufferingInsertStack *startparent)
+                               BlockNumber startblkno, int startlevel)
 {
        GISTSTATE  *giststate = buildstate->giststate;
        GISTBuildBuffers *gfbb = buildstate->gfbb;
        Relation        indexrel = buildstate->indexrel;
-       GISTBufferingInsertStack *path;
        BlockNumber childblkno;
        Buffer          buffer;
        bool            result = false;
+       BlockNumber blkno;
+       int                     level;
+       OffsetNumber downlinkoffnum = InvalidOffsetNumber;
+       BlockNumber parentblkno = InvalidBlockNumber;
 
-       /*
-        * NULL passed in startparent means that we start index tuple processing
-        * from the root.
-        */
-       if (!startparent)
-               path = gfbb->rootitem;
-       else
-               path = startparent;
+       CHECK_FOR_INTERRUPTS();
 
        /*
         * Loop until we reach a leaf page (level == 0) or a level with buffers
         * (not including the level we start at, because we would otherwise make
         * no progress).
         */
+       blkno = startblkno;
+       level = startlevel;
        for (;;)
        {
                ItemId          iid;
@@ -549,21 +577,21 @@ gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
                                        newtup;
                Page            page;
                OffsetNumber childoffnum;
-               GISTBufferingInsertStack *parent;
 
                /* Have we reached a level with buffers? */
-               if (LEVEL_HAS_BUFFERS(path->level, gfbb) && path != startparent)
+               if (LEVEL_HAS_BUFFERS(level, gfbb) && level != startlevel)
                        break;
 
                /* Have we reached a leaf page? */
-               if (path->level == 0)
+               if (level == 0)
                        break;
 
                /*
                 * Nope. Descend down to the next level then. Choose a child to
                 * descend down to.
                 */
-               buffer = ReadBuffer(indexrel, path->blkno);
+
+               buffer = ReadBuffer(indexrel, blkno);
                LockBuffer(buffer, GIST_EXCLUSIVE);
 
                page = (Page) BufferGetPage(buffer);
@@ -572,32 +600,38 @@ gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
                idxtuple = (IndexTuple) PageGetItem(page, iid);
                childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
 
+               if (level > 1)
+                       gistMemorizeParent(buildstate, childblkno, blkno);
+
                /*
                 * Check that the key representing the target child node is consistent
                 * with the key we're inserting. Update it if it's not.
                 */
                newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate);
                if (newtup)
-                       gistbufferinginserttuples(buildstate, buffer, &newtup, 1,
-                                                                         childoffnum, path);
-               UnlockReleaseBuffer(buffer);
+               {
+                       blkno = gistbufferinginserttuples(buildstate,
+                                                                                         buffer,
+                                                                                         level,
+                                                                                         &newtup,
+                                                                                         1,
+                                                                                         childoffnum,
+                                                                                         InvalidBlockNumber,
+                                                                                         InvalidOffsetNumber);
+                       /* gistbufferinginserttuples() released the buffer */
+               }
+               else
+                       UnlockReleaseBuffer(buffer);
 
-               /* Create new path item representing current page */
-               parent = path;
-               path = (GISTBufferingInsertStack *) MemoryContextAlloc(gfbb->context,
-                                                                                  sizeof(GISTBufferingInsertStack));
-               path->parent = parent;
-               path->level = parent->level - 1;
-               path->blkno = childblkno;
-               path->downlinkoffnum = childoffnum;
-               path->refCount = 0;             /* it's unreferenced for now */
-
-               /* Adjust reference count of parent */
-               if (parent)
-                       parent->refCount++;
+               /* Descend to the child */
+               parentblkno = blkno;
+               blkno = childblkno;
+               downlinkoffnum = childoffnum;
+               Assert(level > 0);
+               level--;
        }
 
-       if (LEVEL_HAS_BUFFERS(path->level, gfbb))
+       if (LEVEL_HAS_BUFFERS(level, gfbb))
        {
                /*
                 * We've reached level with buffers. Place the index tuple to the
@@ -606,8 +640,7 @@ gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
                GISTNodeBuffer *childNodeBuffer;
 
                /* Find the buffer or create a new one */
-               childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, path->blkno,
-                                                                                path->downlinkoffnum, path->parent);
+               childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, blkno, level);
 
                /* Add index tuple to it */
                gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup);
@@ -620,19 +653,15 @@ gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
                /*
                 * We've reached a leaf page. Place the tuple here.
                 */
-               buffer = ReadBuffer(indexrel, path->blkno);
+               Assert(level == 0);
+               buffer = ReadBuffer(indexrel, blkno);
                LockBuffer(buffer, GIST_EXCLUSIVE);
-               gistbufferinginserttuples(buildstate, buffer, &itup, 1,
-                                                                 InvalidOffsetNumber, path);
-               UnlockReleaseBuffer(buffer);
+               gistbufferinginserttuples(buildstate, buffer, level,
+                                                                 &itup, 1, InvalidOffsetNumber,
+                                                                 parentblkno, downlinkoffnum);
+               /* gistbufferinginserttuples() released the buffer */
        }
 
-       /*
-        * Free unreferenced path items, if any. Path item may be referenced by
-        * node buffer.
-        */
-       gistFreeUnreferencedPath(path);
-
        return result;
 }
 
@@ -640,24 +669,33 @@ gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
  * Insert tuples to a given page.
  *
  * This is analogous with gistinserttuples() in the regular insertion code.
+ *
+ * Returns the block number of the page where the (first) new or updated tuple
+ * was inserted. Usually that's the original page, but might be a sibling page
+ * if the original page was split.
+ *
+ * Caller should hold a lock on 'buffer' on entry. This function will unlock
+ * and unpin it.
  */
-static void
-gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer,
+static BlockNumber
+gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level,
                                                  IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
-                                                 GISTBufferingInsertStack *path)
+                                                 BlockNumber parentblk, OffsetNumber downlinkoffnum)
 {
        GISTBuildBuffers *gfbb = buildstate->gfbb;
        List       *splitinfo;
        bool            is_split;
+       BlockNumber placed_to_blk = InvalidBlockNumber;
 
        is_split = gistplacetopage(buildstate->indexrel,
                                                           buildstate->freespace,
                                                           buildstate->giststate,
                                                           buffer,
-                                                          itup, ntup, oldoffnum,
+                                                          itup, ntup, oldoffnum, &placed_to_blk,
                                                           InvalidBuffer,
                                                           &splitinfo,
-                                                          false);
+                                                          false,
+                                                          buildstate->heaprel);
 
        /*
         * If this is a root split, update the root path item kept in memory. This
@@ -667,33 +705,41 @@ gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer,
         */
        if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO)
        {
-               GISTBufferingInsertStack *oldroot = gfbb->rootitem;
                Page            page = BufferGetPage(buffer);
-               ItemId          iid;
-               IndexTuple      idxtuple;
-               BlockNumber leftmostchild;
+               OffsetNumber off;
+               OffsetNumber maxoff;
 
-               gfbb->rootitem = (GISTBufferingInsertStack *) MemoryContextAlloc(
-                                                       gfbb->context, sizeof(GISTBufferingInsertStack));
-               gfbb->rootitem->parent = NULL;
-               gfbb->rootitem->blkno = GIST_ROOT_BLKNO;
-               gfbb->rootitem->downlinkoffnum = InvalidOffsetNumber;
-               gfbb->rootitem->level = oldroot->level + 1;
-               gfbb->rootitem->refCount = 1;
+               Assert(level == gfbb->rootlevel);
+               gfbb->rootlevel++;
+
+               elog(DEBUG2, "splitting GiST root page, now %d levels deep", gfbb->rootlevel);
 
                /*
                 * All the downlinks on the old root page are now on one of the child
-                * pages. Change the block number of the old root entry in the stack
-                * to point to the leftmost child. The other child pages will be
-                * accessible from there by walking right.
+                * pages. Visit all the new child pages to memorize the parents of the
+                * grandchildren.
                 */
-               iid = PageGetItemId(page, FirstOffsetNumber);
-               idxtuple = (IndexTuple) PageGetItem(page, iid);
-               leftmostchild = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+               if (gfbb->rootlevel > 1)
+               {
+                       maxoff = PageGetMaxOffsetNumber(page);
+                       for (off = FirstOffsetNumber; off <= maxoff; off++)
+                       {
+                               ItemId          iid = PageGetItemId(page, off);
+                               IndexTuple      idxtuple = (IndexTuple) PageGetItem(page, iid);
+                               BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+                               Buffer          childbuf = ReadBuffer(buildstate->indexrel, childblkno);
+
+                               LockBuffer(childbuf, GIST_SHARE);
+                               gistMemorizeAllDownlinks(buildstate, childbuf);
+                               UnlockReleaseBuffer(childbuf);
 
-               oldroot->parent = gfbb->rootitem;
-               oldroot->blkno = leftmostchild;
-               oldroot->downlinkoffnum = InvalidOffsetNumber;
+                               /*
+                                * Also remember that the parent of the new child page is the
+                                * root block.
+                                */
+                               gistMemorizeParent(buildstate, childblkno, GIST_ROOT_BLKNO);
+                       }
+               }
        }
 
        if (splitinfo)
@@ -701,7 +747,8 @@ gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer,
                /*
                 * Insert the downlinks to the parent. This is analogous with
                 * gistfinishsplit() in the regular insertion code, but the locking is
-                * simpler, and we have to maintain the buffers.
+                * simpler, and we have to maintain the buffers on internal nodes and
+                * the parent map.
                 */
                IndexTuple *downlinks;
                int                     ndownlinks,
@@ -710,7 +757,12 @@ gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer,
                ListCell   *lc;
 
                /* Parent may have changed since we memorized this path. */
-               gistBufferingFindCorrectParent(buildstate, path);
+               parentBuffer =
+                       gistBufferingFindCorrectParent(buildstate,
+                                                                                  BufferGetBlockNumber(buffer),
+                                                                                  level,
+                                                                                  &parentblk,
+                                                                                  &downlinkoffnum);
 
                /*
                 * If there's a buffer associated with this page, that needs to be
@@ -722,7 +774,8 @@ gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer,
                gistRelocateBuildBuffersOnSplit(gfbb,
                                                                                buildstate->giststate,
                                                                                buildstate->indexrel,
-                                                                               path, buffer, splitinfo);
+                                                                               level,
+                                                                               buffer, splitinfo);
 
                /* Create an array of all the downlink tuples */
                ndownlinks = list_length(splitinfo);
@@ -732,125 +785,134 @@ gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer,
                {
                        GISTPageSplitInfo *splitinfo = lfirst(lc);
 
+                       /*
+                        * Remember the parent of each new child page in our parent map.
+                        * This assumes that the downlinks fit on the parent page. If the
+                        * parent page is split, too, when we recurse up to insert the
+                        * downlinks, the recursive gistbufferinginserttuples() call will
+                        * update the map again.
+                        */
+                       if (level > 0)
+                               gistMemorizeParent(buildstate,
+                                                                  BufferGetBlockNumber(splitinfo->buf),
+                                                                  BufferGetBlockNumber(parentBuffer));
+
+                       /*
+                        * Also update the parent map for all the downlinks that got moved
+                        * to a different page. (actually this also loops through the
+                        * downlinks that stayed on the original page, but it does no
+                        * harm).
+                        */
+                       if (level > 1)
+                               gistMemorizeAllDownlinks(buildstate, splitinfo->buf);
+
                        /*
                         * Since there's no concurrent access, we can release the lower
-                        * level buffers immediately. Don't release the buffer for the
-                        * original page, though, because the caller will release that.
+                        * level buffers immediately. This includes the original page.
                         */
-                       if (splitinfo->buf != buffer)
-                               UnlockReleaseBuffer(splitinfo->buf);
+                       UnlockReleaseBuffer(splitinfo->buf);
                        downlinks[i++] = splitinfo->downlink;
                }
 
                /* Insert them into parent. */
-               parentBuffer = ReadBuffer(buildstate->indexrel, path->parent->blkno);
-               LockBuffer(parentBuffer, GIST_EXCLUSIVE);
-               gistbufferinginserttuples(buildstate, parentBuffer,
-                                                                 downlinks, ndownlinks,
-                                                                 path->downlinkoffnum, path->parent);
-               UnlockReleaseBuffer(parentBuffer);
-
-               list_free_deep(splitinfo);              /* we don't need this anymore */
+               gistbufferinginserttuples(buildstate, parentBuffer, level + 1,
+                                                                 downlinks, ndownlinks, downlinkoffnum,
+                                                                 InvalidBlockNumber, InvalidOffsetNumber);
+
+               list_free_deep(splitinfo);      /* we don't need this anymore */
        }
+       else
+               UnlockReleaseBuffer(buffer);
+
+       return placed_to_blk;
 }
 
 /*
- * Find correct parent by following rightlinks in buffering index build. This
- * method of parent searching is possible because no concurrent activity is
- * possible while index builds.
+ * Find the downlink pointing to a child page.
+ *
+ * 'childblkno' indicates the child page to find the parent for. 'level' is
+ * the level of the child. On entry, *parentblkno and *downlinkoffnum can
+ * point to a location where the downlink used to be - we will check that
+ * location first, and save some cycles if it hasn't moved. The function
+ * returns a buffer containing the downlink, exclusively-locked, and
+ * *parentblkno and *downlinkoffnum are set to the real location of the
+ * downlink.
+ *
+ * If the child page is a leaf (level == 0), the caller must supply a correct
+ * parentblkno. Otherwise we use the parent map hash table to find the parent
+ * block.
+ *
+ * This function serves the same purpose as gistFindCorrectParent() during
+ * normal index inserts, but this is simpler because we don't need to deal
+ * with concurrent inserts.
  */
-static void
+static Buffer
 gistBufferingFindCorrectParent(GISTBuildState *buildstate,
-                                                          GISTBufferingInsertStack *child)
+                                                          BlockNumber childblkno, int level,
+                                                          BlockNumber *parentblkno,
+                                                          OffsetNumber *downlinkoffnum)
 {
-       GISTBuildBuffers *gfbb = buildstate->gfbb;
-       Relation        indexrel = buildstate->indexrel;
-       GISTBufferingInsertStack *parent = child->parent;
-       OffsetNumber i,
-                               maxoff;
-       ItemId          iid;
-       IndexTuple      idxtuple;
+       BlockNumber parent;
        Buffer          buffer;
        Page            page;
-       bool            copied = false;
+       OffsetNumber maxoff;
+       OffsetNumber off;
 
-       buffer = ReadBuffer(indexrel, parent->blkno);
+       if (level > 0)
+               parent = gistGetParent(buildstate, childblkno);
+       else
+       {
+               /*
+                * For a leaf page, the caller must supply a correct parent block
+                * number.
+                */
+               if (*parentblkno == InvalidBlockNumber)
+                       elog(ERROR, "no parent buffer provided of child %d", childblkno);
+               parent = *parentblkno;
+       }
+
+       buffer = ReadBuffer(buildstate->indexrel, parent);
        page = BufferGetPage(buffer);
        LockBuffer(buffer, GIST_EXCLUSIVE);
-       gistcheckpage(indexrel, buffer);
+       gistcheckpage(buildstate->indexrel, buffer);
+       maxoff = PageGetMaxOffsetNumber(page);
 
        /* Check if it was not moved */
-       if (child->downlinkoffnum != InvalidOffsetNumber &&
-               child->downlinkoffnum <= PageGetMaxOffsetNumber(page))
+       if (parent == *parentblkno && *parentblkno != InvalidBlockNumber &&
+               *downlinkoffnum != InvalidOffsetNumber && *downlinkoffnum <= maxoff)
        {
-               iid = PageGetItemId(page, child->downlinkoffnum);
-               idxtuple = (IndexTuple) PageGetItem(page, iid);
-               if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno)
+               ItemId          iid = PageGetItemId(page, *downlinkoffnum);
+               IndexTuple      idxtuple = (IndexTuple) PageGetItem(page, iid);
+
+               if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
                {
                        /* Still there */
-                       UnlockReleaseBuffer(buffer);
-                       return;
+                       return buffer;
                }
        }
 
-       /* parent has changed, look child in right links until found */
-       while (true)
+       /*
+        * Downlink was not at the offset where it used to be. Scan the page to
+        * find it. During normal gist insertions, it might've moved to another
+        * page, to the right, but during a buffering build, we keep track of the
+        * parent of each page in the lookup table so we should always know what
+        * page it's on.
+        */
+       for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off))
        {
-               /* Search for relevant downlink in the current page */
-               maxoff = PageGetMaxOffsetNumber(page);
-               for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
-               {
-                       iid = PageGetItemId(page, i);
-                       idxtuple = (IndexTuple) PageGetItem(page, iid);
-                       if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno)
-                       {
-                               /* yes!!, found */
-                               child->downlinkoffnum = i;
-                               UnlockReleaseBuffer(buffer);
-                               return;
-                       }
-               }
-
-               /*
-                * We should copy parent path item because some other path items can
-                * refer to it.
-                */
-               if (!copied)
-               {
-                       parent = (GISTBufferingInsertStack *) MemoryContextAlloc(gfbb->context,
-                                                                                  sizeof(GISTBufferingInsertStack));
-                       memcpy(parent, child->parent, sizeof(GISTBufferingInsertStack));
-                       if (parent->parent)
-                               parent->parent->refCount++;
-                       gistDecreasePathRefcount(child->parent);
-                       child->parent = parent;
-                       parent->refCount = 1;
-                       copied = true;
-               }
+               ItemId          iid = PageGetItemId(page, off);
+               IndexTuple      idxtuple = (IndexTuple) PageGetItem(page, iid);
 
-               /*
-                * Not found in current page. Move towards rightlink.
-                */
-               parent->blkno = GistPageGetOpaque(page)->rightlink;
-               UnlockReleaseBuffer(buffer);
-
-               if (parent->blkno == InvalidBlockNumber)
+               if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
                {
-                       /*
-                        * End of chain and still didn't find parent. Should not happen
-                        * during index build.
-                        */
-                       break;
+                       /* yes!!, found it */
+                       *downlinkoffnum = off;
+                       return buffer;
                }
-
-               /* Get the next page */
-               buffer = ReadBuffer(indexrel, parent->blkno);
-               page = BufferGetPage(buffer);
-               LockBuffer(buffer, GIST_EXCLUSIVE);
-               gistcheckpage(indexrel, buffer);
        }
 
-       elog(ERROR, "failed to re-find parent for block %u", child->blkno);
+       elog(ERROR, "failed to re-find parent for block %u", childblkno);
+       return InvalidBuffer;           /* keep compiler quiet */
 }
 
 /*
@@ -909,7 +971,7 @@ gistProcessEmptyingQueue(GISTBuildState *buildstate)
                         * threshold, but we might as well keep flushing tuples from it
                         * until we fill a lower-level buffer.
                         */
-                       if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->path))
+                       if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->nodeBlocknum, emptyingNodeBuffer->level))
                        {
                                /*
                                 * A lower level buffer filled up. Stop emptying this buffer,
@@ -978,45 +1040,11 @@ gistEmptyAllBuffers(GISTBuildState *buildstate)
                                gfbb->buffersOnLevels[i] =
                                        list_delete_first(gfbb->buffersOnLevels[i]);
                }
+               elog(DEBUG2, "emptied all buffers at level %d", i);
        }
        MemoryContextSwitchTo(oldCtx);
 }
 
-/*
- * Free unreferenced parts of a path stack.
- */
-static void
-gistFreeUnreferencedPath(GISTBufferingInsertStack *path)
-{
-       while (path->refCount == 0)
-       {
-               /*
-                * Path part is unreferenced. We can free it and decrease reference
-                * count of parent. If parent becomes unreferenced too procedure
-                * should be repeated for it.
-                */
-               GISTBufferingInsertStack *tmp = path->parent;
-
-               pfree(path);
-               path = tmp;
-               if (path)
-                       path->refCount--;
-               else
-                       break;
-       }
-}
-
-/*
- * Decrease reference count of a path part, and free any unreferenced parts of
- * the path stack.
- */
-void
-gistDecreasePathRefcount(GISTBufferingInsertStack *path)
-{
-       path->refCount--;
-       gistFreeUnreferencedPath(path);
-}
-
 /*
  * Get the depth of the GiST index.
  */
@@ -1060,15 +1088,119 @@ gistGetMaxLevel(Relation index)
                 * everywhere, so we just pick the first one.
                 */
                itup = (IndexTuple) PageGetItem(page,
-                                                                        PageGetItemId(page, FirstOffsetNumber));
+                                                                               PageGetItemId(page, FirstOffsetNumber));
                blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
                UnlockReleaseBuffer(buffer);
 
                /*
                 * We're going down on the tree. It means that there is yet one more
-                * level is the tree.
+                * level in the tree.
                 */
                maxLevel++;
        }
        return maxLevel;
 }
+
+
+/*
+ * Routines for managing the parent map.
+ *
+ * Whenever a page is split, we need to insert the downlinks into the parent.
+ * We need to somehow find the parent page to do that. In normal insertions,
+ * we keep a stack of nodes visited when we descend the tree. However, in
+ * buffering build, we can start descending the tree from any internal node,
+ * when we empty a buffer by cascading tuples to its children. So we don't
+ * have a full stack up to the root available at that time.
+ *
+ * So instead, we maintain a hash table to track the parent of every internal
+ * page. We don't need to track the parents of leaf nodes, however. Whenever
+ * we insert to a leaf, we've just descended down from its parent, so we know
+ * its immediate parent already. This helps a lot to limit the memory used
+ * by this hash table.
+ *
+ * Whenever an internal node is split, the parent map needs to be updated.
+ * the parent of the new child page needs to be recorded, and also the
+ * entries for all page whose downlinks are moved to a new page at the split
+ * needs to be updated.
+ *
+ * We also update the parent map whenever we descend the tree. That might seem
+ * unnecessary, because we maintain the map whenever a downlink is moved or
+ * created, but it is needed because we switch to buffering mode after
+ * creating a tree with regular index inserts. Any pages created before
+ * switching to buffering mode will not be present in the parent map initially,
+ * but will be added there the first time we visit them.
+ */
+
+typedef struct
+{
+       BlockNumber childblkno;         /* hash key */
+       BlockNumber parentblkno;
+} ParentMapEntry;
+
+static void
+gistInitParentMap(GISTBuildState *buildstate)
+{
+       HASHCTL         hashCtl;
+
+       hashCtl.keysize = sizeof(BlockNumber);
+       hashCtl.entrysize = sizeof(ParentMapEntry);
+       hashCtl.hcxt = CurrentMemoryContext;
+       buildstate->parentMap = hash_create("gistbuild parent map",
+                                                                               1024,
+                                                                               &hashCtl,
+                                                                               HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+}
+
+static void
+gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, BlockNumber parent)
+{
+       ParentMapEntry *entry;
+       bool            found;
+
+       entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
+                                                                                  (const void *) &child,
+                                                                                  HASH_ENTER,
+                                                                                  &found);
+       entry->parentblkno = parent;
+}
+
+/*
+ * Scan all downlinks on a page, and memorize their parent.
+ */
+static void
+gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parentbuf)
+{
+       OffsetNumber maxoff;
+       OffsetNumber off;
+       BlockNumber parentblkno = BufferGetBlockNumber(parentbuf);
+       Page            page = BufferGetPage(parentbuf);
+
+       Assert(!GistPageIsLeaf(page));
+
+       maxoff = PageGetMaxOffsetNumber(page);
+       for (off = FirstOffsetNumber; off <= maxoff; off++)
+       {
+               ItemId          iid = PageGetItemId(page, off);
+               IndexTuple      idxtuple = (IndexTuple) PageGetItem(page, iid);
+               BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+
+               gistMemorizeParent(buildstate, childblkno, parentblkno);
+       }
+}
+
+static BlockNumber
+gistGetParent(GISTBuildState *buildstate, BlockNumber child)
+{
+       ParentMapEntry *entry;
+       bool            found;
+
+       /* Find node buffer in hash table */
+       entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
+                                                                                  (const void *) &child,
+                                                                                  HASH_FIND,
+                                                                                  &found);
+       if (!found)
+               elog(ERROR, "could not find parent of block %d in lookup table", child);
+
+       return entry->parentblkno;
+}