1 /*-------------------------------------------------------------------------
4 * build algorithm for GiST indexes implementation.
7 * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
11 * src/backend/access/gist/gistbuild.c
13 *-------------------------------------------------------------------------
19 #include "access/genam.h"
20 #include "access/gist_private.h"
21 #include "catalog/index.h"
22 #include "miscadmin.h"
23 #include "optimizer/cost.h"
24 #include "storage/bufmgr.h"
25 #include "storage/smgr.h"
26 #include "utils/memutils.h"
27 #include "utils/rel.h"
29 /* Interval (in index tuples) between checks of whether to switch to buffering build mode */
30 #define BUFFERING_MODE_SWITCH_CHECK_STEP 256
33 * Number of tuples to process in the slow way before switching to buffering
34 * mode, when buffering is explicitly turned on. Also, the number of tuples
35 * to process between readjusting the buffer size parameter, while in
38 #define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096
42 GIST_BUFFERING_DISABLED, /* in regular build mode and aren't going to
44 GIST_BUFFERING_AUTO, /* in regular build mode, but will switch to
45 * buffering build mode if the index grows too
47 GIST_BUFFERING_STATS, /* gathering statistics of index tuple size
48 * before switching to the buffering build
50 GIST_BUFFERING_ACTIVE /* in buffering build mode */
53 /* Working state for gistbuild and its callback */
59 int64 indtuples; /* number of tuples indexed */
60 int64 indtuplesSize; /* total size of all indexed tuples */
62 Size freespace; /* amount of free space to leave on pages */
65 * Extra data structures used during a buffering build. 'gfbb' contains
66 * information related to managing the build buffers. 'parentMap' is a
67 * lookup table of the parent of each internal page.
69 GISTBuildBuffers *gfbb;
72 GistBufferingMode bufferingMode;
75 /* prototypes for private functions */
76 static void gistInitBuffering(GISTBuildState *buildstate);
77 static int calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep);
78 static void gistBuildCallback(Relation index,
84 static void gistBufferingBuildInsert(GISTBuildState *buildstate,
86 static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
87 BlockNumber startblkno, int startlevel);
88 static void gistbufferinginserttuples(GISTBuildState *buildstate,
89 Buffer buffer, int level,
90 IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
91 BlockNumber parentblk, OffsetNumber downlinkoffnum);
92 static Buffer gistBufferingFindCorrectParent(GISTBuildState *buildstate,
93 BlockNumber childblkno, int level,
94 BlockNumber *parentblk,
95 OffsetNumber *downlinkoffnum);
96 static void gistProcessEmptyingQueue(GISTBuildState *buildstate);
97 static void gistEmptyAllBuffers(GISTBuildState *buildstate);
98 static int gistGetMaxLevel(Relation index);
100 static void gistInitParentMap(GISTBuildState *buildstate);
101 static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child,
103 static void gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parent);
104 static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child);
107 * Main entry point to GiST index build. Initially calls insert over and over,
108 * but switches to more efficient buffering build algorithm after a certain
109 * number of tuples (unless buffering mode is disabled).
112 gistbuild(PG_FUNCTION_ARGS)
114 Relation heap = (Relation) PG_GETARG_POINTER(0);
115 Relation index = (Relation) PG_GETARG_POINTER(1);
116 IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
117 IndexBuildResult *result;
119 GISTBuildState buildstate;
122 MemoryContext oldcxt = CurrentMemoryContext;
/* Pick buffering mode and fillfactor from the index's reloptions, if any */
125 buildstate.indexrel = index;
126 if (index->rd_options)
128 /* Get buffering mode from the options string */
129 GiSTOptions *options = (GiSTOptions *) index->rd_options;
130 char *bufferingMode = (char *) options + options->bufferingModeOffset;
132 if (strcmp(bufferingMode, "on") == 0)
/* "on": gather tuple-size stats first, then switch to buffering */
133 buildstate.bufferingMode = GIST_BUFFERING_STATS;
134 else if (strcmp(bufferingMode, "off") == 0)
135 buildstate.bufferingMode = GIST_BUFFERING_DISABLED;
137 buildstate.bufferingMode = GIST_BUFFERING_AUTO;
139 fillfactor = options->fillfactor;
144 * By default, switch to buffering mode when the index grows too large
147 buildstate.bufferingMode = GIST_BUFFERING_AUTO;
148 fillfactor = GIST_DEFAULT_FILLFACTOR;
150 /* Calculate target amount of free space to leave on pages */
151 buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100;
154 * We expect to be called exactly once for any index relation. If that's
155 * not the case, big trouble's what we have.
157 if (RelationGetNumberOfBlocks(index) != 0)
158 elog(ERROR, "index \"%s\" already contains data",
159 RelationGetRelationName(index));
162 * We can't yet handle unlogged GiST indexes, because we depend on LSNs.
163 * This is duplicative of an error in gistbuildempty, but we want to check
164 * here so as to throw error before doing all the index-build work.
166 if (heap->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED)
168 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
169 errmsg("unlogged GiST indexes are not supported")));
171 /* no locking is needed */
172 buildstate.giststate = initGISTstate(index);
175 * Create a temporary memory context that is reset once for each tuple
176 * processed. (Note: we don't bother to make this a child of the
177 * giststate's scanCxt, so we have to delete it separately at the end.)
179 buildstate.giststate->tempCxt = createTempGistContext();
181 /* initialize the root page */
182 buffer = gistNewBuffer(index);
183 Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
184 page = BufferGetPage(buffer);
186 START_CRIT_SECTION();
/* Root starts out as an (empty) leaf page */
188 GISTInitBuffer(buffer, F_LEAF);
190 MarkBufferDirty(buffer);
/* WAL-log creation of the root page (permanent relations only) */
192 if (RelationNeedsWAL(index))
197 rdata.data = (char *) &(index->rd_node);
198 rdata.len = sizeof(RelFileNode);
199 rdata.buffer = InvalidBuffer;
202 recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX, &rdata);
203 PageSetLSN(page, recptr);
204 PageSetTLI(page, ThisTimeLineID);
/* Non-WAL path: fake an LSN so LSN-dependent logic still works */
207 PageSetLSN(page, GetXLogRecPtrForTemp());
209 UnlockReleaseBuffer(buffer);
213 /* build the index */
214 buildstate.indtuples = 0;
215 buildstate.indtuplesSize = 0;
/* Scan the heap, inserting every tuple via gistBuildCallback */
220 reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
221 gistBuildCallback, (void *) &buildstate);
224 * If buffering was used, flush out all the tuples that are still in the
227 if (buildstate.bufferingMode == GIST_BUFFERING_ACTIVE)
229 elog(DEBUG1, "all tuples processed, emptying buffers");
230 gistEmptyAllBuffers(&buildstate);
231 gistFreeBuildBuffers(buildstate.gfbb);
234 /* okay, all heap tuples are indexed */
235 MemoryContextSwitchTo(oldcxt);
236 MemoryContextDelete(buildstate.giststate->tempCxt);
238 freeGISTstate(buildstate.giststate);
/* Report heap and index tuple counts back to the caller */
243 result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
245 result->heap_tuples = reltuples;
246 result->index_tuples = (double) buildstate.indtuples;
248 PG_RETURN_POINTER(result);
252 * Validator for "buffering" reloption on GiST indexes. Allows "on", "off"
256 gistValidateBufferingOption(char *value)
/* Reject any value other than exactly "on", "off", or "auto" */
259 (strcmp(value, "on") != 0 &&
260 strcmp(value, "off") != 0 &&
261 strcmp(value, "auto") != 0))
264 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
265 errmsg("invalid value for \"buffering\" option"),
266 errdetail("Valid values are \"on\", \"off\", and \"auto\".")));
271 * Attempt to switch to buffering mode.
273 * If there is not enough memory for buffering build, sets bufferingMode
274 * to GIST_BUFFERING_DISABLED, so that we don't bother to try the switch
275 * anymore. Otherwise initializes the build buffers, and sets bufferingMode to
276 * GIST_BUFFERING_ACTIVE.
279 gistInitBuffering(GISTBuildState *buildstate)
281 Relation index = buildstate->indexrel;
286 double avgIndexTuplesPerPage,
287 maxIndexTuplesPerPage;
291 /* Calc space of index page which is available for index tuples */
292 pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
294 - buildstate->freespace;
297 * Calculate average size of already inserted index tuples using gathered
300 itupAvgSize = (double) buildstate->indtuplesSize /
301 (double) buildstate->indtuples;
304 * Calculate minimal possible size of index tuple by index metadata.
305 * Minimal possible size of varlena is VARHDRSZ.
307 * XXX: that's not actually true, as a short varlen can be just 2 bytes.
308 * And we should take padding into account here.
310 itupMinSize = (Size) MAXALIGN(sizeof(IndexTupleData));
/* attlen < 0 means a variable-length attribute; otherwise fixed width */
311 for (i = 0; i < index->rd_att->natts; i++)
313 if (index->rd_att->attrs[i]->attlen < 0)
314 itupMinSize += VARHDRSZ;
316 itupMinSize += index->rd_att->attrs[i]->attlen;
319 /* Calculate average and maximal number of index tuples which fit on a page */
320 avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
321 maxIndexTuplesPerPage = pageFreeSpace / itupMinSize;
324 * We need to calculate two parameters for the buffering algorithm:
325 * levelStep and pagesPerBuffer.
327 * levelStep determines the size of subtree that we operate on, while
328 * emptying a buffer. A higher value is better, as you need fewer buffer
329 * emptying steps to build the index. However, if you set it too high, the
330 * subtree doesn't fit in cache anymore, and you quickly lose the benefit
333 * In Arge et al's paper, levelStep is chosen as logB(M/4B), where B is
334 * the number of tuples on page (ie. fanout), and M is the amount of
335 * internal memory available. Curiously, they don't explain *why* that
336 * setting is optimal. We calculate it by taking the highest levelStep so
337 * that a subtree still fits in cache. For a small B, our way of
338 * calculating levelStep is very close to Arge et al's formula. For a
339 * large B, our formula gives a value that is 2x higher.
341 * The average size (in pages) of a subtree of depth n can be calculated
342 * as a geometric series:
344 * B^0 + B^1 + B^2 + ... + B^n = (1 - B^(n + 1)) / (1 - B)
346 * where B is the average number of index tuples on page. The subtree is
347 * cached in the shared buffer cache and the OS cache, so we choose
348 * levelStep so that the subtree size is comfortably smaller than
349 * effective_cache_size, with a safety factor of 4.
351 * The estimate on the average number of index tuples on page is based on
352 * average tuple sizes observed before switching to buffered build, so the
353 * real subtree size can be somewhat larger. Also, it would be selfish to
354 * gobble the whole cache for our index build. The safety factor of 4
355 * should account for those effects.
357 * The other limiting factor for setting levelStep is that while
358 * processing a subtree, we need to hold one page for each buffer at the
359 * next lower buffered level. The max. number of buffers needed for that
360 * is maxIndexTuplesPerPage^levelStep. This is very conservative, but
361 * hopefully maintenance_work_mem is set high enough that you're
362 * constrained by effective_cache_size rather than maintenance_work_mem.
364 * XXX: the buffer hash table consumes a fair amount of memory too per
365 * buffer, but that is not currently taken into account. That scales on
366 * the total number of buffers used, ie. the index size and on levelStep.
367 * Note that a higher levelStep *reduces* the amount of memory needed for
/* NOTE(review): the loop header that raises levelStep is not visible in
 * this excerpt; the body below evaluates one candidate levelStep. */
374 double maxlowestlevelpages;
376 /* size of an average subtree at this levelStep (in pages). */
378 (1 - pow(avgIndexTuplesPerPage, (double) (levelStep + 1))) /
379 (1 - avgIndexTuplesPerPage);
381 /* max number of pages at the lowest level of a subtree */
382 maxlowestlevelpages = pow(maxIndexTuplesPerPage, (double) levelStep);
384 /* subtree must fit in cache (with safety factor of 4) */
385 if (subtreesize > effective_cache_size / 4)
388 /* each node in the lowest level of a subtree has one page in memory */
389 if (maxlowestlevelpages > ((double) maintenance_work_mem * 1024) / BLCKSZ)
392 /* Good, we can handle this levelStep. See if we can go one higher. */
397 * We just reached an unacceptable value of levelStep in previous loop.
398 * So, decrease levelStep to get last acceptable value.
403 * If there's not enough cache or maintenance_work_mem, fall back to plain
/* Buffering permanently disabled for this build; we won't retry */
408 elog(DEBUG1, "failed to switch to buffered GiST build");
409 buildstate->bufferingMode = GIST_BUFFERING_DISABLED;
414 * The second parameter to set is pagesPerBuffer, which determines the
415 * size of each buffer. We adjust pagesPerBuffer also during the build,
416 * which is why this calculation is in a separate function.
418 pagesPerBuffer = calculatePagesPerBuffer(buildstate, levelStep);
420 /* Initialize GISTBuildBuffers with these parameters */
421 buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep,
422 gistGetMaxLevel(index));
424 gistInitParentMap(buildstate);
426 buildstate->bufferingMode = GIST_BUFFERING_ACTIVE;
428 elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d",
429 levelStep, pagesPerBuffer);
433 * Calculate pagesPerBuffer parameter for the buffering algorithm.
435 * Buffer size is chosen so that assuming that tuples are distributed
436 * randomly, emptying half a buffer fills on average one page in every buffer
437 * at the next lower level.
440 calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep)
442 double pagesPerBuffer;
443 double avgIndexTuplesPerPage;
447 /* Calc space of index page which is available for index tuples */
448 pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
450 - buildstate->freespace;
453 * Calculate average size of already inserted index tuples using gathered
456 itupAvgSize = (double) buildstate->indtuplesSize /
457 (double) buildstate->indtuples;
459 avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
462 * Recalculate required size of buffers.
/* Half a buffer must hold ~fanout^levelStep tuples (one per page at the
 * next buffered level down), hence the factor of 2. */
464 pagesPerBuffer = 2 * pow(avgIndexTuplesPerPage, levelStep);
466 return (int) rint(pagesPerBuffer);
470 * Per-tuple callback from IndexBuildHeapScan.
473 gistBuildCallback(Relation index,
480 GISTBuildState *buildstate = (GISTBuildState *) state;
482 MemoryContext oldCtx;
/* Do all per-tuple work in the short-lived temp context */
484 oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
486 /* form an index tuple and point it at the heap tuple */
487 itup = gistFormTuple(buildstate->giststate, index, values, isnull, true);
488 itup->t_tid = htup->t_self;
490 if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE)
492 /* We have buffers, so use them. */
493 gistBufferingBuildInsert(buildstate, itup);
498 * There's no buffers (yet). Since we already have the index relation
499 * locked, we call gistdoinsert directly.
501 gistdoinsert(index, itup, buildstate->freespace,
502 buildstate->giststate);
505 /* Update tuple count and total size. */
506 buildstate->indtuples += 1;
507 buildstate->indtuplesSize += IndexTupleSize(itup);
509 MemoryContextSwitchTo(oldCtx);
510 MemoryContextReset(buildstate->giststate->tempCxt);
/* Periodically re-tune the buffer size from the running tuple-size stats */
512 if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE &&
513 buildstate->indtuples % BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET == 0)
515 /* Adjust the target buffer size now */
516 buildstate->gfbb->pagesPerBuffer =
517 calculatePagesPerBuffer(buildstate, buildstate->gfbb->levelStep);
521 * In 'auto' mode, check if the index has grown too large to fit in cache,
522 * and switch to buffering mode if it has.
524 * To avoid excessive calls to smgrnblocks(), only check this every
525 * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples
527 if ((buildstate->bufferingMode == GIST_BUFFERING_AUTO &&
528 buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 &&
529 effective_cache_size < smgrnblocks(index->rd_smgr, MAIN_FORKNUM)) ||
530 (buildstate->bufferingMode == GIST_BUFFERING_STATS &&
531 buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET))
534 * Index doesn't fit in effective cache anymore. Try to switch to
535 * buffering build mode.
537 gistInitBuffering(buildstate);
542 * Insert function for buffering index build.
545 gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup)
/* Start the descent at the root (block 0; presumably GIST_ROOT_BLKNO —
 * TODO confirm against gist_private.h), at the tree's current root level. */
547 /* Insert the tuple to buffers. */
548 gistProcessItup(buildstate, itup, 0, buildstate->gfbb->rootlevel);
550 /* If we filled up (half of a) buffer, process buffer emptying. */
551 gistProcessEmptyingQueue(buildstate);
555 * Process an index tuple. Runs the tuple down the tree until we reach a leaf
556 * page or node buffer, and inserts the tuple there. Returns true if we have
557 * to stop buffer emptying process (because one of child buffers can't take
558 * index tuples anymore).
561 gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
562 BlockNumber startblkno, int startlevel)
564 GISTSTATE *giststate = buildstate->giststate;
565 GISTBuildBuffers *gfbb = buildstate->gfbb;
566 Relation indexrel = buildstate->indexrel;
567 BlockNumber childblkno;
572 OffsetNumber downlinkoffnum = InvalidOffsetNumber;
573 BlockNumber parentblkno = InvalidBlockNumber;
575 CHECK_FOR_INTERRUPTS();
578 * Loop until we reach a leaf page (level == 0) or a level with buffers
579 * (not including the level we start at, because we would otherwise make
590 OffsetNumber childoffnum;
592 /* Have we reached a level with buffers? */
593 if (LEVEL_HAS_BUFFERS(level, gfbb) && level != startlevel)
596 /* Have we reached a leaf page? */
601 * Nope. Descend down to the next level then. Choose a child to
605 buffer = ReadBuffer(indexrel, blkno);
606 LockBuffer(buffer, GIST_EXCLUSIVE);
608 page = (Page) BufferGetPage(buffer);
/* gistchoose() picks the downlink with least penalty for this tuple */
609 childoffnum = gistchoose(indexrel, page, itup, giststate);
610 iid = PageGetItemId(page, childoffnum);
611 idxtuple = (IndexTuple) PageGetItem(page, iid);
612 childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
/* Record the child's parent in the parent map as we pass by */
615 gistMemorizeParent(buildstate, childblkno, blkno);
618 * Check that the key representing the target child node is consistent
619 * with the key we're inserting. Update it if it's not.
621 newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate);
/* Replacing the downlink key; this call also releases 'buffer' */
624 gistbufferinginserttuples(buildstate, buffer, level,
625 &newtup, 1, childoffnum,
626 InvalidBlockNumber, InvalidOffsetNumber);
627 /* gistbufferinginserttuples() released the buffer */
630 UnlockReleaseBuffer(buffer);
632 /* Descend to the child */
635 downlinkoffnum = childoffnum;
640 if (LEVEL_HAS_BUFFERS(level, gfbb))
643 * We've reached level with buffers. Place the index tuple to the
644 * buffer, and add the buffer to the emptying queue if it overflows.
646 GISTNodeBuffer *childNodeBuffer;
648 /* Find the buffer or create a new one */
649 childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, blkno, level);
651 /* Add index tuple to it */
652 gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup);
654 if (BUFFER_OVERFLOWED(childNodeBuffer, gfbb))
660 * We've reached a leaf page. Place the tuple here.
663 buffer = ReadBuffer(indexrel, blkno);
664 LockBuffer(buffer, GIST_EXCLUSIVE);
665 gistbufferinginserttuples(buildstate, buffer, level,
666 &itup, 1, InvalidOffsetNumber,
667 parentblkno, downlinkoffnum);
668 /* gistbufferinginserttuples() released the buffer */
675 * Insert tuples to a given page.
677 * This is analogous with gistinserttuples() in the regular insertion code.
679 * Caller should hold a lock on 'buffer' on entry. This function will unlock
683 gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level,
684 IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
685 BlockNumber parentblk, OffsetNumber downlinkoffnum)
687 GISTBuildBuffers *gfbb = buildstate->gfbb;
691 is_split = gistplacetopage(buildstate->indexrel,
692 buildstate->freespace,
693 buildstate->giststate,
695 itup, ntup, oldoffnum,
701 * If this is a root split, update the root path item kept in memory. This
702 * ensures that all path stacks are always complete, including all parent
703 * nodes up to the root. That simplifies the algorithm to re-find correct
706 if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO)
708 Page page = BufferGetPage(buffer);
712 Assert(level == gfbb->rootlevel);
715 elog(DEBUG2, "splitting GiST root page, now %d levels deep", gfbb->rootlevel);
718 * All the downlinks on the old root page are now on one of the child
719 * pages. Visit all the new child pages to memorize the parents of the
722 if (gfbb->rootlevel > 1)
724 maxoff = PageGetMaxOffsetNumber(page);
725 for (off = FirstOffsetNumber; off <= maxoff; off++)
727 ItemId iid = PageGetItemId(page, off);
728 IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
729 BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
730 Buffer childbuf = ReadBuffer(buildstate->indexrel, childblkno);
/* Share lock suffices: no concurrent writers during index build */
732 LockBuffer(childbuf, GIST_SHARE);
733 gistMemorizeAllDownlinks(buildstate, childbuf);
734 UnlockReleaseBuffer(childbuf);
737 * Also remember that the parent of the new child page is the
740 gistMemorizeParent(buildstate, childblkno, GIST_ROOT_BLKNO);
748 * Insert the downlinks to the parent. This is analogous with
749 * gistfinishsplit() in the regular insertion code, but the locking is
750 * simpler, and we have to maintain the buffers on internal nodes and
753 IndexTuple *downlinks;
759 /* Parent may have changed since we memorized this path. */
761 gistBufferingFindCorrectParent(buildstate,
762 BufferGetBlockNumber(buffer),
768 * If there's a buffer associated with this page, that needs to be
769 * split too. gistRelocateBuildBuffersOnSplit() will also adjust the
770 * downlinks in 'splitinfo', to make sure they're consistent not only
771 * with the tuples already on the pages, but also the tuples in the
772 * buffers that will eventually be inserted to them.
774 gistRelocateBuildBuffersOnSplit(gfbb,
775 buildstate->giststate,
776 buildstate->indexrel,
780 /* Create an array of all the downlink tuples */
781 ndownlinks = list_length(splitinfo);
782 downlinks = (IndexTuple *) palloc(sizeof(IndexTuple) * ndownlinks);
784 foreach(lc, splitinfo)
/* NOTE(review): this inner 'splitinfo' (a GISTPageSplitInfo *) shadows
 * the outer 'splitinfo' list iterated by foreach — confusing naming. */
786 GISTPageSplitInfo *splitinfo = lfirst(lc);
789 * Remember the parent of each new child page in our parent map.
790 * This assumes that the downlinks fit on the parent page. If the
791 * parent page is split, too, when we recurse up to insert the
792 * downlinks, the recursive gistbufferinginserttuples() call will
793 * update the map again.
796 gistMemorizeParent(buildstate,
797 BufferGetBlockNumber(splitinfo->buf),
798 BufferGetBlockNumber(parentBuffer));
801 * Also update the parent map for all the downlinks that got moved
802 * to a different page. (actually this also loops through the
803 * downlinks that stayed on the original page, but it does no
807 gistMemorizeAllDownlinks(buildstate, splitinfo->buf);
810 * Since there's no concurrent access, we can release the lower
811 * level buffers immediately. This includes the original page.
813 UnlockReleaseBuffer(splitinfo->buf);
814 downlinks[i++] = splitinfo->downlink;
/* Recurse one level up; this may itself split the parent */
817 /* Insert them into parent. */
818 gistbufferinginserttuples(buildstate, parentBuffer, level + 1,
819 downlinks, ndownlinks, downlinkoffnum,
820 InvalidBlockNumber, InvalidOffsetNumber);
822 list_free_deep(splitinfo); /* we don't need this anymore */
825 UnlockReleaseBuffer(buffer);
829 * Find the downlink pointing to a child page.
831 * 'childblkno' indicates the child page to find the parent for. 'level' is
832 * the level of the child. On entry, *parentblkno and *downlinkoffnum can
833 * point to a location where the downlink used to be - we will check that
834 * location first, and save some cycles if it hasn't moved. The function
835 * returns a buffer containing the downlink, exclusively-locked, and
836 * *parentblkno and *downlinkoffnum are set to the real location of the
839 * If the child page is a leaf (level == 0), the caller must supply a correct
840 * parentblkno. Otherwise we use the parent map hash table to find the parent
843 * This function serves the same purpose as gistFindCorrectParent() during
844 * normal index inserts, but this is simpler because we don't need to deal
845 * with concurrent inserts.
848 gistBufferingFindCorrectParent(GISTBuildState *buildstate,
849 BlockNumber childblkno, int level,
850 BlockNumber *parentblkno,
851 OffsetNumber *downlinkoffnum)
/* Internal pages: consult the parent map built during descent */
860 parent = gistGetParent(buildstate, childblkno);
864 * For a leaf page, the caller must supply a correct parent block
867 if (*parentblkno == InvalidBlockNumber)
/* NOTE(review): childblkno is a BlockNumber (uint32); "%d" here is
 * inconsistent with the "%u" used in the elog below — should be "%u". */
868 elog(ERROR, "no parent buffer provided of child %d", childblkno);
869 parent = *parentblkno;
872 buffer = ReadBuffer(buildstate->indexrel, parent);
873 page = BufferGetPage(buffer);
874 LockBuffer(buffer, GIST_EXCLUSIVE);
875 gistcheckpage(buildstate->indexrel, buffer);
876 maxoff = PageGetMaxOffsetNumber(page);
/* Fast path: the cached (parentblkno, downlinkoffnum) may still be valid */
878 /* Check if it was not moved */
879 if (parent == *parentblkno && *parentblkno != InvalidBlockNumber &&
880 *downlinkoffnum != InvalidOffsetNumber && *downlinkoffnum <= maxoff)
882 ItemId iid = PageGetItemId(page, *downlinkoffnum);
883 IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
885 if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
893 * Downlink was not at the offset where it used to be. Scan the page to
894 * find it. During normal gist insertions, it might've moved to another
895 * page, to the right, but during a buffering build, we keep track of the
896 * parent of each page in the lookup table so we should always know what
899 for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off))
901 ItemId iid = PageGetItemId(page, off);
902 IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
904 if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
906 /* yes!!, found it */
907 *downlinkoffnum = off;
912 elog(ERROR, "failed to re-find parent for block %u", childblkno);
913 return InvalidBuffer; /* keep compiler quiet */
917 * Process buffers emptying stack. Emptying of one buffer can cause emptying
918 * of other buffers. This function iterates until this cascading emptying
919 * process is finished, i.e. until the buffer emptying stack is empty.
922 gistProcessEmptyingQueue(GISTBuildState *buildstate)
924 GISTBuildBuffers *gfbb = buildstate->gfbb;
926 /* Iterate while we have elements in buffers emptying stack. */
927 while (gfbb->bufferEmptyingQueue != NIL)
929 GISTNodeBuffer *emptyingNodeBuffer;
931 /* Get node buffer from emptying stack. */
932 emptyingNodeBuffer = (GISTNodeBuffer *) linitial(gfbb->bufferEmptyingQueue);
933 gfbb->bufferEmptyingQueue = list_delete_first(gfbb->bufferEmptyingQueue);
/* Clear the flag so the buffer can be re-queued if it overflows again */
934 emptyingNodeBuffer->queuedForEmptying = false;
937 * We are going to load the last pages of the buffers that emptying
938 * will write to. So let's unload any previously loaded buffers.
940 gistUnloadNodeBuffers(gfbb);
943 * Pop tuples from the buffer and run them down to the buffers at
944 * lower level, or leaf pages. We continue until one of the lower
945 * level buffers fills up, or this buffer runs empty.
947 * In Arge et al's paper, the buffer emptying is stopped after
948 * processing 1/2 node buffer worth of tuples, to avoid overfilling
949 * any of the lower level buffers. However, it's more efficient to
950 * keep going until one of the lower level buffers actually fills up,
951 * so that's what we do. This doesn't need to be exact, if a buffer
952 * overfills by a few tuples, there's no harm done.
958 /* Get next index tuple from the buffer */
959 if (!gistPopItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup))
963 * Run it down to the underlying node buffer or leaf page.
965 * Note: it's possible that the buffer we're emptying splits as a
966 * result of this call. If that happens, our emptyingNodeBuffer
967 * points to the left half of the split. After split, it's very
968 * likely that the new left buffer is no longer over the half-full
969 * threshold, but we might as well keep flushing tuples from it
970 * until we fill a lower-level buffer.
972 if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->nodeBlocknum, emptyingNodeBuffer->level))
975 * A lower level buffer filled up. Stop emptying this buffer,
976 * to avoid overflowing the lower level buffer.
981 /* Free all the memory allocated during index tuple processing */
982 MemoryContextReset(buildstate->giststate->tempCxt);
988 * Empty all node buffers, from top to bottom. This is done at the end of
989 * index build to flush all remaining tuples to the index.
991 * Note: This destroys the buffersOnLevels lists, so the buffers should not
992 * be inserted to after this call.
995 gistEmptyAllBuffers(GISTBuildState *buildstate)
997 GISTBuildBuffers *gfbb = buildstate->gfbb;
998 MemoryContext oldCtx;
1001 oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
1004 * Iterate through the levels from top to bottom.
1006 for (i = gfbb->buffersOnLevelsLen - 1; i >= 0; i--)
1009 * Empty all buffers on this level. Note that new buffers can pop up
1010 * in the list during the processing, as a result of page splits, so a
1011 * simple walk through the list won't work. We remove buffers from the
1012 * list when we see them empty; a buffer can't become non-empty once
1013 * it's been fully emptied.
1015 while (gfbb->buffersOnLevels[i] != NIL)
1017 GISTNodeBuffer *nodeBuffer;
1019 nodeBuffer = (GISTNodeBuffer *) linitial(gfbb->buffersOnLevels[i]);
1021 if (nodeBuffer->blocksCount != 0)
1024 * Add this buffer to the emptying queue, and proceed to empty
1027 if (!nodeBuffer->queuedForEmptying)
/* Queue cells must live in gfbb's long-lived context, not tempCxt */
1029 MemoryContextSwitchTo(gfbb->context);
1030 nodeBuffer->queuedForEmptying = true;
1031 gfbb->bufferEmptyingQueue =
1032 lcons(nodeBuffer, gfbb->bufferEmptyingQueue);
1033 MemoryContextSwitchTo(buildstate->giststate->tempCxt);
1035 gistProcessEmptyingQueue(buildstate);
/* Buffer is empty: drop it from this level's list */
1038 gfbb->buffersOnLevels[i] =
1039 list_delete_first(gfbb->buffersOnLevels[i]);
1041 elog(DEBUG2, "emptied all buffers at level %d", i);
1043 MemoryContextSwitchTo(oldCtx);
1047 * Get the depth of the GiST index.
1050 gistGetMaxLevel(Relation index)
1056 * Traverse down the tree, starting from the root, until we hit the leaf
1060 blkno = GIST_ROOT_BLKNO;
1067 buffer = ReadBuffer(index, blkno);
1070 * There's no concurrent access during index build, so locking is just
1073 LockBuffer(buffer, GIST_SHARE);
1074 page = (Page) BufferGetPage(buffer);
1076 if (GistPageIsLeaf(page))
1078 /* We hit the bottom, so we're done. */
1079 UnlockReleaseBuffer(buffer);
1084 * Pick the first downlink on the page, and follow it. It doesn't
1085 * matter which downlink we choose, the tree has the same depth
1086 * everywhere, so we just pick the first one.
1088 itup = (IndexTuple) PageGetItem(page,
1089 PageGetItemId(page, FirstOffsetNumber));
1090 blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
1091 UnlockReleaseBuffer(buffer);
1094 * We're going down on the tree. It means that there is yet one more
1095 * level in the tree.
1104 * Routines for managing the parent map.
1106 * Whenever a page is split, we need to insert the downlinks into the parent.
1107 * We need to somehow find the parent page to do that. In normal insertions,
1108 * we keep a stack of nodes visited when we descend the tree. However, in
1109 * buffering build, we can start descending the tree from any internal node,
1110 * when we empty a buffer by cascading tuples to its children. So we don't
1111 * have a full stack up to the root available at that time.
1113 * So instead, we maintain a hash table to track the parent of every internal
1114 * page. We don't need to track the parents of leaf nodes, however. Whenever
1115 * we insert to a leaf, we've just descended down from its parent, so we know
1116 * its immediate parent already. This helps a lot to limit the memory used
1117 * by this hash table.
1119 * Whenever an internal node is split, the parent map needs to be updated.
1120 * the parent of the new child page needs to be recorded, and also the
1121 * entries for all pages whose downlinks are moved to a new page at the split
1122 * need to be updated.
1124 * We also update the parent map whenever we descend the tree. That might seem
1125 * unnecessary, because we maintain the map whenever a downlink is moved or
1126 * created, but it is needed because we switch to buffering mode after
1127 * creating a tree with regular index inserts. Any pages created before
1128 * switching to buffering mode will not be present in the parent map initially,
1129 * but will be added there the first time we visit them.
1134 BlockNumber childblkno; /* hash key */
1135 BlockNumber parentblkno;
1139 gistInitParentMap(GISTBuildState *buildstate)
/* Key is the child block number, value carries its parent block number */
1143 hashCtl.keysize = sizeof(BlockNumber);
1144 hashCtl.entrysize = sizeof(ParentMapEntry);
1145 hashCtl.hcxt = CurrentMemoryContext;
/* BlockNumber is a 32-bit integer, so the Oid hash function fits */
1146 hashCtl.hash = oid_hash;
1147 buildstate->parentMap = hash_create("gistbuild parent map",
1150 HASH_ELEM | HASH_CONTEXT
1155 gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, BlockNumber parent)
1157 ParentMapEntry *entry;
/* Insert-or-update: record 'parent' as the current parent of 'child' */
1160 entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
1161 (const void *) &child,
1164 entry->parentblkno = parent;
1168 * Scan all downlinks on a page, and memorize their parent.
1171 gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parentbuf)
1173 OffsetNumber maxoff;
1175 BlockNumber parentblkno = BufferGetBlockNumber(parentbuf);
1176 Page page = BufferGetPage(parentbuf);
/* Only internal pages carry downlinks; a leaf here would be a caller bug */
1178 Assert(!GistPageIsLeaf(page));
1180 maxoff = PageGetMaxOffsetNumber(page);
1181 for (off = FirstOffsetNumber; off <= maxoff; off++)
1183 ItemId iid = PageGetItemId(page, off);
1184 IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
1185 BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
1187 gistMemorizeParent(buildstate, childblkno, parentblkno);
1192 gistGetParent(GISTBuildState *buildstate, BlockNumber child)
1194 ParentMapEntry *entry;
1197 /* Find node buffer in hash table */
1198 entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
1199 (const void *) &child,
/* NOTE(review): child is a BlockNumber (uint32); "%u" would match the
 * type, as used in gistBufferingFindCorrectParent's elog. */
1203 elog(ERROR, "could not find parent of block %d in lookup table", child);
1205 return entry->parentblkno;