1 /*-------------------------------------------------------------------------
4 * build algorithm for GiST indexes implementation.
7 * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
11 * src/backend/access/gist/gistbuild.c
13 *-------------------------------------------------------------------------
19 #include "access/genam.h"
20 #include "access/gist_private.h"
21 #include "catalog/index.h"
22 #include "miscadmin.h"
23 #include "optimizer/cost.h"
24 #include "storage/bufmgr.h"
25 #include "storage/smgr.h"
26 #include "utils/memutils.h"
27 #include "utils/rel.h"
29 /* Step of index tuples for check whether to switch to buffering build mode */
30 #define BUFFERING_MODE_SWITCH_CHECK_STEP 256
33 * Number of tuples to process in the slow way before switching to buffering
34 * mode, when buffering is explicitly turned on. Also, the number of tuples
35 * to process between readjusting the buffer size parameter, while in
38 #define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096
/*
 * States of the buffering build: the build starts in DISABLED, AUTO or STATS
 * depending on the "buffering" reloption, and may move to ACTIVE once
 * gistInitBuffering() succeeds (see gistBuildCallback).
 */
42 GIST_BUFFERING_DISABLED, /* in regular build mode and aren't going to
44 GIST_BUFFERING_AUTO, /* in regular build mode, but will switch to
45 * buffering build mode if the index grows too
47 GIST_BUFFERING_STATS, /* gathering statistics of index tuple size
48 * before switching to the buffering build
50 GIST_BUFFERING_ACTIVE /* in buffering build mode */
53 /* Working state for gistbuild and its callback */
/* Node-buffer bookkeeping; only valid while bufferingMode is ACTIVE. */
58 GISTBuildBuffers *gfbb;
60 int64 indtuples; /* number of tuples indexed */
61 int64 indtuplesSize; /* total size of all indexed tuples */
63 Size freespace; /* amount of free space to leave on pages */
/* Current phase of the build; see GistBufferingMode above. */
65 GistBufferingMode bufferingMode;
/* prototypes for local routines */
68 static void gistInitBuffering(GISTBuildState *buildstate);
69 static int calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep);
70 static void gistBuildCallback(Relation index,
76 static void gistBufferingBuildInsert(GISTBuildState *buildstate,
78 static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
79 GISTBufferingInsertStack *startparent);
80 static void gistbufferinginserttuples(GISTBuildState *buildstate,
82 IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
83 GISTBufferingInsertStack *path);
84 static void gistBufferingFindCorrectParent(GISTBuildState *buildstate,
85 GISTBufferingInsertStack *child);
86 static void gistProcessEmptyingQueue(GISTBuildState *buildstate);
87 static void gistEmptyAllBuffers(GISTBuildState *buildstate);
88 static void gistFreeUnreferencedPath(GISTBufferingInsertStack *path);
89 static int gistGetMaxLevel(Relation index);
93 * Main entry point to GiST index build. Initially calls insert over and over,
94 * but switches to more efficient buffering build algorithm after a certain
95 * number of tuples (unless buffering mode is disabled).
98 gistbuild(PG_FUNCTION_ARGS)
100 Relation heap = (Relation) PG_GETARG_POINTER(0);
101 Relation index = (Relation) PG_GETARG_POINTER(1);
102 IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
103 IndexBuildResult *result;
105 GISTBuildState buildstate;
108 MemoryContext oldcxt = CurrentMemoryContext;
111 buildstate.indexrel = index;
/* Choose initial buffering mode: the "buffering" reloption, if any, wins. */
112 if (index->rd_options)
114 /* Get buffering mode from the options string */
115 GiSTOptions *options = (GiSTOptions *) index->rd_options;
116 char *bufferingMode = (char *) options + options->bufferingModeOffset;
118 if (strcmp(bufferingMode, "on") == 0)
119 buildstate.bufferingMode = GIST_BUFFERING_STATS;
120 else if (strcmp(bufferingMode, "off") == 0)
121 buildstate.bufferingMode = GIST_BUFFERING_DISABLED;
123 buildstate.bufferingMode = GIST_BUFFERING_AUTO;
125 fillfactor = options->fillfactor;
130 * By default, switch to buffering mode when the index grows too large
133 buildstate.bufferingMode = GIST_BUFFERING_AUTO;
134 fillfactor = GIST_DEFAULT_FILLFACTOR;
136 /* Calculate target amount of free space to leave on pages */
137 buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100;
140 * We expect to be called exactly once for any index relation. If that's
141 * not the case, big trouble's what we have.
143 if (RelationGetNumberOfBlocks(index) != 0)
144 elog(ERROR, "index \"%s\" already contains data",
145 RelationGetRelationName(index));
148 * We can't yet handle unlogged GiST indexes, because we depend on LSNs.
149 * This is duplicative of an error in gistbuildempty, but we want to check
150 * here so as to throw error before doing all the index-build work.
152 if (heap->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED)
154 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
155 errmsg("unlogged GiST indexes are not supported")));
157 /* no locking is needed */
158 buildstate.giststate = initGISTstate(index);
161 * Create a temporary memory context that is reset once for each tuple
162 * processed. (Note: we don't bother to make this a child of the
163 * giststate's scanCxt, so we have to delete it separately at the end.)
165 buildstate.giststate->tempCxt = createTempGistContext();
167 /* initialize the root page */
168 buffer = gistNewBuffer(index);
169 Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
170 page = BufferGetPage(buffer);
172 START_CRIT_SECTION();
174 GISTInitBuffer(buffer, F_LEAF);
176 MarkBufferDirty(buffer);
/* WAL-log the creation of the root page; non-WAL relations get a fake LSN. */
178 if (RelationNeedsWAL(index))
183 rdata.data = (char *) &(index->rd_node);
184 rdata.len = sizeof(RelFileNode);
185 rdata.buffer = InvalidBuffer;
188 recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX, &rdata);
189 PageSetLSN(page, recptr);
190 PageSetTLI(page, ThisTimeLineID);
193 PageSetLSN(page, GetXLogRecPtrForTemp());
195 UnlockReleaseBuffer(buffer);
199 /* build the index */
200 buildstate.indtuples = 0;
201 buildstate.indtuplesSize = 0;
/* Scan the heap; gistBuildCallback inserts each heap tuple into the index. */
206 reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
207 gistBuildCallback, (void *) &buildstate);
210 * If buffering was used, flush out all the tuples that are still in the
213 if (buildstate.bufferingMode == GIST_BUFFERING_ACTIVE)
215 elog(DEBUG1, "all tuples processed, emptying buffers");
216 gistEmptyAllBuffers(&buildstate);
217 gistFreeBuildBuffers(buildstate.gfbb);
220 /* okay, all heap tuples are indexed */
221 MemoryContextSwitchTo(oldcxt);
222 MemoryContextDelete(buildstate.giststate->tempCxt);
224 freeGISTstate(buildstate.giststate);
229 result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
231 result->heap_tuples = reltuples;
232 result->index_tuples = (double) buildstate.indtuples;
234 PG_RETURN_POINTER(result);
238 * Validator for "buffering" reloption on GiST indexes. Allows "on", "off"
242 gistValidateBufferingOption(char *value)
/* Reject anything that is not exactly one of the three accepted keywords. */
245 (strcmp(value, "on") != 0 &&
246 strcmp(value, "off") != 0 &&
247 strcmp(value, "auto") != 0))
250 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
251 errmsg("invalid value for \"buffering\" option"),
252 errdetail("Valid values are \"on\", \"off\", and \"auto\".")));
257 * Attempt to switch to buffering mode.
259 * If there is not enough memory for buffering build, sets bufferingMode
260 * to GIST_BUFFERING_DISABLED, so that we don't bother to try the switch
261 * anymore. Otherwise initializes the build buffers, and sets bufferingMode to
262 * GIST_BUFFERING_ACTIVE.
265 gistInitBuffering(GISTBuildState *buildstate)
267 Relation index = buildstate->indexrel;
272 double avgIndexTuplesPerPage,
273 maxIndexTuplesPerPage;
277 /* Calc space of index page which is available for index tuples */
278 pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
280 - buildstate->freespace;
283 * Calculate average size of already inserted index tuples using gathered
286 itupAvgSize = (double) buildstate->indtuplesSize /
287 (double) buildstate->indtuples;
290 * Calculate minimal possible size of index tuple by index metadata.
291 * Minimal possible size of varlena is VARHDRSZ.
293 * XXX: that's not actually true, as a short varlen can be just 2 bytes.
294 * And we should take padding into account here.
296 itupMinSize = (Size) MAXALIGN(sizeof(IndexTupleData));
/* Sum per-column minimum widths: VARHDRSZ for varlena, attlen otherwise. */
297 for (i = 0; i < index->rd_att->natts; i++)
299 if (index->rd_att->attrs[i]->attlen < 0)
300 itupMinSize += VARHDRSZ;
302 itupMinSize += index->rd_att->attrs[i]->attlen;
305 /* Calculate average and maximal number of index tuples which fit to page */
306 avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
307 maxIndexTuplesPerPage = pageFreeSpace / itupMinSize;
310 * We need to calculate two parameters for the buffering algorithm:
311 * levelStep and pagesPerBuffer.
313 * levelStep determines the size of subtree that we operate on, while
314 * emptying a buffer. A higher value is better, as you need fewer buffer
315 * emptying steps to build the index. However, if you set it too high, the
316 * subtree doesn't fit in cache anymore, and you quickly lose the benefit
319 * In Arge et al's paper, levelStep is chosen as logB(M/4B), where B is
320 * the number of tuples on page (ie. fanout), and M is the amount of
321 * internal memory available. Curiously, they don't explain *why* that
322 * setting is optimal. We calculate it by taking the highest levelStep so
323 * that a subtree still fits in cache. For a small B, our way of
324 * calculating levelStep is very close to Arge et al's formula. For a
325 * large B, our formula gives a value that is 2x higher.
327 * The average size (in pages) of a subtree of depth n can be calculated
328 * as a geometric series:
330 * B^0 + B^1 + B^2 + ... + B^n = (1 - B^(n + 1)) / (1 - B)
332 * where B is the average number of index tuples on page. The subtree is
333 * cached in the shared buffer cache and the OS cache, so we choose
334 * levelStep so that the subtree size is comfortably smaller than
335 * effective_cache_size, with a safety factor of 4.
337 * The estimate on the average number of index tuples on page is based on
338 * average tuple sizes observed before switching to buffered build, so the
339 * real subtree size can be somewhat larger. Also, it would be selfish to
340 * gobble the whole cache for our index build. The safety factor of 4
341 * should account for those effects.
343 * The other limiting factor for setting levelStep is that while
344 * processing a subtree, we need to hold one page for each buffer at the
345 * next lower buffered level. The max. number of buffers needed for that
346 * is maxIndexTuplesPerPage^levelStep. This is very conservative, but
347 * hopefully maintenance_work_mem is set high enough that you're
348 * constrained by effective_cache_size rather than maintenance_work_mem.
350 * XXX: the buffer hash table consumes a fair amount of memory too per
351 * buffer, but that is not currently taken into account. That scales on
352 * the total number of buffers used, ie. the index size and on levelStep.
353 * Note that a higher levelStep *reduces* the amount of memory needed for
360 double maxlowestlevelpages;
362 /* size of an average subtree at this levelStep (in pages). */
364 (1 - pow(avgIndexTuplesPerPage, (double) (levelStep + 1))) /
365 (1 - avgIndexTuplesPerPage);
367 /* max number of pages at the lowest level of a subtree */
368 maxlowestlevelpages = pow(maxIndexTuplesPerPage, (double) levelStep);
370 /* subtree must fit in cache (with safety factor of 4) */
371 if (subtreesize > effective_cache_size / 4)
374 /* each node in the lowest level of a subtree has one page in memory */
375 if (maxlowestlevelpages > ((double) maintenance_work_mem * 1024) / BLCKSZ)
378 /* Good, we can handle this levelStep. See if we can go one higher. */
383 * We just reached an unacceptable value of levelStep in previous loop.
384 * So, decrease levelStep to get last acceptable value.
389 * If there's not enough cache or maintenance_work_mem, fall back to plain
394 elog(DEBUG1, "failed to switch to buffered GiST build");
395 buildstate->bufferingMode = GIST_BUFFERING_DISABLED;
400 * The second parameter to set is pagesPerBuffer, which determines the
401 * size of each buffer. We adjust pagesPerBuffer also during the build,
402 * which is why this calculation is in a separate function.
404 pagesPerBuffer = calculatePagesPerBuffer(buildstate, levelStep);
406 /* Initialize GISTBuildBuffers with these parameters */
407 buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep,
408 gistGetMaxLevel(index));
410 buildstate->bufferingMode = GIST_BUFFERING_ACTIVE;
412 elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d",
413 levelStep, pagesPerBuffer);
417 * Calculate pagesPerBuffer parameter for the buffering algorithm.
419 * Buffer size is chosen so that assuming that tuples are distributed
420 * randomly, emptying half a buffer fills on average one page in every buffer
421 * at the next lower level.
424 calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep)
426 double pagesPerBuffer;
427 double avgIndexTuplesPerPage;
431 /* Calc space of index page which is available for index tuples */
432 pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
434 - buildstate->freespace;
437 * Calculate average size of already inserted index tuples using gathered
440 itupAvgSize = (double) buildstate->indtuplesSize /
441 (double) buildstate->indtuples;
443 avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
446 * Recalculate required size of buffers.
/* fanout^levelStep pages below; emptying half a buffer covers them once */
448 pagesPerBuffer = 2 * pow(avgIndexTuplesPerPage, levelStep);
450 return (int) rint(pagesPerBuffer);
454 * Per-tuple callback from IndexBuildHeapScan.
457 gistBuildCallback(Relation index,
464 GISTBuildState *buildstate = (GISTBuildState *) state;
466 MemoryContext oldCtx;
468 oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
470 /* form an index tuple and point it at the heap tuple */
471 itup = gistFormTuple(buildstate->giststate, index, values, isnull, true);
472 itup->t_tid = htup->t_self;
474 if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE)
476 /* We have buffers, so use them. */
477 gistBufferingBuildInsert(buildstate, itup);
482 * There's no buffers (yet). Since we already have the index relation
483 * locked, we call gistdoinsert directly.
485 gistdoinsert(index, itup, buildstate->freespace,
486 buildstate->giststate);
489 /* Update tuple count and total size. */
490 buildstate->indtuples += 1;
491 buildstate->indtuplesSize += IndexTupleSize(itup);
493 MemoryContextSwitchTo(oldCtx);
494 MemoryContextReset(buildstate->giststate->tempCxt);
/* Periodically re-derive the buffer size from the updated tuple statistics. */
496 if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE &&
497 buildstate->indtuples % BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET == 0)
499 /* Adjust the target buffer size now */
500 buildstate->gfbb->pagesPerBuffer =
501 calculatePagesPerBuffer(buildstate, buildstate->gfbb->levelStep);
505 * In 'auto' mode, check if the index has grown too large to fit in cache,
506 * and switch to buffering mode if it has.
508 * To avoid excessive calls to smgrnblocks(), only check this every
509 * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples
511 if ((buildstate->bufferingMode == GIST_BUFFERING_AUTO &&
512 buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 &&
513 effective_cache_size < smgrnblocks(index->rd_smgr, MAIN_FORKNUM)) ||
514 (buildstate->bufferingMode == GIST_BUFFERING_STATS &&
515 buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET))
518 * Index doesn't fit in effective cache anymore. Try to switch to
519 * buffering build mode.
521 gistInitBuffering(buildstate);
526 * Insert function for buffering index build.
529 gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup)
531 /* Insert the tuple to buffers. */
532 gistProcessItup(buildstate, itup, NULL);
534 /* If we filled up (half of a) buffer, process buffer emptying. */
535 gistProcessEmptyingQueue(buildstate);
539 * Process an index tuple. Runs the tuple down the tree until we reach a leaf
540 * page or node buffer, and inserts the tuple there. Returns true if we have
541 * to stop buffer emptying process (because one of child buffers can't take
542 * index tuples anymore).
545 gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
546 GISTBufferingInsertStack *startparent)
548 GISTSTATE *giststate = buildstate->giststate;
549 GISTBuildBuffers *gfbb = buildstate->gfbb;
550 Relation indexrel = buildstate->indexrel;
551 GISTBufferingInsertStack *path;
552 BlockNumber childblkno;
557 * NULL passed in startparent means that we start index tuple processing
561 path = gfbb->rootitem;
566 * Loop until we reach a leaf page (level == 0) or a level with buffers
567 * (not including the level we start at, because we would otherwise make
576 OffsetNumber childoffnum;
577 GISTBufferingInsertStack *parent;
579 /* Have we reached a level with buffers? */
580 if (LEVEL_HAS_BUFFERS(path->level, gfbb) && path != startparent)
583 /* Have we reached a leaf page? */
584 if (path->level == 0)
588 * Nope. Descend down to the next level then. Choose a child to
591 buffer = ReadBuffer(indexrel, path->blkno);
592 LockBuffer(buffer, GIST_EXCLUSIVE);
594 page = (Page) BufferGetPage(buffer);
/* Let the opclass pick the child with least penalty for this tuple. */
595 childoffnum = gistchoose(indexrel, page, itup, giststate);
596 iid = PageGetItemId(page, childoffnum);
597 idxtuple = (IndexTuple) PageGetItem(page, iid);
598 childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
601 * Check that the key representing the target child node is consistent
602 * with the key we're inserting. Update it if it's not.
604 newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate);
606 gistbufferinginserttuples(buildstate, buffer, &newtup, 1,
608 UnlockReleaseBuffer(buffer);
610 /* Create new path item representing current page */
612 path = (GISTBufferingInsertStack *) MemoryContextAlloc(gfbb->context,
613 sizeof(GISTBufferingInsertStack));
614 path->parent = parent;
615 path->level = parent->level - 1;
616 path->blkno = childblkno;
617 path->downlinkoffnum = childoffnum;
618 path->refCount = 0; /* it's unreferenced for now */
620 /* Adjust reference count of parent */
625 if (LEVEL_HAS_BUFFERS(path->level, gfbb))
628 * We've reached level with buffers. Place the index tuple to the
629 * buffer, and add the buffer to the emptying queue if it overflows.
631 GISTNodeBuffer *childNodeBuffer;
633 /* Find the buffer or create a new one */
634 childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, path->blkno,
635 path->downlinkoffnum, path->parent);
637 /* Add index tuple to it */
638 gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup);
640 if (BUFFER_OVERFLOWED(childNodeBuffer, gfbb))
646 * We've reached a leaf page. Place the tuple here.
648 buffer = ReadBuffer(indexrel, path->blkno);
649 LockBuffer(buffer, GIST_EXCLUSIVE);
650 gistbufferinginserttuples(buildstate, buffer, &itup, 1,
651 InvalidOffsetNumber, path);
652 UnlockReleaseBuffer(buffer);
656 * Free unreferenced path items, if any. Path item may be referenced by
659 gistFreeUnreferencedPath(path);
665 * Insert tuples to a given page.
667 * This is analogous with gistinserttuples() in the regular insertion code.
670 gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer,
671 IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
672 GISTBufferingInsertStack *path)
674 GISTBuildBuffers *gfbb = buildstate->gfbb;
678 is_split = gistplacetopage(buildstate->indexrel,
679 buildstate->freespace,
680 buildstate->giststate,
682 itup, ntup, oldoffnum,
688 * If this is a root split, update the root path item kept in memory. This
689 * ensures that all path stacks are always complete, including all parent
690 * nodes up to the root. That simplifies the algorithm to re-find correct
693 if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO)
695 GISTBufferingInsertStack *oldroot = gfbb->rootitem;
696 Page page = BufferGetPage(buffer);
699 BlockNumber leftmostchild;
/* Allocate a fresh root item, one level above the old root. */
701 gfbb->rootitem = (GISTBufferingInsertStack *) MemoryContextAlloc(
702 gfbb->context, sizeof(GISTBufferingInsertStack));
703 gfbb->rootitem->parent = NULL;
704 gfbb->rootitem->blkno = GIST_ROOT_BLKNO;
705 gfbb->rootitem->downlinkoffnum = InvalidOffsetNumber;
706 gfbb->rootitem->level = oldroot->level + 1;
707 gfbb->rootitem->refCount = 1;
710 * All the downlinks on the old root page are now on one of the child
711 * pages. Change the block number of the old root entry in the stack
712 * to point to the leftmost child. The other child pages will be
713 * accessible from there by walking right.
715 iid = PageGetItemId(page, FirstOffsetNumber);
716 idxtuple = (IndexTuple) PageGetItem(page, iid);
717 leftmostchild = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
719 oldroot->parent = gfbb->rootitem;
720 oldroot->blkno = leftmostchild;
721 oldroot->downlinkoffnum = InvalidOffsetNumber;
727 * Insert the downlinks to the parent. This is analogous with
728 * gistfinishsplit() in the regular insertion code, but the locking is
729 * simpler, and we have to maintain the buffers.
731 IndexTuple *downlinks;
737 /* Parent may have changed since we memorized this path. */
738 gistBufferingFindCorrectParent(buildstate, path);
741 * If there's a buffer associated with this page, that needs to be
742 * split too. gistRelocateBuildBuffersOnSplit() will also adjust the
743 * downlinks in 'splitinfo', to make sure they're consistent not only
744 * with the tuples already on the pages, but also the tuples in the
745 * buffers that will eventually be inserted to them.
747 gistRelocateBuildBuffersOnSplit(gfbb,
748 buildstate->giststate,
749 buildstate->indexrel,
750 path, buffer, splitinfo);
752 /* Create an array of all the downlink tuples */
753 ndownlinks = list_length(splitinfo);
754 downlinks = (IndexTuple *) palloc(sizeof(IndexTuple) * ndownlinks);
756 foreach(lc, splitinfo)
758 GISTPageSplitInfo *splitinfo = lfirst(lc);
761 * Since there's no concurrent access, we can release the lower
762 * level buffers immediately. Don't release the buffer for the
763 * original page, though, because the caller will release that.
765 if (splitinfo->buf != buffer)
766 UnlockReleaseBuffer(splitinfo->buf);
767 downlinks[i++] = splitinfo->downlink;
770 /* Insert them into parent. */
/* NOTE: recursive call -- a parent split propagates upward the same way. */
771 parentBuffer = ReadBuffer(buildstate->indexrel, path->parent->blkno);
772 LockBuffer(parentBuffer, GIST_EXCLUSIVE);
773 gistbufferinginserttuples(buildstate, parentBuffer,
774 downlinks, ndownlinks,
775 path->downlinkoffnum, path->parent);
776 UnlockReleaseBuffer(parentBuffer);
778 list_free_deep(splitinfo); /* we don't need this anymore */
783 * Find correct parent by following rightlinks in buffering index build. This
784 * method of parent searching is possible because no concurrent activity is
785 * possible while the index builds.
788 gistBufferingFindCorrectParent(GISTBuildState *buildstate,
789 GISTBufferingInsertStack *child)
791 GISTBuildBuffers *gfbb = buildstate->gfbb;
792 Relation indexrel = buildstate->indexrel;
793 GISTBufferingInsertStack *parent = child->parent;
802 buffer = ReadBuffer(indexrel, parent->blkno);
803 page = BufferGetPage(buffer);
804 LockBuffer(buffer, GIST_EXCLUSIVE);
805 gistcheckpage(indexrel, buffer);
807 /* Check if it was not moved */
/* Fast path: memorized downlink offset still points at the child page. */
808 if (child->downlinkoffnum != InvalidOffsetNumber &&
809 child->downlinkoffnum <= PageGetMaxOffsetNumber(page))
811 iid = PageGetItemId(page, child->downlinkoffnum);
812 idxtuple = (IndexTuple) PageGetItem(page, iid);
813 if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno)
816 UnlockReleaseBuffer(buffer);
821 /* parent has changed; look for the child in right links until found */
824 /* Search for relevant downlink in the current page */
825 maxoff = PageGetMaxOffsetNumber(page);
826 for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
828 iid = PageGetItemId(page, i);
829 idxtuple = (IndexTuple) PageGetItem(page, iid);
830 if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno)
833 child->downlinkoffnum = i;
834 UnlockReleaseBuffer(buffer);
840 * We should copy parent path item because some other path items can
845 parent = (GISTBufferingInsertStack *) MemoryContextAlloc(gfbb->context,
846 sizeof(GISTBufferingInsertStack));
847 memcpy(parent, child->parent, sizeof(GISTBufferingInsertStack));
/* Re-point the child at the copy and fix both reference counts. */
849 parent->parent->refCount++;
850 gistDecreasePathRefcount(child->parent);
851 child->parent = parent;
852 parent->refCount = 1;
857 * Not found in current page. Move towards rightlink.
859 parent->blkno = GistPageGetOpaque(page)->rightlink;
860 UnlockReleaseBuffer(buffer);
862 if (parent->blkno == InvalidBlockNumber)
865 * End of chain and still didn't find parent. Should not happen
866 * during index build.
871 /* Get the next page */
872 buffer = ReadBuffer(indexrel, parent->blkno);
873 page = BufferGetPage(buffer);
874 LockBuffer(buffer, GIST_EXCLUSIVE);
875 gistcheckpage(indexrel, buffer);
878 elog(ERROR, "failed to re-find parent for block %u", child->blkno);
882 * Process buffers emptying stack. Emptying of one buffer can cause emptying
883 * of other buffers. This function iterates until this cascading emptying
884 * process is finished, i.e. until the buffers emptying stack is empty.
887 gistProcessEmptyingQueue(GISTBuildState *buildstate)
889 GISTBuildBuffers *gfbb = buildstate->gfbb;
891 /* Iterate while we have elements in buffers emptying stack. */
892 while (gfbb->bufferEmptyingQueue != NIL)
894 GISTNodeBuffer *emptyingNodeBuffer;
896 /* Get node buffer from emptying stack. */
897 emptyingNodeBuffer = (GISTNodeBuffer *) linitial(gfbb->bufferEmptyingQueue);
898 gfbb->bufferEmptyingQueue = list_delete_first(gfbb->bufferEmptyingQueue);
899 emptyingNodeBuffer->queuedForEmptying = false;
902 * We are going to load last pages of buffers where emptying will be
903 * to. So let's unload any previously loaded buffers.
905 gistUnloadNodeBuffers(gfbb);
908 * Pop tuples from the buffer and run them down to the buffers at
909 * lower level, or leaf pages. We continue until one of the lower
910 * level buffers fills up, or this buffer runs empty.
912 * In Arge et al's paper, the buffer emptying is stopped after
913 * processing 1/2 node buffer worth of tuples, to avoid overfilling
914 * any of the lower level buffers. However, it's more efficient to
915 * keep going until one of the lower level buffers actually fills up,
916 * so that's what we do. This doesn't need to be exact, if a buffer
917 * overfills by a few tuples, there's no harm done.
923 /* Get next index tuple from the buffer */
924 if (!gistPopItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup))
928 * Run it down to the underlying node buffer or leaf page.
930 * Note: it's possible that the buffer we're emptying splits as a
931 * result of this call. If that happens, our emptyingNodeBuffer
932 * points to the left half of the split. After split, it's very
933 * likely that the new left buffer is no longer over the half-full
934 * threshold, but we might as well keep flushing tuples from it
935 * until we fill a lower-level buffer.
937 if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->path))
940 * A lower level buffer filled up. Stop emptying this buffer,
941 * to avoid overflowing the lower level buffer.
946 /* Free all the memory allocated during index tuple processing */
947 MemoryContextReset(buildstate->giststate->tempCxt);
953 * Empty all node buffers, from top to bottom. This is done at the end of
954 * index build to flush all remaining tuples to the index.
956 * Note: This destroys the buffersOnLevels lists, so the buffers should not
957 * be inserted to after this call.
960 gistEmptyAllBuffers(GISTBuildState *buildstate)
962 GISTBuildBuffers *gfbb = buildstate->gfbb;
963 MemoryContext oldCtx;
966 oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
969 * Iterate through the levels from top to bottom.
971 for (i = gfbb->buffersOnLevelsLen - 1; i >= 0; i--)
974 * Empty all buffers on this level. Note that new buffers can pop up
975 * in the list during the processing, as a result of page splits, so a
976 * simple walk through the list won't work. We remove buffers from the
977 * list when we see them empty; a buffer can't become non-empty once
978 * it's been fully emptied.
980 while (gfbb->buffersOnLevels[i] != NIL)
982 GISTNodeBuffer *nodeBuffer;
984 nodeBuffer = (GISTNodeBuffer *) linitial(gfbb->buffersOnLevels[i]);
986 if (nodeBuffer->blocksCount != 0)
989 * Add this buffer to the emptying queue, and proceed to empty
992 if (!nodeBuffer->queuedForEmptying)
/* Queue cells must outlive tempCxt resets, so switch to gfbb->context. */
994 MemoryContextSwitchTo(gfbb->context);
995 nodeBuffer->queuedForEmptying = true;
996 gfbb->bufferEmptyingQueue =
997 lcons(nodeBuffer, gfbb->bufferEmptyingQueue);
998 MemoryContextSwitchTo(buildstate->giststate->tempCxt);
1000 gistProcessEmptyingQueue(buildstate);
1003 gfbb->buffersOnLevels[i] =
1004 list_delete_first(gfbb->buffersOnLevels[i]);
1007 MemoryContextSwitchTo(oldCtx);
1011 * Free unreferenced parts of a path stack.
1014 gistFreeUnreferencedPath(GISTBufferingInsertStack *path)
1016 while (path->refCount == 0)
1019 * Path part is unreferenced. We can free it and decrease reference
1020 * count of parent. If parent becomes unreferenced too, the procedure
1021 * should be repeated for it.
1023 GISTBufferingInsertStack *tmp = path->parent;
1035 * Decrease reference count of a path part, and free any unreferenced parts of
1039 gistDecreasePathRefcount(GISTBufferingInsertStack *path)
1042 gistFreeUnreferencedPath(path);
1046 * Get the depth of the GiST index.
1049 gistGetMaxLevel(Relation index)
1055 * Traverse down the tree, starting from the root, until we hit the leaf
1059 blkno = GIST_ROOT_BLKNO;
1066 buffer = ReadBuffer(index, blkno);
1069 * There's no concurrent access during index build, so locking is just
1072 LockBuffer(buffer, GIST_SHARE);
1073 page = (Page) BufferGetPage(buffer);
1075 if (GistPageIsLeaf(page))
1077 /* We hit the bottom, so we're done. */
1078 UnlockReleaseBuffer(buffer);
1083 * Pick the first downlink on the page, and follow it. It doesn't
1084 * matter which downlink we choose, the tree has the same depth
1085 * everywhere, so we just pick the first one.
1087 itup = (IndexTuple) PageGetItem(page,
1088 PageGetItemId(page, FirstOffsetNumber));
1089 blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
1090 UnlockReleaseBuffer(buffer);
1093 * We're going down on the tree. It means that there is yet one more
1094 * level is the tree.