]> granicus.if.org Git - postgresql/blob - src/backend/access/gist/gistbuild.c
Support parallel btree index builds.
[postgresql] / src / backend / access / gist / gistbuild.c
1 /*-------------------------------------------------------------------------
2  *
3  * gistbuild.c
4  *        build algorithm for GiST indexes implementation.
5  *
6  *
7  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * IDENTIFICATION
11  *        src/backend/access/gist/gistbuild.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16
17 #include <math.h>
18
19 #include "access/genam.h"
20 #include "access/gist_private.h"
21 #include "access/gistxlog.h"
22 #include "access/xloginsert.h"
23 #include "catalog/index.h"
24 #include "miscadmin.h"
25 #include "optimizer/cost.h"
26 #include "storage/bufmgr.h"
27 #include "storage/smgr.h"
28 #include "utils/memutils.h"
29 #include "utils/rel.h"
30
31 /* Step of index tuples for check whether to switch to buffering build mode */
32 #define BUFFERING_MODE_SWITCH_CHECK_STEP 256
33
34 /*
35  * Number of tuples to process in the slow way before switching to buffering
36  * mode, when buffering is explicitly turned on. Also, the number of tuples
37  * to process between readjusting the buffer size parameter, while in
38  * buffering mode.
39  */
40 #define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096
41
/*
 * Modes a GiST index build can be in.  A build starts in one of the first
 * three modes (chosen in gistbuild() from the "buffering" reloption) and
 * may later transition to GIST_BUFFERING_ACTIVE via gistInitBuffering().
 */
typedef enum
{
	GIST_BUFFERING_DISABLED,	/* in regular build mode and aren't going to
								 * switch */
	GIST_BUFFERING_AUTO,		/* in regular build mode, but will switch to
								 * buffering build mode if the index grows too
								 * big */
	GIST_BUFFERING_STATS,		/* gathering statistics of index tuple size
								 * before switching to the buffering build
								 * mode */
	GIST_BUFFERING_ACTIVE		/* in buffering build mode */
} GistBufferingMode;
54
/* Working state for gistbuild and its callback */
typedef struct
{
	Relation	indexrel;		/* index relation being built */
	GISTSTATE  *giststate;		/* per-index GiST state, from initGISTstate() */

	int64		indtuples;		/* number of tuples indexed */
	int64		indtuplesSize;	/* total size of all indexed tuples */

	Size		freespace;		/* amount of free space to leave on pages */

	/*
	 * Extra data structures used during a buffering build. 'gfbb' contains
	 * information related to managing the build buffers. 'parentMap' is a
	 * lookup table of the parent of each internal page.
	 */
	GISTBuildBuffers *gfbb;
	HTAB	   *parentMap;

	GistBufferingMode bufferingMode;	/* current build mode */
} GISTBuildState;
76
77 /* prototypes for private functions */
78 static void gistInitBuffering(GISTBuildState *buildstate);
79 static int      calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep);
80 static void gistBuildCallback(Relation index,
81                                   HeapTuple htup,
82                                   Datum *values,
83                                   bool *isnull,
84                                   bool tupleIsAlive,
85                                   void *state);
86 static void gistBufferingBuildInsert(GISTBuildState *buildstate,
87                                                  IndexTuple itup);
88 static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
89                                 BlockNumber startblkno, int startlevel);
90 static BlockNumber gistbufferinginserttuples(GISTBuildState *buildstate,
91                                                   Buffer buffer, int level,
92                                                   IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
93                                                   BlockNumber parentblk, OffsetNumber downlinkoffnum);
94 static Buffer gistBufferingFindCorrectParent(GISTBuildState *buildstate,
95                                                            BlockNumber childblkno, int level,
96                                                            BlockNumber *parentblk,
97                                                            OffsetNumber *downlinkoffnum);
98 static void gistProcessEmptyingQueue(GISTBuildState *buildstate);
99 static void gistEmptyAllBuffers(GISTBuildState *buildstate);
100 static int      gistGetMaxLevel(Relation index);
101
102 static void gistInitParentMap(GISTBuildState *buildstate);
103 static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child,
104                                    BlockNumber parent);
105 static void gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parent);
106 static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child);
107
/*
 * Main entry point to GiST index build. Initially calls insert over and over,
 * but switches to more efficient buffering build algorithm after a certain
 * number of tuples (unless buffering mode is disabled).
 */
IndexBuildResult *
gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
{
	IndexBuildResult *result;
	double		reltuples;
	GISTBuildState buildstate;
	Buffer		buffer;
	Page		page;
	MemoryContext oldcxt = CurrentMemoryContext;
	int			fillfactor;

	buildstate.indexrel = index;
	if (index->rd_options)
	{
		/* Get buffering mode from the options string */
		GiSTOptions *options = (GiSTOptions *) index->rd_options;
		char	   *bufferingMode = (char *) options + options->bufferingModeOffset;

		/*
		 * "on" begins in STATS mode rather than ACTIVE: we must observe some
		 * tuples first to estimate tuple sizes before the buffers can be
		 * sized (see gistInitBuffering()).
		 */
		if (strcmp(bufferingMode, "on") == 0)
			buildstate.bufferingMode = GIST_BUFFERING_STATS;
		else if (strcmp(bufferingMode, "off") == 0)
			buildstate.bufferingMode = GIST_BUFFERING_DISABLED;
		else
			buildstate.bufferingMode = GIST_BUFFERING_AUTO;

		fillfactor = options->fillfactor;
	}
	else
	{
		/*
		 * By default, switch to buffering mode when the index grows too large
		 * to fit in cache.
		 */
		buildstate.bufferingMode = GIST_BUFFERING_AUTO;
		fillfactor = GIST_DEFAULT_FILLFACTOR;
	}
	/* Calculate target amount of free space to leave on pages */
	buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100;

	/*
	 * We expect to be called exactly once for any index relation. If that's
	 * not the case, big trouble's what we have.
	 */
	if (RelationGetNumberOfBlocks(index) != 0)
		elog(ERROR, "index \"%s\" already contains data",
			 RelationGetRelationName(index));

	/* no locking is needed */
	buildstate.giststate = initGISTstate(index);

	/*
	 * Create a temporary memory context that is reset once for each tuple
	 * processed.  (Note: we don't bother to make this a child of the
	 * giststate's scanCxt, so we have to delete it separately at the end.)
	 */
	buildstate.giststate->tempCxt = createTempGistContext();

	/* initialize the root page */
	buffer = gistNewBuffer(index);
	Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
	page = BufferGetPage(buffer);

	/* Page initialization and its WAL record must go out atomically */
	START_CRIT_SECTION();

	GISTInitBuffer(buffer, F_LEAF);

	MarkBufferDirty(buffer);

	if (RelationNeedsWAL(index))
	{
		XLogRecPtr	recptr;

		XLogBeginInsert();
		XLogRegisterBuffer(0, buffer, REGBUF_WILL_INIT);

		recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX);
		PageSetLSN(page, recptr);
	}
	else
	{
		/* relation doesn't need WAL: stamp the page with a fake LSN */
		PageSetLSN(page, gistGetFakeLSN(heap));
	}

	UnlockReleaseBuffer(buffer);

	END_CRIT_SECTION();

	/* build the index */
	buildstate.indtuples = 0;
	buildstate.indtuplesSize = 0;

	/*
	 * Do the heap scan.
	 */
	reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
								   gistBuildCallback, (void *) &buildstate, NULL);

	/*
	 * If buffering was used, flush out all the tuples that are still in the
	 * buffers.
	 */
	if (buildstate.bufferingMode == GIST_BUFFERING_ACTIVE)
	{
		elog(DEBUG1, "all tuples processed, emptying buffers");
		gistEmptyAllBuffers(&buildstate);
		gistFreeBuildBuffers(buildstate.gfbb);
	}

	/* okay, all heap tuples are indexed */
	MemoryContextSwitchTo(oldcxt);
	MemoryContextDelete(buildstate.giststate->tempCxt);

	freeGISTstate(buildstate.giststate);

	/*
	 * Return statistics
	 */
	result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

	result->heap_tuples = reltuples;
	result->index_tuples = (double) buildstate.indtuples;

	return result;
}
235
236 /*
237  * Validator for "buffering" reloption on GiST indexes. Allows "on", "off"
238  * and "auto" values.
239  */
240 void
241 gistValidateBufferingOption(const char *value)
242 {
243         if (value == NULL ||
244                 (strcmp(value, "on") != 0 &&
245                  strcmp(value, "off") != 0 &&
246                  strcmp(value, "auto") != 0))
247         {
248                 ereport(ERROR,
249                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
250                                  errmsg("invalid value for \"buffering\" option"),
251                                  errdetail("Valid values are \"on\", \"off\", and \"auto\".")));
252         }
253 }
254
/*
 * Attempt to switch to buffering mode.
 *
 * If there is not enough memory for buffering build, sets bufferingMode
 * to GIST_BUFFERING_DISABLED, so that we don't bother to try the switch
 * anymore. Otherwise initializes the build buffers, and sets bufferingMode to
 * GIST_BUFFERING_ACTIVE.
 */
static void
gistInitBuffering(GISTBuildState *buildstate)
{
	Relation	index = buildstate->indexrel;
	int			pagesPerBuffer;
	Size		pageFreeSpace;
	Size		itupAvgSize,
				itupMinSize;
	double		avgIndexTuplesPerPage,
				maxIndexTuplesPerPage;
	int			i;
	int			levelStep;

	/* Calc space of index page which is available for index tuples */
	pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
		- sizeof(ItemIdData)
		- buildstate->freespace;

	/*
	 * Calculate average size of already inserted index tuples using gathered
	 * statistics.  (The caller only invokes us after at least
	 * BUFFERING_MODE_SWITCH_CHECK_STEP tuples have been indexed, so
	 * indtuples is nonzero here.)
	 */
	itupAvgSize = (double) buildstate->indtuplesSize /
		(double) buildstate->indtuples;

	/*
	 * Calculate minimal possible size of index tuple by index metadata.
	 * Minimal possible size of varlena is VARHDRSZ.
	 *
	 * XXX: that's not actually true, as a short varlen can be just 2 bytes.
	 * And we should take padding into account here.
	 */
	itupMinSize = (Size) MAXALIGN(sizeof(IndexTupleData));
	for (i = 0; i < index->rd_att->natts; i++)
	{
		if (TupleDescAttr(index->rd_att, i)->attlen < 0)
			itupMinSize += VARHDRSZ;
		else
			itupMinSize += TupleDescAttr(index->rd_att, i)->attlen;
	}

	/* Calculate average and maximal number of index tuples which fit to page */
	avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
	maxIndexTuplesPerPage = pageFreeSpace / itupMinSize;

	/*
	 * We need to calculate two parameters for the buffering algorithm:
	 * levelStep and pagesPerBuffer.
	 *
	 * levelStep determines the size of subtree that we operate on, while
	 * emptying a buffer. A higher value is better, as you need fewer buffer
	 * emptying steps to build the index. However, if you set it too high, the
	 * subtree doesn't fit in cache anymore, and you quickly lose the benefit
	 * of the buffers.
	 *
	 * In Arge et al's paper, levelStep is chosen as logB(M/4B), where B is
	 * the number of tuples on page (ie. fanout), and M is the amount of
	 * internal memory available. Curiously, they don't explain *why* that
	 * setting is optimal. We calculate it by taking the highest levelStep so
	 * that a subtree still fits in cache. For a small B, our way of
	 * calculating levelStep is very close to Arge et al's formula. For a
	 * large B, our formula gives a value that is 2x higher.
	 *
	 * The average size (in pages) of a subtree of depth n can be calculated
	 * as a geometric series:
	 *
	 * B^0 + B^1 + B^2 + ... + B^n = (1 - B^(n + 1)) / (1 - B)
	 *
	 * where B is the average number of index tuples on page. The subtree is
	 * cached in the shared buffer cache and the OS cache, so we choose
	 * levelStep so that the subtree size is comfortably smaller than
	 * effective_cache_size, with a safety factor of 4.
	 *
	 * The estimate on the average number of index tuples on page is based on
	 * average tuple sizes observed before switching to buffered build, so the
	 * real subtree size can be somewhat larger. Also, it would be selfish to
	 * gobble the whole cache for our index build. The safety factor of 4
	 * should account for those effects.
	 *
	 * The other limiting factor for setting levelStep is that while
	 * processing a subtree, we need to hold one page for each buffer at the
	 * next lower buffered level. The max. number of buffers needed for that
	 * is maxIndexTuplesPerPage^levelStep. This is very conservative, but
	 * hopefully maintenance_work_mem is set high enough that you're
	 * constrained by effective_cache_size rather than maintenance_work_mem.
	 *
	 * XXX: the buffer hash table consumes a fair amount of memory too per
	 * buffer, but that is not currently taken into account. That scales on
	 * the total number of buffers used, ie. the index size and on levelStep.
	 * Note that a higher levelStep *reduces* the amount of memory needed for
	 * the hash table.
	 */
	levelStep = 1;
	for (;;)
	{
		double		subtreesize;
		double		maxlowestlevelpages;

		/* size of an average subtree at this levelStep (in pages). */
		subtreesize =
			(1 - pow(avgIndexTuplesPerPage, (double) (levelStep + 1))) /
			(1 - avgIndexTuplesPerPage);

		/* max number of pages at the lowest level of a subtree */
		maxlowestlevelpages = pow(maxIndexTuplesPerPage, (double) levelStep);

		/* subtree must fit in cache (with safety factor of 4) */
		if (subtreesize > effective_cache_size / 4)
			break;

		/* each node in the lowest level of a subtree has one page in memory */
		if (maxlowestlevelpages > ((double) maintenance_work_mem * 1024) / BLCKSZ)
			break;

		/* Good, we can handle this levelStep. See if we can go one higher. */
		levelStep++;
	}

	/*
	 * We just reached an unacceptable value of levelStep in previous loop.
	 * So, decrease levelStep to get last acceptable value.
	 */
	levelStep--;

	/*
	 * If there's not enough cache or maintenance_work_mem, fall back to plain
	 * inserts.
	 */
	if (levelStep <= 0)
	{
		elog(DEBUG1, "failed to switch to buffered GiST build");
		buildstate->bufferingMode = GIST_BUFFERING_DISABLED;
		return;
	}

	/*
	 * The second parameter to set is pagesPerBuffer, which determines the
	 * size of each buffer. We adjust pagesPerBuffer also during the build,
	 * which is why this calculation is in a separate function.
	 */
	pagesPerBuffer = calculatePagesPerBuffer(buildstate, levelStep);

	/* Initialize GISTBuildBuffers with these parameters */
	buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep,
											gistGetMaxLevel(index));

	gistInitParentMap(buildstate);

	buildstate->bufferingMode = GIST_BUFFERING_ACTIVE;

	elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d",
		 levelStep, pagesPerBuffer);
}
416
417 /*
418  * Calculate pagesPerBuffer parameter for the buffering algorithm.
419  *
420  * Buffer size is chosen so that assuming that tuples are distributed
421  * randomly, emptying half a buffer fills on average one page in every buffer
422  * at the next lower level.
423  */
424 static int
425 calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep)
426 {
427         double          pagesPerBuffer;
428         double          avgIndexTuplesPerPage;
429         double          itupAvgSize;
430         Size            pageFreeSpace;
431
432         /* Calc space of index page which is available for index tuples */
433         pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
434                 - sizeof(ItemIdData)
435                 - buildstate->freespace;
436
437         /*
438          * Calculate average size of already inserted index tuples using gathered
439          * statistics.
440          */
441         itupAvgSize = (double) buildstate->indtuplesSize /
442                 (double) buildstate->indtuples;
443
444         avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
445
446         /*
447          * Recalculate required size of buffers.
448          */
449         pagesPerBuffer = 2 * pow(avgIndexTuplesPerPage, levelStep);
450
451         return (int) rint(pagesPerBuffer);
452 }
453
/*
 * Per-tuple callback from IndexBuildHeapScan.
 *
 * Forms an index tuple for the heap tuple and routes it either through the
 * build buffers (buffering mode) or directly into the tree, then updates the
 * running statistics and, if appropriate, switches the build mode.
 */
static void
gistBuildCallback(Relation index,
				  HeapTuple htup,
				  Datum *values,
				  bool *isnull,
				  bool tupleIsAlive,
				  void *state)
{
	GISTBuildState *buildstate = (GISTBuildState *) state;
	IndexTuple	itup;
	MemoryContext oldCtx;

	/* Work in the per-tuple context; it is reset at the bottom. */
	oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);

	/* form an index tuple and point it at the heap tuple */
	itup = gistFormTuple(buildstate->giststate, index, values, isnull, true);
	itup->t_tid = htup->t_self;

	if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE)
	{
		/* We have buffers, so use them. */
		gistBufferingBuildInsert(buildstate, itup);
	}
	else
	{
		/*
		 * There's no buffers (yet). Since we already have the index relation
		 * locked, we call gistdoinsert directly.
		 */
		gistdoinsert(index, itup, buildstate->freespace,
					 buildstate->giststate);
	}

	/* Update tuple count and total size. */
	buildstate->indtuples += 1;
	buildstate->indtuplesSize += IndexTupleSize(itup);

	MemoryContextSwitchTo(oldCtx);
	MemoryContextReset(buildstate->giststate->tempCxt);

	/*
	 * In buffering mode, periodically re-derive the target buffer size from
	 * the updated tuple-size statistics.
	 */
	if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE &&
		buildstate->indtuples % BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET == 0)
	{
		/* Adjust the target buffer size now */
		buildstate->gfbb->pagesPerBuffer =
			calculatePagesPerBuffer(buildstate, buildstate->gfbb->levelStep);
	}

	/*
	 * In 'auto' mode, check if the index has grown too large to fit in cache,
	 * and switch to buffering mode if it has.
	 *
	 * To avoid excessive calls to smgrnblocks(), only check this every
	 * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples.
	 *
	 * In 'stats' mode, switch as soon as we have processed enough tuples to
	 * have some idea of the average tuple size.
	 */
	if ((buildstate->bufferingMode == GIST_BUFFERING_AUTO &&
		 buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 &&
		 effective_cache_size < smgrnblocks(index->rd_smgr, MAIN_FORKNUM)) ||
		(buildstate->bufferingMode == GIST_BUFFERING_STATS &&
		 buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET))
	{
		/*
		 * Index doesn't fit in effective cache anymore. Try to switch to
		 * buffering build mode.
		 */
		gistInitBuffering(buildstate);
	}
}
525
/*
 * Insert function for buffering index build.
 *
 * Runs the tuple down from the root into a node buffer (or a leaf page),
 * then drains any buffers that the descent caused to overflow.
 */
static void
gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup)
{
	/* Insert the tuple to buffers, starting at the root (block 0). */
	gistProcessItup(buildstate, itup, 0, buildstate->gfbb->rootlevel);

	/* If we filled up (half of a) buffer, process buffer emptying. */
	gistProcessEmptyingQueue(buildstate);
}
538
/*
 * Process an index tuple. Runs the tuple down the tree until we reach a leaf
 * page or node buffer, and inserts the tuple there. Returns true if we have
 * to stop buffer emptying process (because one of child buffers can't take
 * index tuples anymore).
 */
static bool
gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
				BlockNumber startblkno, int startlevel)
{
	GISTSTATE  *giststate = buildstate->giststate;
	GISTBuildBuffers *gfbb = buildstate->gfbb;
	Relation	indexrel = buildstate->indexrel;
	BlockNumber childblkno;
	Buffer		buffer;
	bool		result = false;
	BlockNumber blkno;
	int			level;
	OffsetNumber downlinkoffnum = InvalidOffsetNumber;
	BlockNumber parentblkno = InvalidBlockNumber;

	CHECK_FOR_INTERRUPTS();

	/*
	 * Loop until we reach a leaf page (level == 0) or a level with buffers
	 * (not including the level we start at, because we would otherwise make
	 * no progress).
	 */
	blkno = startblkno;
	level = startlevel;
	for (;;)
	{
		ItemId		iid;
		IndexTuple	idxtuple,
					newtup;
		Page		page;
		OffsetNumber childoffnum;

		/* Have we reached a level with buffers? */
		if (LEVEL_HAS_BUFFERS(level, gfbb) && level != startlevel)
			break;

		/* Have we reached a leaf page? */
		if (level == 0)
			break;

		/*
		 * Nope. Descend down to the next level then. Choose a child to
		 * descend down to.
		 */

		buffer = ReadBuffer(indexrel, blkno);
		LockBuffer(buffer, GIST_EXCLUSIVE);

		page = (Page) BufferGetPage(buffer);
		childoffnum = gistchoose(indexrel, page, itup, giststate);
		iid = PageGetItemId(page, childoffnum);
		idxtuple = (IndexTuple) PageGetItem(page, iid);
		childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));

		/*
		 * The parent map tracks internal pages only, so record the child's
		 * parent only when the child is above the leaf level.
		 */
		if (level > 1)
			gistMemorizeParent(buildstate, childblkno, blkno);

		/*
		 * Check that the key representing the target child node is consistent
		 * with the key we're inserting. Update it if it's not.
		 */
		newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate);
		if (newtup)
		{
			blkno = gistbufferinginserttuples(buildstate,
											  buffer,
											  level,
											  &newtup,
											  1,
											  childoffnum,
											  InvalidBlockNumber,
											  InvalidOffsetNumber);
			/* gistbufferinginserttuples() released the buffer */
		}
		else
			UnlockReleaseBuffer(buffer);

		/* Descend to the child */
		parentblkno = blkno;
		blkno = childblkno;
		downlinkoffnum = childoffnum;
		Assert(level > 0);
		level--;
	}

	if (LEVEL_HAS_BUFFERS(level, gfbb))
	{
		/*
		 * We've reached level with buffers. Place the index tuple to the
		 * buffer, and add the buffer to the emptying queue if it overflows.
		 */
		GISTNodeBuffer *childNodeBuffer;

		/* Find the buffer or create a new one */
		childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, blkno, level);

		/* Add index tuple to it */
		gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup);

		/* Tell the caller to pause emptying if this buffer overflowed */
		if (BUFFER_OVERFLOWED(childNodeBuffer, gfbb))
			result = true;
	}
	else
	{
		/*
		 * We've reached a leaf page. Place the tuple here.
		 */
		Assert(level == 0);
		buffer = ReadBuffer(indexrel, blkno);
		LockBuffer(buffer, GIST_EXCLUSIVE);
		gistbufferinginserttuples(buildstate, buffer, level,
								  &itup, 1, InvalidOffsetNumber,
								  parentblkno, downlinkoffnum);
		/* gistbufferinginserttuples() released the buffer */
	}

	return result;
}
663
664 /*
665  * Insert tuples to a given page.
666  *
667  * This is analogous with gistinserttuples() in the regular insertion code.
668  *
669  * Returns the block number of the page where the (first) new or updated tuple
670  * was inserted. Usually that's the original page, but might be a sibling page
671  * if the original page was split.
672  *
673  * Caller should hold a lock on 'buffer' on entry. This function will unlock
674  * and unpin it.
675  */
676 static BlockNumber
677 gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level,
678                                                   IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
679                                                   BlockNumber parentblk, OffsetNumber downlinkoffnum)
680 {
681         GISTBuildBuffers *gfbb = buildstate->gfbb;
682         List       *splitinfo;
683         bool            is_split;
684         BlockNumber placed_to_blk = InvalidBlockNumber;
685
686         is_split = gistplacetopage(buildstate->indexrel,
687                                                            buildstate->freespace,
688                                                            buildstate->giststate,
689                                                            buffer,
690                                                            itup, ntup, oldoffnum, &placed_to_blk,
691                                                            InvalidBuffer,
692                                                            &splitinfo,
693                                                            false);
694
695         /*
696          * If this is a root split, update the root path item kept in memory. This
697          * ensures that all path stacks are always complete, including all parent
698          * nodes up to the root. That simplifies the algorithm to re-find correct
699          * parent.
700          */
701         if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO)
702         {
703                 Page            page = BufferGetPage(buffer);
704                 OffsetNumber off;
705                 OffsetNumber maxoff;
706
707                 Assert(level == gfbb->rootlevel);
708                 gfbb->rootlevel++;
709
710                 elog(DEBUG2, "splitting GiST root page, now %d levels deep", gfbb->rootlevel);
711
712                 /*
713                  * All the downlinks on the old root page are now on one of the child
714                  * pages. Visit all the new child pages to memorize the parents of the
715                  * grandchildren.
716                  */
717                 if (gfbb->rootlevel > 1)
718                 {
719                         maxoff = PageGetMaxOffsetNumber(page);
720                         for (off = FirstOffsetNumber; off <= maxoff; off++)
721                         {
722                                 ItemId          iid = PageGetItemId(page, off);
723                                 IndexTuple      idxtuple = (IndexTuple) PageGetItem(page, iid);
724                                 BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
725                                 Buffer          childbuf = ReadBuffer(buildstate->indexrel, childblkno);
726
727                                 LockBuffer(childbuf, GIST_SHARE);
728                                 gistMemorizeAllDownlinks(buildstate, childbuf);
729                                 UnlockReleaseBuffer(childbuf);
730
731                                 /*
732                                  * Also remember that the parent of the new child page is the
733                                  * root block.
734                                  */
735                                 gistMemorizeParent(buildstate, childblkno, GIST_ROOT_BLKNO);
736                         }
737                 }
738         }
739
740         if (splitinfo)
741         {
742                 /*
743                  * Insert the downlinks to the parent. This is analogous with
744                  * gistfinishsplit() in the regular insertion code, but the locking is
745                  * simpler, and we have to maintain the buffers on internal nodes and
746                  * the parent map.
747                  */
748                 IndexTuple *downlinks;
749                 int                     ndownlinks,
750                                         i;
751                 Buffer          parentBuffer;
752                 ListCell   *lc;
753
754                 /* Parent may have changed since we memorized this path. */
755                 parentBuffer =
756                         gistBufferingFindCorrectParent(buildstate,
757                                                                                    BufferGetBlockNumber(buffer),
758                                                                                    level,
759                                                                                    &parentblk,
760                                                                                    &downlinkoffnum);
761
762                 /*
763                  * If there's a buffer associated with this page, that needs to be
764                  * split too. gistRelocateBuildBuffersOnSplit() will also adjust the
765                  * downlinks in 'splitinfo', to make sure they're consistent not only
766                  * with the tuples already on the pages, but also the tuples in the
767                  * buffers that will eventually be inserted to them.
768                  */
769                 gistRelocateBuildBuffersOnSplit(gfbb,
770                                                                                 buildstate->giststate,
771                                                                                 buildstate->indexrel,
772                                                                                 level,
773                                                                                 buffer, splitinfo);
774
775                 /* Create an array of all the downlink tuples */
776                 ndownlinks = list_length(splitinfo);
777                 downlinks = (IndexTuple *) palloc(sizeof(IndexTuple) * ndownlinks);
778                 i = 0;
779                 foreach(lc, splitinfo)
780                 {
781                         GISTPageSplitInfo *splitinfo = lfirst(lc);
782
783                         /*
784                          * Remember the parent of each new child page in our parent map.
785                          * This assumes that the downlinks fit on the parent page. If the
786                          * parent page is split, too, when we recurse up to insert the
787                          * downlinks, the recursive gistbufferinginserttuples() call will
788                          * update the map again.
789                          */
790                         if (level > 0)
791                                 gistMemorizeParent(buildstate,
792                                                                    BufferGetBlockNumber(splitinfo->buf),
793                                                                    BufferGetBlockNumber(parentBuffer));
794
795                         /*
796                          * Also update the parent map for all the downlinks that got moved
797                          * to a different page. (actually this also loops through the
798                          * downlinks that stayed on the original page, but it does no
799                          * harm).
800                          */
801                         if (level > 1)
802                                 gistMemorizeAllDownlinks(buildstate, splitinfo->buf);
803
804                         /*
805                          * Since there's no concurrent access, we can release the lower
806                          * level buffers immediately. This includes the original page.
807                          */
808                         UnlockReleaseBuffer(splitinfo->buf);
809                         downlinks[i++] = splitinfo->downlink;
810                 }
811
812                 /* Insert them into parent. */
813                 gistbufferinginserttuples(buildstate, parentBuffer, level + 1,
814                                                                   downlinks, ndownlinks, downlinkoffnum,
815                                                                   InvalidBlockNumber, InvalidOffsetNumber);
816
817                 list_free_deep(splitinfo);      /* we don't need this anymore */
818         }
819         else
820                 UnlockReleaseBuffer(buffer);
821
822         return placed_to_blk;
823 }
824
825 /*
826  * Find the downlink pointing to a child page.
827  *
828  * 'childblkno' indicates the child page to find the parent for. 'level' is
829  * the level of the child. On entry, *parentblkno and *downlinkoffnum can
830  * point to a location where the downlink used to be - we will check that
831  * location first, and save some cycles if it hasn't moved. The function
832  * returns a buffer containing the downlink, exclusively-locked, and
833  * *parentblkno and *downlinkoffnum are set to the real location of the
834  * downlink.
835  *
836  * If the child page is a leaf (level == 0), the caller must supply a correct
837  * parentblkno. Otherwise we use the parent map hash table to find the parent
838  * block.
839  *
840  * This function serves the same purpose as gistFindCorrectParent() during
841  * normal index inserts, but this is simpler because we don't need to deal
842  * with concurrent inserts.
843  */
844 static Buffer
845 gistBufferingFindCorrectParent(GISTBuildState *buildstate,
846                                                            BlockNumber childblkno, int level,
847                                                            BlockNumber *parentblkno,
848                                                            OffsetNumber *downlinkoffnum)
849 {
850         BlockNumber parent;
851         Buffer          buffer;
852         Page            page;
853         OffsetNumber maxoff;
854         OffsetNumber off;
855
856         if (level > 0)
857                 parent = gistGetParent(buildstate, childblkno);
858         else
859         {
860                 /*
861                  * For a leaf page, the caller must supply a correct parent block
862                  * number.
863                  */
864                 if (*parentblkno == InvalidBlockNumber)
865                         elog(ERROR, "no parent buffer provided of child %d", childblkno);
866                 parent = *parentblkno;
867         }
868
869         buffer = ReadBuffer(buildstate->indexrel, parent);
870         page = BufferGetPage(buffer);
871         LockBuffer(buffer, GIST_EXCLUSIVE);
872         gistcheckpage(buildstate->indexrel, buffer);
873         maxoff = PageGetMaxOffsetNumber(page);
874
875         /* Check if it was not moved */
876         if (parent == *parentblkno && *parentblkno != InvalidBlockNumber &&
877                 *downlinkoffnum != InvalidOffsetNumber && *downlinkoffnum <= maxoff)
878         {
879                 ItemId          iid = PageGetItemId(page, *downlinkoffnum);
880                 IndexTuple      idxtuple = (IndexTuple) PageGetItem(page, iid);
881
882                 if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
883                 {
884                         /* Still there */
885                         return buffer;
886                 }
887         }
888
889         /*
890          * Downlink was not at the offset where it used to be. Scan the page to
891          * find it. During normal gist insertions, it might've moved to another
892          * page, to the right, but during a buffering build, we keep track of the
893          * parent of each page in the lookup table so we should always know what
894          * page it's on.
895          */
896         for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off))
897         {
898                 ItemId          iid = PageGetItemId(page, off);
899                 IndexTuple      idxtuple = (IndexTuple) PageGetItem(page, iid);
900
901                 if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
902                 {
903                         /* yes!!, found it */
904                         *downlinkoffnum = off;
905                         return buffer;
906                 }
907         }
908
909         elog(ERROR, "failed to re-find parent for block %u", childblkno);
910         return InvalidBuffer;           /* keep compiler quiet */
911 }
912
913 /*
914  * Process buffers emptying stack. Emptying of one buffer can cause emptying
915  * of other buffers. This function iterates until this cascading emptying
916  * process finished, e.g. until buffers emptying stack is empty.
917  */
918 static void
919 gistProcessEmptyingQueue(GISTBuildState *buildstate)
920 {
921         GISTBuildBuffers *gfbb = buildstate->gfbb;
922
923         /* Iterate while we have elements in buffers emptying stack. */
924         while (gfbb->bufferEmptyingQueue != NIL)
925         {
926                 GISTNodeBuffer *emptyingNodeBuffer;
927
928                 /* Get node buffer from emptying stack. */
929                 emptyingNodeBuffer = (GISTNodeBuffer *) linitial(gfbb->bufferEmptyingQueue);
930                 gfbb->bufferEmptyingQueue = list_delete_first(gfbb->bufferEmptyingQueue);
931                 emptyingNodeBuffer->queuedForEmptying = false;
932
933                 /*
934                  * We are going to load last pages of buffers where emptying will be
935                  * to. So let's unload any previously loaded buffers.
936                  */
937                 gistUnloadNodeBuffers(gfbb);
938
939                 /*
940                  * Pop tuples from the buffer and run them down to the buffers at
941                  * lower level, or leaf pages. We continue until one of the lower
942                  * level buffers fills up, or this buffer runs empty.
943                  *
944                  * In Arge et al's paper, the buffer emptying is stopped after
945                  * processing 1/2 node buffer worth of tuples, to avoid overfilling
946                  * any of the lower level buffers. However, it's more efficient to
947                  * keep going until one of the lower level buffers actually fills up,
948                  * so that's what we do. This doesn't need to be exact, if a buffer
949                  * overfills by a few tuples, there's no harm done.
950                  */
951                 while (true)
952                 {
953                         IndexTuple      itup;
954
955                         /* Get next index tuple from the buffer */
956                         if (!gistPopItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup))
957                                 break;
958
959                         /*
960                          * Run it down to the underlying node buffer or leaf page.
961                          *
962                          * Note: it's possible that the buffer we're emptying splits as a
963                          * result of this call. If that happens, our emptyingNodeBuffer
964                          * points to the left half of the split. After split, it's very
965                          * likely that the new left buffer is no longer over the half-full
966                          * threshold, but we might as well keep flushing tuples from it
967                          * until we fill a lower-level buffer.
968                          */
969                         if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->nodeBlocknum, emptyingNodeBuffer->level))
970                         {
971                                 /*
972                                  * A lower level buffer filled up. Stop emptying this buffer,
973                                  * to avoid overflowing the lower level buffer.
974                                  */
975                                 break;
976                         }
977
978                         /* Free all the memory allocated during index tuple processing */
979                         MemoryContextReset(buildstate->giststate->tempCxt);
980                 }
981         }
982 }
983
984 /*
985  * Empty all node buffers, from top to bottom. This is done at the end of
986  * index build to flush all remaining tuples to the index.
987  *
988  * Note: This destroys the buffersOnLevels lists, so the buffers should not
989  * be inserted to after this call.
990  */
991 static void
992 gistEmptyAllBuffers(GISTBuildState *buildstate)
993 {
994         GISTBuildBuffers *gfbb = buildstate->gfbb;
995         MemoryContext oldCtx;
996         int                     i;
997
998         oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
999
1000         /*
1001          * Iterate through the levels from top to bottom.
1002          */
1003         for (i = gfbb->buffersOnLevelsLen - 1; i >= 0; i--)
1004         {
1005                 /*
1006                  * Empty all buffers on this level. Note that new buffers can pop up
1007                  * in the list during the processing, as a result of page splits, so a
1008                  * simple walk through the list won't work. We remove buffers from the
1009                  * list when we see them empty; a buffer can't become non-empty once
1010                  * it's been fully emptied.
1011                  */
1012                 while (gfbb->buffersOnLevels[i] != NIL)
1013                 {
1014                         GISTNodeBuffer *nodeBuffer;
1015
1016                         nodeBuffer = (GISTNodeBuffer *) linitial(gfbb->buffersOnLevels[i]);
1017
1018                         if (nodeBuffer->blocksCount != 0)
1019                         {
1020                                 /*
1021                                  * Add this buffer to the emptying queue, and proceed to empty
1022                                  * the queue.
1023                                  */
1024                                 if (!nodeBuffer->queuedForEmptying)
1025                                 {
1026                                         MemoryContextSwitchTo(gfbb->context);
1027                                         nodeBuffer->queuedForEmptying = true;
1028                                         gfbb->bufferEmptyingQueue =
1029                                                 lcons(nodeBuffer, gfbb->bufferEmptyingQueue);
1030                                         MemoryContextSwitchTo(buildstate->giststate->tempCxt);
1031                                 }
1032                                 gistProcessEmptyingQueue(buildstate);
1033                         }
1034                         else
1035                                 gfbb->buffersOnLevels[i] =
1036                                         list_delete_first(gfbb->buffersOnLevels[i]);
1037                 }
1038                 elog(DEBUG2, "emptied all buffers at level %d", i);
1039         }
1040         MemoryContextSwitchTo(oldCtx);
1041 }
1042
1043 /*
1044  * Get the depth of the GiST index.
1045  */
1046 static int
1047 gistGetMaxLevel(Relation index)
1048 {
1049         int                     maxLevel;
1050         BlockNumber blkno;
1051
1052         /*
1053          * Traverse down the tree, starting from the root, until we hit the leaf
1054          * level.
1055          */
1056         maxLevel = 0;
1057         blkno = GIST_ROOT_BLKNO;
1058         while (true)
1059         {
1060                 Buffer          buffer;
1061                 Page            page;
1062                 IndexTuple      itup;
1063
1064                 buffer = ReadBuffer(index, blkno);
1065
1066                 /*
1067                  * There's no concurrent access during index build, so locking is just
1068                  * pro forma.
1069                  */
1070                 LockBuffer(buffer, GIST_SHARE);
1071                 page = (Page) BufferGetPage(buffer);
1072
1073                 if (GistPageIsLeaf(page))
1074                 {
1075                         /* We hit the bottom, so we're done. */
1076                         UnlockReleaseBuffer(buffer);
1077                         break;
1078                 }
1079
1080                 /*
1081                  * Pick the first downlink on the page, and follow it. It doesn't
1082                  * matter which downlink we choose, the tree has the same depth
1083                  * everywhere, so we just pick the first one.
1084                  */
1085                 itup = (IndexTuple) PageGetItem(page,
1086                                                                                 PageGetItemId(page, FirstOffsetNumber));
1087                 blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
1088                 UnlockReleaseBuffer(buffer);
1089
1090                 /*
1091                  * We're going down on the tree. It means that there is yet one more
1092                  * level in the tree.
1093                  */
1094                 maxLevel++;
1095         }
1096         return maxLevel;
1097 }
1098
1099
1100 /*
1101  * Routines for managing the parent map.
1102  *
1103  * Whenever a page is split, we need to insert the downlinks into the parent.
1104  * We need to somehow find the parent page to do that. In normal insertions,
1105  * we keep a stack of nodes visited when we descend the tree. However, in
1106  * buffering build, we can start descending the tree from any internal node,
1107  * when we empty a buffer by cascading tuples to its children. So we don't
1108  * have a full stack up to the root available at that time.
1109  *
1110  * So instead, we maintain a hash table to track the parent of every internal
1111  * page. We don't need to track the parents of leaf nodes, however. Whenever
1112  * we insert to a leaf, we've just descended down from its parent, so we know
1113  * its immediate parent already. This helps a lot to limit the memory used
1114  * by this hash table.
1115  *
 * Whenever an internal node is split, the parent map needs to be updated:
 * the parent of the new child page needs to be recorded, and also the
 * entries for all pages whose downlinks are moved to a new page at the split
 * need to be updated.
1120  *
1121  * We also update the parent map whenever we descend the tree. That might seem
1122  * unnecessary, because we maintain the map whenever a downlink is moved or
1123  * created, but it is needed because we switch to buffering mode after
1124  * creating a tree with regular index inserts. Any pages created before
1125  * switching to buffering mode will not be present in the parent map initially,
1126  * but will be added there the first time we visit them.
1127  */
1128
/* One entry in the parent map: records the parent page of one child page. */
typedef struct
{
	BlockNumber childblkno;		/* hash key */
	BlockNumber parentblkno;	/* block number of childblkno's parent page */
} ParentMapEntry;
1134
1135 static void
1136 gistInitParentMap(GISTBuildState *buildstate)
1137 {
1138         HASHCTL         hashCtl;
1139
1140         hashCtl.keysize = sizeof(BlockNumber);
1141         hashCtl.entrysize = sizeof(ParentMapEntry);
1142         hashCtl.hcxt = CurrentMemoryContext;
1143         buildstate->parentMap = hash_create("gistbuild parent map",
1144                                                                                 1024,
1145                                                                                 &hashCtl,
1146                                                                                 HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
1147 }
1148
1149 static void
1150 gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, BlockNumber parent)
1151 {
1152         ParentMapEntry *entry;
1153         bool            found;
1154
1155         entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
1156                                                                                    (const void *) &child,
1157                                                                                    HASH_ENTER,
1158                                                                                    &found);
1159         entry->parentblkno = parent;
1160 }
1161
1162 /*
1163  * Scan all downlinks on a page, and memorize their parent.
1164  */
1165 static void
1166 gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parentbuf)
1167 {
1168         OffsetNumber maxoff;
1169         OffsetNumber off;
1170         BlockNumber parentblkno = BufferGetBlockNumber(parentbuf);
1171         Page            page = BufferGetPage(parentbuf);
1172
1173         Assert(!GistPageIsLeaf(page));
1174
1175         maxoff = PageGetMaxOffsetNumber(page);
1176         for (off = FirstOffsetNumber; off <= maxoff; off++)
1177         {
1178                 ItemId          iid = PageGetItemId(page, off);
1179                 IndexTuple      idxtuple = (IndexTuple) PageGetItem(page, iid);
1180                 BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
1181
1182                 gistMemorizeParent(buildstate, childblkno, parentblkno);
1183         }
1184 }
1185
1186 static BlockNumber
1187 gistGetParent(GISTBuildState *buildstate, BlockNumber child)
1188 {
1189         ParentMapEntry *entry;
1190         bool            found;
1191
1192         /* Find node buffer in hash table */
1193         entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
1194                                                                                    (const void *) &child,
1195                                                                                    HASH_FIND,
1196                                                                                    &found);
1197         if (!found)
1198                 elog(ERROR, "could not find parent of block %d in lookup table", child);
1199
1200         return entry->parentblkno;
1201 }