]> granicus.if.org Git - postgresql/blob - src/backend/access/gist/gistbuild.c
Delete the temporary file used in buffered GiST build, after the build.
[postgresql] / src / backend / access / gist / gistbuild.c
1 /*-------------------------------------------------------------------------
2  *
3  * gistbuild.c
4  *        build algorithm for GiST indexes implementation.
5  *
6  *
7  * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * IDENTIFICATION
11  *        src/backend/access/gist/gistbuild.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16
17 #include <math.h>
18
19 #include "access/genam.h"
20 #include "access/gist_private.h"
21 #include "catalog/index.h"
22 #include "miscadmin.h"
23 #include "optimizer/cost.h"
24 #include "storage/bufmgr.h"
25 #include "storage/smgr.h"
26 #include "utils/memutils.h"
27 #include "utils/rel.h"
28
29 /* Interval, in tuples, between checks of whether to switch to buffering build mode */
30 #define BUFFERING_MODE_SWITCH_CHECK_STEP 256
31
32 /*
33  * Number of tuples to process in the slow way before switching to buffering
34  * mode, when buffering is explicitly turned on. Also, the number of tuples
35  * to process between readjusting the buffer size parameter, while in
36  * buffering mode.
37  */
38 #define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096
39
/*
 * Mode of the index build with respect to buffering.  The build starts in
 * one of the three "regular" modes, and may later switch to
 * GIST_BUFFERING_ACTIVE (see gistInitBuffering); once buffering is active
 * or disabled, the mode no longer changes.
 */
typedef enum
{
	GIST_BUFFERING_DISABLED,	/* in regular build mode and aren't going to
								 * switch */
	GIST_BUFFERING_AUTO,		/* in regular build mode, but will switch to
								 * buffering build mode if the index grows too
								 * big */
	GIST_BUFFERING_STATS,		/* gathering statistics of index tuple size
								 * before switching to the buffering build
								 * mode */
	GIST_BUFFERING_ACTIVE		/* in buffering build mode */
}	GistBufferingMode;
52
/* Working state for gistbuild and its callback */
typedef struct
{
	Relation	indexrel;		/* the index relation being built */
	GISTSTATE  *giststate;		/* GiST opclass support info for the index */
	GISTBuildBuffers *gfbb;		/* node buffers; only valid while in
								 * GIST_BUFFERING_ACTIVE mode */

	int64		indtuples;		/* number of tuples indexed */
	int64		indtuplesSize;	/* total size of all indexed tuples */

	Size		freespace;		/* amount of free space to leave on pages */

	GistBufferingMode bufferingMode;	/* current buffering state, see above */
} GISTBuildState;
67
68 static void gistInitBuffering(GISTBuildState *buildstate);
69 static int      calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep);
70 static void gistBuildCallback(Relation index,
71                                   HeapTuple htup,
72                                   Datum *values,
73                                   bool *isnull,
74                                   bool tupleIsAlive,
75                                   void *state);
76 static void gistBufferingBuildInsert(GISTBuildState *buildstate,
77                                                  IndexTuple itup);
78 static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
79                                 GISTBufferingInsertStack *startparent);
80 static void gistbufferinginserttuples(GISTBuildState *buildstate,
81                                                   Buffer buffer,
82                                                   IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
83                                                   GISTBufferingInsertStack *path);
84 static void gistBufferingFindCorrectParent(GISTBuildState *buildstate,
85                                                            GISTBufferingInsertStack *child);
86 static void gistProcessEmptyingQueue(GISTBuildState *buildstate);
87 static void gistEmptyAllBuffers(GISTBuildState *buildstate);
88 static void gistFreeUnreferencedPath(GISTBufferingInsertStack *path);
89 static int      gistGetMaxLevel(Relation index);
90
91
/*
 * Main entry point to GiST index build. Initially calls insert over and over,
 * but switches to more efficient buffering build algorithm after a certain
 * number of tuples (unless buffering mode is disabled).
 */
Datum
gistbuild(PG_FUNCTION_ARGS)
{
	Relation	heap = (Relation) PG_GETARG_POINTER(0);
	Relation	index = (Relation) PG_GETARG_POINTER(1);
	IndexInfo  *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
	IndexBuildResult *result;
	double		reltuples;
	GISTBuildState buildstate;
	Buffer		buffer;
	Page		page;
	MemoryContext oldcxt = CurrentMemoryContext;
	int			fillfactor;

	buildstate.indexrel = index;
	if (index->rd_options)
	{
		/* Get buffering mode from the options string */
		GiSTOptions *options = (GiSTOptions *) index->rd_options;
		char	   *bufferingMode = (char *) options + options->bufferingModeOffset;

		/*
		 * "on" means buffering for sure, but we still need tuple-size
		 * statistics before the buffers can be sized, hence STATS rather
		 * than ACTIVE here.
		 */
		if (strcmp(bufferingMode, "on") == 0)
			buildstate.bufferingMode = GIST_BUFFERING_STATS;
		else if (strcmp(bufferingMode, "off") == 0)
			buildstate.bufferingMode = GIST_BUFFERING_DISABLED;
		else
			buildstate.bufferingMode = GIST_BUFFERING_AUTO;

		fillfactor = options->fillfactor;
	}
	else
	{
		/*
		 * By default, switch to buffering mode when the index grows too large
		 * to fit in cache.
		 */
		buildstate.bufferingMode = GIST_BUFFERING_AUTO;
		fillfactor = GIST_DEFAULT_FILLFACTOR;
	}
	/* Calculate target amount of free space to leave on pages */
	buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100;

	/*
	 * We expect to be called exactly once for any index relation. If that's
	 * not the case, big trouble's what we have.
	 */
	if (RelationGetNumberOfBlocks(index) != 0)
		elog(ERROR, "index \"%s\" already contains data",
			 RelationGetRelationName(index));

	/*
	 * We can't yet handle unlogged GiST indexes, because we depend on LSNs.
	 * This is duplicative of an error in gistbuildempty, but we want to check
	 * here so as to throw error before doing all the index-build work.
	 */
	if (heap->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED)
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("unlogged GiST indexes are not supported")));

	/* no locking is needed */
	buildstate.giststate = initGISTstate(index);

	/*
	 * Create a temporary memory context that is reset once for each tuple
	 * processed.  (Note: we don't bother to make this a child of the
	 * giststate's scanCxt, so we have to delete it separately at the end.)
	 */
	buildstate.giststate->tempCxt = createTempGistContext();

	/* initialize the root page */
	buffer = gistNewBuffer(index);
	Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
	page = BufferGetPage(buffer);

	/*
	 * The critical section covers initializing the root page and emitting
	 * the corresponding WAL record, so they succeed or fail as a unit.
	 */
	START_CRIT_SECTION();

	GISTInitBuffer(buffer, F_LEAF);

	MarkBufferDirty(buffer);

	if (RelationNeedsWAL(index))
	{
		XLogRecPtr	recptr;
		XLogRecData rdata;

		rdata.data = (char *) &(index->rd_node);
		rdata.len = sizeof(RelFileNode);
		rdata.buffer = InvalidBuffer;
		rdata.next = NULL;

		recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX, &rdata);
		PageSetLSN(page, recptr);
		PageSetTLI(page, ThisTimeLineID);
	}
	else
		PageSetLSN(page, GetXLogRecPtrForTemp());

	UnlockReleaseBuffer(buffer);

	END_CRIT_SECTION();

	/* build the index */
	buildstate.indtuples = 0;
	buildstate.indtuplesSize = 0;

	/*
	 * Do the heap scan.  gistBuildCallback is invoked per live tuple and may
	 * flip buildstate.bufferingMode along the way.
	 */
	reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
								   gistBuildCallback, (void *) &buildstate);

	/*
	 * If buffering was used, flush out all the tuples that are still in the
	 * buffers.
	 */
	if (buildstate.bufferingMode == GIST_BUFFERING_ACTIVE)
	{
		elog(DEBUG1, "all tuples processed, emptying buffers");
		gistEmptyAllBuffers(&buildstate);
		gistFreeBuildBuffers(buildstate.gfbb);
	}

	/* okay, all heap tuples are indexed */
	MemoryContextSwitchTo(oldcxt);
	MemoryContextDelete(buildstate.giststate->tempCxt);

	freeGISTstate(buildstate.giststate);

	/*
	 * Return statistics
	 */
	result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

	result->heap_tuples = reltuples;
	result->index_tuples = (double) buildstate.indtuples;

	PG_RETURN_POINTER(result);
}
236
237 /*
238  * Validator for "buffering" reloption on GiST indexes. Allows "on", "off"
239  * and "auto" values.
240  */
241 void
242 gistValidateBufferingOption(char *value)
243 {
244         if (value == NULL ||
245                 (strcmp(value, "on") != 0 &&
246                  strcmp(value, "off") != 0 &&
247                  strcmp(value, "auto") != 0))
248         {
249                 ereport(ERROR,
250                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
251                                  errmsg("invalid value for \"buffering\" option"),
252                            errdetail("Valid values are \"on\", \"off\", and \"auto\".")));
253         }
254 }
255
/*
 * Attempt to switch to buffering mode.
 *
 * If there is not enough memory for buffering build, sets bufferingMode
 * to GIST_BUFFERING_DISABLED, so that we don't bother to try the switch
 * anymore. Otherwise initializes the build buffers, and sets bufferingMode to
 * GIST_BUFFERING_ACTIVE.
 */
static void
gistInitBuffering(GISTBuildState *buildstate)
{
	Relation	index = buildstate->indexrel;
	int			pagesPerBuffer;
	Size		pageFreeSpace;
	Size		itupAvgSize,
				itupMinSize;
	double		avgIndexTuplesPerPage,
				maxIndexTuplesPerPage;
	int			i;
	int			levelStep;

	/* Calculate the space on an index page that is available for index tuples */
	pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
		- sizeof(ItemIdData)
		- buildstate->freespace;

	/*
	 * Calculate average size of already inserted index tuples using gathered
	 * statistics.  (The caller only invokes us after a batch of tuples has
	 * been inserted, so indtuples is nonzero here.)
	 */
	itupAvgSize = (double) buildstate->indtuplesSize /
		(double) buildstate->indtuples;

	/*
	 * Calculate minimal possible size of index tuple by index metadata.
	 * Minimal possible size of varlena is VARHDRSZ.
	 *
	 * XXX: that's not actually true, as a short varlen can be just 2 bytes.
	 * And we should take padding into account here.
	 */
	itupMinSize = (Size) MAXALIGN(sizeof(IndexTupleData));
	for (i = 0; i < index->rd_att->natts; i++)
	{
		if (index->rd_att->attrs[i]->attlen < 0)
			itupMinSize += VARHDRSZ;
		else
			itupMinSize += index->rd_att->attrs[i]->attlen;
	}

	/* Calculate average and maximal number of index tuples that fit on a page */
	avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
	maxIndexTuplesPerPage = pageFreeSpace / itupMinSize;

	/*
	 * We need to calculate two parameters for the buffering algorithm:
	 * levelStep and pagesPerBuffer.
	 *
	 * levelStep determines the size of subtree that we operate on, while
	 * emptying a buffer. A higher value is better, as you need fewer buffer
	 * emptying steps to build the index. However, if you set it too high, the
	 * subtree doesn't fit in cache anymore, and you quickly lose the benefit
	 * of the buffers.
	 *
	 * In Arge et al's paper, levelStep is chosen as logB(M/4B), where B is
	 * the number of tuples on page (ie. fanout), and M is the amount of
	 * internal memory available. Curiously, they don't explain *why* that
	 * setting is optimal. We calculate it by taking the highest levelStep so
	 * that a subtree still fits in cache. For a small B, our way of
	 * calculating levelStep is very close to Arge et al's formula. For a
	 * large B, our formula gives a value that is 2x higher.
	 *
	 * The average size (in pages) of a subtree of depth n can be calculated
	 * as a geometric series:
	 *
	 * B^0 + B^1 + B^2 + ... + B^n = (1 - B^(n + 1)) / (1 - B)
	 *
	 * where B is the average number of index tuples on page. The subtree is
	 * cached in the shared buffer cache and the OS cache, so we choose
	 * levelStep so that the subtree size is comfortably smaller than
	 * effective_cache_size, with a safety factor of 4.
	 *
	 * The estimate on the average number of index tuples on page is based on
	 * average tuple sizes observed before switching to buffered build, so the
	 * real subtree size can be somewhat larger. Also, it would be selfish to
	 * gobble the whole cache for our index build. The safety factor of 4
	 * should account for those effects.
	 *
	 * The other limiting factor for setting levelStep is that while
	 * processing a subtree, we need to hold one page for each buffer at the
	 * next lower buffered level. The max. number of buffers needed for that
	 * is maxIndexTuplesPerPage^levelStep. This is very conservative, but
	 * hopefully maintenance_work_mem is set high enough that you're
	 * constrained by effective_cache_size rather than maintenance_work_mem.
	 *
	 * XXX: the buffer hash table consumes a fair amount of memory too per
	 * buffer, but that is not currently taken into account. That scales on
	 * the total number of buffers used, ie. the index size and on levelStep.
	 * Note that a higher levelStep *reduces* the amount of memory needed for
	 * the hash table.
	 */
	levelStep = 1;
	for (;;)
	{
		double		subtreesize;
		double		maxlowestlevelpages;

		/* size of an average subtree at this levelStep (in pages). */
		subtreesize =
			(1 - pow(avgIndexTuplesPerPage, (double) (levelStep + 1))) /
			(1 - avgIndexTuplesPerPage);

		/* max number of pages at the lowest level of a subtree */
		maxlowestlevelpages = pow(maxIndexTuplesPerPage, (double) levelStep);

		/* subtree must fit in cache (with safety factor of 4) */
		if (subtreesize > effective_cache_size / 4)
			break;

		/* each node in the lowest level of a subtree has one page in memory */
		if (maxlowestlevelpages > ((double) maintenance_work_mem * 1024) / BLCKSZ)
			break;

		/* Good, we can handle this levelStep. See if we can go one higher. */
		levelStep++;
	}

	/*
	 * We just reached an unacceptable value of levelStep in previous loop.
	 * So, decrease levelStep to get last acceptable value.
	 */
	levelStep--;

	/*
	 * If there's not enough cache or maintenance_work_mem, fall back to plain
	 * inserts.
	 */
	if (levelStep <= 0)
	{
		elog(DEBUG1, "failed to switch to buffered GiST build");
		buildstate->bufferingMode = GIST_BUFFERING_DISABLED;
		return;
	}

	/*
	 * The second parameter to set is pagesPerBuffer, which determines the
	 * size of each buffer. We adjust pagesPerBuffer also during the build,
	 * which is why this calculation is in a separate function.
	 */
	pagesPerBuffer = calculatePagesPerBuffer(buildstate, levelStep);

	/* Initialize GISTBuildBuffers with these parameters */
	buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep,
											gistGetMaxLevel(index));

	buildstate->bufferingMode = GIST_BUFFERING_ACTIVE;

	elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d",
		 levelStep, pagesPerBuffer);
}
415
416 /*
417  * Calculate pagesPerBuffer parameter for the buffering algorithm.
418  *
419  * Buffer size is chosen so that assuming that tuples are distributed
420  * randomly, emptying half a buffer fills on average one page in every buffer
421  * at the next lower level.
422  */
423 static int
424 calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep)
425 {
426         double          pagesPerBuffer;
427         double          avgIndexTuplesPerPage;
428         double          itupAvgSize;
429         Size            pageFreeSpace;
430
431         /* Calc space of index page which is available for index tuples */
432         pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
433                 - sizeof(ItemIdData)
434                 - buildstate->freespace;
435
436         /*
437          * Calculate average size of already inserted index tuples using gathered
438          * statistics.
439          */
440         itupAvgSize = (double) buildstate->indtuplesSize /
441                 (double) buildstate->indtuples;
442
443         avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
444
445         /*
446          * Recalculate required size of buffers.
447          */
448         pagesPerBuffer = 2 * pow(avgIndexTuplesPerPage, levelStep);
449
450         return (int) rint(pagesPerBuffer);
451 }
452
/*
 * Per-tuple callback from IndexBuildHeapScan.
 *
 * Forms an index tuple for the given heap tuple and inserts it, either
 * directly with gistdoinsert() or through the node buffers when in
 * buffering mode.  Also maintains the tuple count/size statistics and
 * triggers the switch to buffering mode when the configured conditions
 * are met.
 */
static void
gistBuildCallback(Relation index,
				  HeapTuple htup,
				  Datum *values,
				  bool *isnull,
				  bool tupleIsAlive,
				  void *state)
{
	GISTBuildState *buildstate = (GISTBuildState *) state;
	IndexTuple	itup;
	MemoryContext oldCtx;

	/* All per-tuple allocations go into tempCxt, reset below */
	oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);

	/* form an index tuple and point it at the heap tuple */
	itup = gistFormTuple(buildstate->giststate, index, values, isnull, true);
	itup->t_tid = htup->t_self;

	if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE)
	{
		/* We have buffers, so use them. */
		gistBufferingBuildInsert(buildstate, itup);
	}
	else
	{
		/*
		 * There's no buffers (yet). Since we already have the index relation
		 * locked, we call gistdoinsert directly.
		 */
		gistdoinsert(index, itup, buildstate->freespace,
					 buildstate->giststate);
	}

	/* Update tuple count and total size. */
	buildstate->indtuples += 1;
	buildstate->indtuplesSize += IndexTupleSize(itup);

	/* itup and everything else allocated above is freed by this reset */
	MemoryContextSwitchTo(oldCtx);
	MemoryContextReset(buildstate->giststate->tempCxt);

	if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE &&
		buildstate->indtuples % BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET == 0)
	{
		/* Adjust the target buffer size now */
		buildstate->gfbb->pagesPerBuffer =
			calculatePagesPerBuffer(buildstate, buildstate->gfbb->levelStep);
	}

	/*
	 * In 'auto' mode, check if the index has grown too large to fit in cache,
	 * and switch to buffering mode if it has.
	 *
	 * To avoid excessive calls to smgrnblocks(), only check this every
	 * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples.
	 *
	 * In 'stats' mode, switch as soon as we have collected enough tuple-size
	 * statistics (BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET tuples).
	 */
	if ((buildstate->bufferingMode == GIST_BUFFERING_AUTO &&
		 buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 &&
		 effective_cache_size < smgrnblocks(index->rd_smgr, MAIN_FORKNUM)) ||
		(buildstate->bufferingMode == GIST_BUFFERING_STATS &&
		 buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET))
	{
		/*
		 * Index doesn't fit in effective cache anymore. Try to switch to
		 * buffering build mode.
		 */
		gistInitBuffering(buildstate);
	}
}
524
/*
 * Insert function for buffering index build.
 *
 * Places the tuple into a node buffer or directly onto a leaf page via
 * gistProcessItup(), then drains any buffers that the insertion caused
 * to overflow.
 */
static void
gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup)
{
	/* Insert the tuple to buffers. */
	gistProcessItup(buildstate, itup, NULL);

	/* If we filled up (half of a) buffer, process buffer emptying. */
	gistProcessEmptyingQueue(buildstate);
}
537
/*
 * Process an index tuple. Runs the tuple down the tree until we reach a leaf
 * page or node buffer, and inserts the tuple there. Returns true if we have
 * to stop buffer emptying process (because one of child buffers can't take
 * index tuples anymore).
 *
 * 'startparent' is the path item to start the descent from; NULL means
 * start from the root.
 */
static bool
gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
				GISTBufferingInsertStack *startparent)
{
	GISTSTATE  *giststate = buildstate->giststate;
	GISTBuildBuffers *gfbb = buildstate->gfbb;
	Relation	indexrel = buildstate->indexrel;
	GISTBufferingInsertStack *path;
	BlockNumber childblkno;
	Buffer		buffer;
	bool		result = false;

	/*
	 * NULL passed in startparent means that we start index tuple processing
	 * from the root.
	 */
	if (!startparent)
		path = gfbb->rootitem;
	else
		path = startparent;

	/*
	 * Loop until we reach a leaf page (level == 0) or a level with buffers
	 * (not including the level we start at, because we would otherwise make
	 * no progress).
	 */
	for (;;)
	{
		ItemId		iid;
		IndexTuple	idxtuple,
					newtup;
		Page		page;
		OffsetNumber childoffnum;
		GISTBufferingInsertStack *parent;

		/* Have we reached a level with buffers? */
		if (LEVEL_HAS_BUFFERS(path->level, gfbb) && path != startparent)
			break;

		/* Have we reached a leaf page? */
		if (path->level == 0)
			break;

		/*
		 * Nope. Descend down to the next level then. Choose a child to
		 * descend down to.
		 */
		buffer = ReadBuffer(indexrel, path->blkno);
		LockBuffer(buffer, GIST_EXCLUSIVE);

		page = (Page) BufferGetPage(buffer);
		childoffnum = gistchoose(indexrel, page, itup, giststate);
		iid = PageGetItemId(page, childoffnum);
		idxtuple = (IndexTuple) PageGetItem(page, iid);
		childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));

		/*
		 * Check that the key representing the target child node is consistent
		 * with the key we're inserting. Update it if it's not.
		 */
		newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate);
		if (newtup)
			gistbufferinginserttuples(buildstate, buffer, &newtup, 1,
									  childoffnum, path);
		UnlockReleaseBuffer(buffer);

		/* Create new path item representing current page */
		parent = path;
		path = (GISTBufferingInsertStack *) MemoryContextAlloc(gfbb->context,
										   sizeof(GISTBufferingInsertStack));
		path->parent = parent;
		path->level = parent->level - 1;
		path->blkno = childblkno;
		path->downlinkoffnum = childoffnum;
		path->refCount = 0;		/* it's unreferenced for now */

		/*
		 * Adjust reference count of parent.  (parent cannot actually be NULL
		 * here, since 'path' was dereferenced above; the check is merely
		 * defensive.)
		 */
		if (parent)
			parent->refCount++;
	}

	if (LEVEL_HAS_BUFFERS(path->level, gfbb))
	{
		/*
		 * We've reached level with buffers. Place the index tuple to the
		 * buffer, and add the buffer to the emptying queue if it overflows.
		 */
		GISTNodeBuffer *childNodeBuffer;

		/* Find the buffer or create a new one */
		childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, path->blkno,
										 path->downlinkoffnum, path->parent);

		/* Add index tuple to it */
		gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup);

		/* Tell caller to pause emptying if this child buffer overflowed */
		if (BUFFER_OVERFLOWED(childNodeBuffer, gfbb))
			result = true;
	}
	else
	{
		/*
		 * We've reached a leaf page. Place the tuple here.
		 */
		buffer = ReadBuffer(indexrel, path->blkno);
		LockBuffer(buffer, GIST_EXCLUSIVE);
		gistbufferinginserttuples(buildstate, buffer, &itup, 1,
								  InvalidOffsetNumber, path);
		UnlockReleaseBuffer(buffer);
	}

	/*
	 * Free unreferenced path items, if any. Path item may be referenced by
	 * node buffer.
	 */
	gistFreeUnreferencedPath(path);

	return result;
}
663
664 /*
665  * Insert tuples to a given page.
666  *
667  * This is analogous with gistinserttuples() in the regular insertion code.
668  */
669 static void
670 gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer,
671                                                   IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
672                                                   GISTBufferingInsertStack *path)
673 {
674         GISTBuildBuffers *gfbb = buildstate->gfbb;
675         List       *splitinfo;
676         bool            is_split;
677
678         is_split = gistplacetopage(buildstate->indexrel,
679                                                            buildstate->freespace,
680                                                            buildstate->giststate,
681                                                            buffer,
682                                                            itup, ntup, oldoffnum,
683                                                            InvalidBuffer,
684                                                            &splitinfo,
685                                                            false);
686
687         /*
688          * If this is a root split, update the root path item kept in memory. This
689          * ensures that all path stacks are always complete, including all parent
690          * nodes up to the root. That simplifies the algorithm to re-find correct
691          * parent.
692          */
693         if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO)
694         {
695                 GISTBufferingInsertStack *oldroot = gfbb->rootitem;
696                 Page            page = BufferGetPage(buffer);
697                 ItemId          iid;
698                 IndexTuple      idxtuple;
699                 BlockNumber leftmostchild;
700
701                 gfbb->rootitem = (GISTBufferingInsertStack *) MemoryContextAlloc(
702                                                         gfbb->context, sizeof(GISTBufferingInsertStack));
703                 gfbb->rootitem->parent = NULL;
704                 gfbb->rootitem->blkno = GIST_ROOT_BLKNO;
705                 gfbb->rootitem->downlinkoffnum = InvalidOffsetNumber;
706                 gfbb->rootitem->level = oldroot->level + 1;
707                 gfbb->rootitem->refCount = 1;
708
709                 /*
710                  * All the downlinks on the old root page are now on one of the child
711                  * pages. Change the block number of the old root entry in the stack
712                  * to point to the leftmost child. The other child pages will be
713                  * accessible from there by walking right.
714                  */
715                 iid = PageGetItemId(page, FirstOffsetNumber);
716                 idxtuple = (IndexTuple) PageGetItem(page, iid);
717                 leftmostchild = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
718
719                 oldroot->parent = gfbb->rootitem;
720                 oldroot->blkno = leftmostchild;
721                 oldroot->downlinkoffnum = InvalidOffsetNumber;
722         }
723
724         if (splitinfo)
725         {
726                 /*
727                  * Insert the downlinks to the parent. This is analogous with
728                  * gistfinishsplit() in the regular insertion code, but the locking is
729                  * simpler, and we have to maintain the buffers.
730                  */
731                 IndexTuple *downlinks;
732                 int                     ndownlinks,
733                                         i;
734                 Buffer          parentBuffer;
735                 ListCell   *lc;
736
737                 /* Parent may have changed since we memorized this path. */
738                 gistBufferingFindCorrectParent(buildstate, path);
739
740                 /*
741                  * If there's a buffer associated with this page, that needs to be
742                  * split too. gistRelocateBuildBuffersOnSplit() will also adjust the
743                  * downlinks in 'splitinfo', to make sure they're consistent not only
744                  * with the tuples already on the pages, but also the tuples in the
745                  * buffers that will eventually be inserted to them.
746                  */
747                 gistRelocateBuildBuffersOnSplit(gfbb,
748                                                                                 buildstate->giststate,
749                                                                                 buildstate->indexrel,
750                                                                                 path, buffer, splitinfo);
751
752                 /* Create an array of all the downlink tuples */
753                 ndownlinks = list_length(splitinfo);
754                 downlinks = (IndexTuple *) palloc(sizeof(IndexTuple) * ndownlinks);
755                 i = 0;
756                 foreach(lc, splitinfo)
757                 {
758                         GISTPageSplitInfo *splitinfo = lfirst(lc);
759
760                         /*
761                          * Since there's no concurrent access, we can release the lower
762                          * level buffers immediately. Don't release the buffer for the
763                          * original page, though, because the caller will release that.
764                          */
765                         if (splitinfo->buf != buffer)
766                                 UnlockReleaseBuffer(splitinfo->buf);
767                         downlinks[i++] = splitinfo->downlink;
768                 }
769
770                 /* Insert them into parent. */
771                 parentBuffer = ReadBuffer(buildstate->indexrel, path->parent->blkno);
772                 LockBuffer(parentBuffer, GIST_EXCLUSIVE);
773                 gistbufferinginserttuples(buildstate, parentBuffer,
774                                                                   downlinks, ndownlinks,
775                                                                   path->downlinkoffnum, path->parent);
776                 UnlockReleaseBuffer(parentBuffer);
777
778                 list_free_deep(splitinfo);              /* we don't need this anymore */
779         }
780 }
781
/*
 * Re-find the parent of 'child' by following rightlinks, in a buffering
 * index build.  This method of locating the parent works because no
 * concurrent activity is possible while the index is being built.
 *
 * On entry, child->parent and child->downlinkoffnum record where the
 * downlink pointing to child->blkno was last seen; page splits may have
 * moved it since.  On exit they are updated to the downlink's current
 * location.  If the parent stack entry has to be modified, it is first
 * copied, because other path stacks may still reference the old entry.
 */
static void
gistBufferingFindCorrectParent(GISTBuildState *buildstate,
							   GISTBufferingInsertStack *child)
{
	GISTBuildBuffers *gfbb = buildstate->gfbb;
	Relation	indexrel = buildstate->indexrel;
	GISTBufferingInsertStack *parent = child->parent;
	OffsetNumber i,
				maxoff;
	ItemId		iid;
	IndexTuple	idxtuple;
	Buffer		buffer;
	Page		page;
	bool		copied = false;

	buffer = ReadBuffer(indexrel, parent->blkno);
	page = BufferGetPage(buffer);
	LockBuffer(buffer, GIST_EXCLUSIVE);
	gistcheckpage(indexrel, buffer);

	/* Fast path: check whether the downlink is still where we left it */
	if (child->downlinkoffnum != InvalidOffsetNumber &&
		child->downlinkoffnum <= PageGetMaxOffsetNumber(page))
	{
		iid = PageGetItemId(page, child->downlinkoffnum);
		idxtuple = (IndexTuple) PageGetItem(page, iid);
		if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno)
		{
			/* Still there */
			UnlockReleaseBuffer(buffer);
			return;
		}
	}

	/*
	 * The downlink has moved (presumably due to a page split).  Scan this
	 * page, then follow rightlinks, until we find the downlink again.
	 */
	while (true)
	{
		/* Search for the relevant downlink on the current page */
		maxoff = PageGetMaxOffsetNumber(page);
		for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
		{
			iid = PageGetItemId(page, i);
			idxtuple = (IndexTuple) PageGetItem(page, iid);
			if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno)
			{
				/* Found it: remember the new location and we're done */
				child->downlinkoffnum = i;
				UnlockReleaseBuffer(buffer);
				return;
			}
		}

		/*
		 * We're about to modify the parent path item.  Copy it first,
		 * because other path items may still refer to the old one.
		 */
		if (!copied)
		{
			parent = (GISTBufferingInsertStack *) MemoryContextAlloc(gfbb->context,
										   sizeof(GISTBufferingInsertStack));
			memcpy(parent, child->parent, sizeof(GISTBufferingInsertStack));
			if (parent->parent)
				parent->parent->refCount++;
			gistDecreasePathRefcount(child->parent);
			child->parent = parent;
			parent->refCount = 1;
			copied = true;
		}

		/*
		 * Not found on the current page.  Move right.
		 */
		parent->blkno = GistPageGetOpaque(page)->rightlink;
		UnlockReleaseBuffer(buffer);

		if (parent->blkno == InvalidBlockNumber)
		{
			/*
			 * End of chain and still didn't find parent. Should not happen
			 * during index build.
			 */
			break;
		}

		/* Get the next page */
		buffer = ReadBuffer(indexrel, parent->blkno);
		page = BufferGetPage(buffer);
		LockBuffer(buffer, GIST_EXCLUSIVE);
		gistcheckpage(indexrel, buffer);
	}

	elog(ERROR, "failed to re-find parent for block %u", child->blkno);
}
880
881 /*
882  * Process buffers emptying stack. Emptying of one buffer can cause emptying
883  * of other buffers. This function iterates until this cascading emptying
884  * process finished, e.g. until buffers emptying stack is empty.
885  */
886 static void
887 gistProcessEmptyingQueue(GISTBuildState *buildstate)
888 {
889         GISTBuildBuffers *gfbb = buildstate->gfbb;
890
891         /* Iterate while we have elements in buffers emptying stack. */
892         while (gfbb->bufferEmptyingQueue != NIL)
893         {
894                 GISTNodeBuffer *emptyingNodeBuffer;
895
896                 /* Get node buffer from emptying stack. */
897                 emptyingNodeBuffer = (GISTNodeBuffer *) linitial(gfbb->bufferEmptyingQueue);
898                 gfbb->bufferEmptyingQueue = list_delete_first(gfbb->bufferEmptyingQueue);
899                 emptyingNodeBuffer->queuedForEmptying = false;
900
901                 /*
902                  * We are going to load last pages of buffers where emptying will be
903                  * to. So let's unload any previously loaded buffers.
904                  */
905                 gistUnloadNodeBuffers(gfbb);
906
907                 /*
908                  * Pop tuples from the buffer and run them down to the buffers at
909                  * lower level, or leaf pages. We continue until one of the lower
910                  * level buffers fills up, or this buffer runs empty.
911                  *
912                  * In Arge et al's paper, the buffer emptying is stopped after
913                  * processing 1/2 node buffer worth of tuples, to avoid overfilling
914                  * any of the lower level buffers. However, it's more efficient to
915                  * keep going until one of the lower level buffers actually fills up,
916                  * so that's what we do. This doesn't need to be exact, if a buffer
917                  * overfills by a few tuples, there's no harm done.
918                  */
919                 while (true)
920                 {
921                         IndexTuple      itup;
922
923                         /* Get next index tuple from the buffer */
924                         if (!gistPopItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup))
925                                 break;
926
927                         /*
928                          * Run it down to the underlying node buffer or leaf page.
929                          *
930                          * Note: it's possible that the buffer we're emptying splits as a
931                          * result of this call. If that happens, our emptyingNodeBuffer
932                          * points to the left half of the split. After split, it's very
933                          * likely that the new left buffer is no longer over the half-full
934                          * threshold, but we might as well keep flushing tuples from it
935                          * until we fill a lower-level buffer.
936                          */
937                         if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->path))
938                         {
939                                 /*
940                                  * A lower level buffer filled up. Stop emptying this buffer,
941                                  * to avoid overflowing the lower level buffer.
942                                  */
943                                 break;
944                         }
945
946                         /* Free all the memory allocated during index tuple processing */
947                         MemoryContextReset(buildstate->giststate->tempCxt);
948                 }
949         }
950 }
951
952 /*
953  * Empty all node buffers, from top to bottom. This is done at the end of
954  * index build to flush all remaining tuples to the index.
955  *
956  * Note: This destroys the buffersOnLevels lists, so the buffers should not
957  * be inserted to after this call.
958  */
959 static void
960 gistEmptyAllBuffers(GISTBuildState *buildstate)
961 {
962         GISTBuildBuffers *gfbb = buildstate->gfbb;
963         MemoryContext oldCtx;
964         int                     i;
965
966         oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
967
968         /*
969          * Iterate through the levels from top to bottom.
970          */
971         for (i = gfbb->buffersOnLevelsLen - 1; i >= 0; i--)
972         {
973                 /*
974                  * Empty all buffers on this level. Note that new buffers can pop up
975                  * in the list during the processing, as a result of page splits, so a
976                  * simple walk through the list won't work. We remove buffers from the
977                  * list when we see them empty; a buffer can't become non-empty once
978                  * it's been fully emptied.
979                  */
980                 while (gfbb->buffersOnLevels[i] != NIL)
981                 {
982                         GISTNodeBuffer *nodeBuffer;
983
984                         nodeBuffer = (GISTNodeBuffer *) linitial(gfbb->buffersOnLevels[i]);
985
986                         if (nodeBuffer->blocksCount != 0)
987                         {
988                                 /*
989                                  * Add this buffer to the emptying queue, and proceed to empty
990                                  * the queue.
991                                  */
992                                 if (!nodeBuffer->queuedForEmptying)
993                                 {
994                                         MemoryContextSwitchTo(gfbb->context);
995                                         nodeBuffer->queuedForEmptying = true;
996                                         gfbb->bufferEmptyingQueue =
997                                                 lcons(nodeBuffer, gfbb->bufferEmptyingQueue);
998                                         MemoryContextSwitchTo(buildstate->giststate->tempCxt);
999                                 }
1000                                 gistProcessEmptyingQueue(buildstate);
1001                         }
1002                         else
1003                                 gfbb->buffersOnLevels[i] =
1004                                         list_delete_first(gfbb->buffersOnLevels[i]);
1005                 }
1006         }
1007         MemoryContextSwitchTo(oldCtx);
1008 }
1009
1010 /*
1011  * Free unreferenced parts of a path stack.
1012  */
1013 static void
1014 gistFreeUnreferencedPath(GISTBufferingInsertStack *path)
1015 {
1016         while (path->refCount == 0)
1017         {
1018                 /*
1019                  * Path part is unreferenced. We can free it and decrease reference
1020                  * count of parent. If parent becomes unreferenced too procedure
1021                  * should be repeated for it.
1022                  */
1023                 GISTBufferingInsertStack *tmp = path->parent;
1024
1025                 pfree(path);
1026                 path = tmp;
1027                 if (path)
1028                         path->refCount--;
1029                 else
1030                         break;
1031         }
1032 }
1033
1034 /*
1035  * Decrease reference count of a path part, and free any unreferenced parts of
1036  * the path stack.
1037  */
1038 void
1039 gistDecreasePathRefcount(GISTBufferingInsertStack *path)
1040 {
1041         path->refCount--;
1042         gistFreeUnreferencedPath(path);
1043 }
1044
1045 /*
1046  * Get the depth of the GiST index.
1047  */
1048 static int
1049 gistGetMaxLevel(Relation index)
1050 {
1051         int                     maxLevel;
1052         BlockNumber blkno;
1053
1054         /*
1055          * Traverse down the tree, starting from the root, until we hit the leaf
1056          * level.
1057          */
1058         maxLevel = 0;
1059         blkno = GIST_ROOT_BLKNO;
1060         while (true)
1061         {
1062                 Buffer          buffer;
1063                 Page            page;
1064                 IndexTuple      itup;
1065
1066                 buffer = ReadBuffer(index, blkno);
1067
1068                 /*
1069                  * There's no concurrent access during index build, so locking is just
1070                  * pro forma.
1071                  */
1072                 LockBuffer(buffer, GIST_SHARE);
1073                 page = (Page) BufferGetPage(buffer);
1074
1075                 if (GistPageIsLeaf(page))
1076                 {
1077                         /* We hit the bottom, so we're done. */
1078                         UnlockReleaseBuffer(buffer);
1079                         break;
1080                 }
1081
1082                 /*
1083                  * Pick the first downlink on the page, and follow it. It doesn't
1084                  * matter which downlink we choose, the tree has the same depth
1085                  * everywhere, so we just pick the first one.
1086                  */
1087                 itup = (IndexTuple) PageGetItem(page,
1088                                                                          PageGetItemId(page, FirstOffsetNumber));
1089                 blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
1090                 UnlockReleaseBuffer(buffer);
1091
1092                 /*
1093                  * We're going down on the tree. It means that there is yet one more
1094                  * level is the tree.
1095                  */
1096                 maxLevel++;
1097         }
1098         return maxLevel;
1099 }