/*
 * Source: src/backend/access/gist/gistbuild.c (PostgreSQL)
 * (gitweb page-header residue removed from extracted copy)
 */
1 /*-------------------------------------------------------------------------
2  *
3  * gistbuild.c
4  *        build algorithm for GiST indexes implementation.
5  *
6  *
7  * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * IDENTIFICATION
11  *        src/backend/access/gist/gistbuild.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16
17 #include <math.h>
18
19 #include "access/genam.h"
20 #include "access/gist_private.h"
21 #include "catalog/index.h"
22 #include "miscadmin.h"
23 #include "optimizer/cost.h"
24 #include "storage/bufmgr.h"
25 #include "storage/smgr.h"
26 #include "utils/memutils.h"
27 #include "utils/rel.h"
28
29 /* Step of index tuples for check whether to switch to buffering build mode */
30 #define BUFFERING_MODE_SWITCH_CHECK_STEP 256
31
32 /*
33  * Number of tuples to process in the slow way before switching to buffering
34  * mode, when buffering is explicitly turned on. Also, the number of tuples
35  * to process between readjusting the buffer size parameter, while in
36  * buffering mode.
37  */
38 #define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096
39
/*
 * Buffering state of an index build.  The build starts in DISABLED, AUTO or
 * STATS mode, depending on the "buffering" reloption, and may later move to
 * ACTIVE via gistInitBuffering().
 */
typedef enum
{
	GIST_BUFFERING_DISABLED,	/* in regular build mode and aren't going to
								 * switch */
	GIST_BUFFERING_AUTO,		/* in regular build mode, but will switch to
								 * buffering build mode if the index grows too
								 * big */
	GIST_BUFFERING_STATS,		/* gathering statistics of index tuple size
								 * before switching to the buffering build
								 * mode */
	GIST_BUFFERING_ACTIVE		/* in buffering build mode */
} GistBufferingMode;
52
/* Working state for gistbuild and its callback */
typedef struct
{
	Relation	indexrel;		/* index relation being built */
	GISTSTATE  *giststate;		/* GiST support info; owns tempCxt */

	int64		indtuples;		/* number of tuples indexed */
	int64		indtuplesSize;	/* total size of all indexed tuples */

	Size		freespace;		/* amount of free space to leave on pages */

	/*
	 * Extra data structures used during a buffering build. 'gfbb' contains
	 * information related to managing the build buffers. 'parentMap' is a
	 * lookup table of the parent of each internal page.
	 */
	GISTBuildBuffers *gfbb;
	HTAB	   *parentMap;

	GistBufferingMode bufferingMode;	/* current buffering state */
} GISTBuildState;
74
75 /* prototypes for private functions */
76 static void gistInitBuffering(GISTBuildState *buildstate);
77 static int      calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep);
78 static void gistBuildCallback(Relation index,
79                                   HeapTuple htup,
80                                   Datum *values,
81                                   bool *isnull,
82                                   bool tupleIsAlive,
83                                   void *state);
84 static void gistBufferingBuildInsert(GISTBuildState *buildstate,
85                                                  IndexTuple itup);
86 static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
87                                 BlockNumber startblkno, int startlevel);
88 static void gistbufferinginserttuples(GISTBuildState *buildstate,
89                                                   Buffer buffer, int level,
90                                                   IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
91                                                   BlockNumber parentblk, OffsetNumber downlinkoffnum);
92 static Buffer gistBufferingFindCorrectParent(GISTBuildState *buildstate,
93                                                            BlockNumber childblkno, int level,
94                                                            BlockNumber *parentblk,
95                                                            OffsetNumber *downlinkoffnum);
96 static void gistProcessEmptyingQueue(GISTBuildState *buildstate);
97 static void gistEmptyAllBuffers(GISTBuildState *buildstate);
98 static int      gistGetMaxLevel(Relation index);
99
100 static void gistInitParentMap(GISTBuildState *buildstate);
101 static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child,
102                                    BlockNumber parent);
103 static void gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parent);
104 static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child);
105
106 /*
107  * Main entry point to GiST index build. Initially calls insert over and over,
108  * but switches to more efficient buffering build algorithm after a certain
109  * number of tuples (unless buffering mode is disabled).
110  */
111 Datum
112 gistbuild(PG_FUNCTION_ARGS)
113 {
114         Relation        heap = (Relation) PG_GETARG_POINTER(0);
115         Relation        index = (Relation) PG_GETARG_POINTER(1);
116         IndexInfo  *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
117         IndexBuildResult *result;
118         double          reltuples;
119         GISTBuildState buildstate;
120         Buffer          buffer;
121         Page            page;
122         MemoryContext oldcxt = CurrentMemoryContext;
123         int                     fillfactor;
124
125         buildstate.indexrel = index;
126         if (index->rd_options)
127         {
128                 /* Get buffering mode from the options string */
129                 GiSTOptions *options = (GiSTOptions *) index->rd_options;
130                 char       *bufferingMode = (char *) options + options->bufferingModeOffset;
131
132                 if (strcmp(bufferingMode, "on") == 0)
133                         buildstate.bufferingMode = GIST_BUFFERING_STATS;
134                 else if (strcmp(bufferingMode, "off") == 0)
135                         buildstate.bufferingMode = GIST_BUFFERING_DISABLED;
136                 else
137                         buildstate.bufferingMode = GIST_BUFFERING_AUTO;
138
139                 fillfactor = options->fillfactor;
140         }
141         else
142         {
143                 /*
144                  * By default, switch to buffering mode when the index grows too large
145                  * to fit in cache.
146                  */
147                 buildstate.bufferingMode = GIST_BUFFERING_AUTO;
148                 fillfactor = GIST_DEFAULT_FILLFACTOR;
149         }
150         /* Calculate target amount of free space to leave on pages */
151         buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100;
152
153         /*
154          * We expect to be called exactly once for any index relation. If that's
155          * not the case, big trouble's what we have.
156          */
157         if (RelationGetNumberOfBlocks(index) != 0)
158                 elog(ERROR, "index \"%s\" already contains data",
159                          RelationGetRelationName(index));
160
161         /*
162          * We can't yet handle unlogged GiST indexes, because we depend on LSNs.
163          * This is duplicative of an error in gistbuildempty, but we want to check
164          * here so as to throw error before doing all the index-build work.
165          */
166         if (heap->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED)
167                 ereport(ERROR,
168                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
169                                  errmsg("unlogged GiST indexes are not supported")));
170
171         /* no locking is needed */
172         buildstate.giststate = initGISTstate(index);
173
174         /*
175          * Create a temporary memory context that is reset once for each tuple
176          * processed.  (Note: we don't bother to make this a child of the
177          * giststate's scanCxt, so we have to delete it separately at the end.)
178          */
179         buildstate.giststate->tempCxt = createTempGistContext();
180
181         /* initialize the root page */
182         buffer = gistNewBuffer(index);
183         Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
184         page = BufferGetPage(buffer);
185
186         START_CRIT_SECTION();
187
188         GISTInitBuffer(buffer, F_LEAF);
189
190         MarkBufferDirty(buffer);
191
192         if (RelationNeedsWAL(index))
193         {
194                 XLogRecPtr      recptr;
195                 XLogRecData rdata;
196
197                 rdata.data = (char *) &(index->rd_node);
198                 rdata.len = sizeof(RelFileNode);
199                 rdata.buffer = InvalidBuffer;
200                 rdata.next = NULL;
201
202                 recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX, &rdata);
203                 PageSetLSN(page, recptr);
204                 PageSetTLI(page, ThisTimeLineID);
205         }
206         else
207                 PageSetLSN(page, GetXLogRecPtrForTemp());
208
209         UnlockReleaseBuffer(buffer);
210
211         END_CRIT_SECTION();
212
213         /* build the index */
214         buildstate.indtuples = 0;
215         buildstate.indtuplesSize = 0;
216
217         /*
218          * Do the heap scan.
219          */
220         reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
221                                                                    gistBuildCallback, (void *) &buildstate);
222
223         /*
224          * If buffering was used, flush out all the tuples that are still in the
225          * buffers.
226          */
227         if (buildstate.bufferingMode == GIST_BUFFERING_ACTIVE)
228         {
229                 elog(DEBUG1, "all tuples processed, emptying buffers");
230                 gistEmptyAllBuffers(&buildstate);
231                 gistFreeBuildBuffers(buildstate.gfbb);
232         }
233
234         /* okay, all heap tuples are indexed */
235         MemoryContextSwitchTo(oldcxt);
236         MemoryContextDelete(buildstate.giststate->tempCxt);
237
238         freeGISTstate(buildstate.giststate);
239
240         /*
241          * Return statistics
242          */
243         result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
244
245         result->heap_tuples = reltuples;
246         result->index_tuples = (double) buildstate.indtuples;
247
248         PG_RETURN_POINTER(result);
249 }
250
251 /*
252  * Validator for "buffering" reloption on GiST indexes. Allows "on", "off"
253  * and "auto" values.
254  */
255 void
256 gistValidateBufferingOption(char *value)
257 {
258         if (value == NULL ||
259                 (strcmp(value, "on") != 0 &&
260                  strcmp(value, "off") != 0 &&
261                  strcmp(value, "auto") != 0))
262         {
263                 ereport(ERROR,
264                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
265                                  errmsg("invalid value for \"buffering\" option"),
266                           errdetail("Valid values are \"on\", \"off\", and \"auto\".")));
267         }
268 }
269
270 /*
271  * Attempt to switch to buffering mode.
272  *
273  * If there is not enough memory for buffering build, sets bufferingMode
274  * to GIST_BUFFERING_DISABLED, so that we don't bother to try the switch
275  * anymore. Otherwise initializes the build buffers, and sets bufferingMode to
276  * GIST_BUFFERING_ACTIVE.
277  */
278 static void
279 gistInitBuffering(GISTBuildState *buildstate)
280 {
281         Relation        index = buildstate->indexrel;
282         int                     pagesPerBuffer;
283         Size            pageFreeSpace;
284         Size            itupAvgSize,
285                                 itupMinSize;
286         double          avgIndexTuplesPerPage,
287                                 maxIndexTuplesPerPage;
288         int                     i;
289         int                     levelStep;
290
291         /* Calc space of index page which is available for index tuples */
292         pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
293                 - sizeof(ItemIdData)
294                 - buildstate->freespace;
295
296         /*
297          * Calculate average size of already inserted index tuples using gathered
298          * statistics.
299          */
300         itupAvgSize = (double) buildstate->indtuplesSize /
301                 (double) buildstate->indtuples;
302
303         /*
304          * Calculate minimal possible size of index tuple by index metadata.
305          * Minimal possible size of varlena is VARHDRSZ.
306          *
307          * XXX: that's not actually true, as a short varlen can be just 2 bytes.
308          * And we should take padding into account here.
309          */
310         itupMinSize = (Size) MAXALIGN(sizeof(IndexTupleData));
311         for (i = 0; i < index->rd_att->natts; i++)
312         {
313                 if (index->rd_att->attrs[i]->attlen < 0)
314                         itupMinSize += VARHDRSZ;
315                 else
316                         itupMinSize += index->rd_att->attrs[i]->attlen;
317         }
318
319         /* Calculate average and maximal number of index tuples which fit to page */
320         avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
321         maxIndexTuplesPerPage = pageFreeSpace / itupMinSize;
322
323         /*
324          * We need to calculate two parameters for the buffering algorithm:
325          * levelStep and pagesPerBuffer.
326          *
327          * levelStep determines the size of subtree that we operate on, while
328          * emptying a buffer. A higher value is better, as you need fewer buffer
329          * emptying steps to build the index. However, if you set it too high, the
330          * subtree doesn't fit in cache anymore, and you quickly lose the benefit
331          * of the buffers.
332          *
333          * In Arge et al's paper, levelStep is chosen as logB(M/4B), where B is
334          * the number of tuples on page (ie. fanout), and M is the amount of
335          * internal memory available. Curiously, they doesn't explain *why* that
336          * setting is optimal. We calculate it by taking the highest levelStep so
337          * that a subtree still fits in cache. For a small B, our way of
338          * calculating levelStep is very close to Arge et al's formula. For a
339          * large B, our formula gives a value that is 2x higher.
340          *
341          * The average size (in pages) of a subtree of depth n can be calculated
342          * as a geometric series:
343          *
344          * B^0 + B^1 + B^2 + ... + B^n = (1 - B^(n + 1)) / (1 - B)
345          *
346          * where B is the average number of index tuples on page. The subtree is
347          * cached in the shared buffer cache and the OS cache, so we choose
348          * levelStep so that the subtree size is comfortably smaller than
349          * effective_cache_size, with a safety factor of 4.
350          *
351          * The estimate on the average number of index tuples on page is based on
352          * average tuple sizes observed before switching to buffered build, so the
353          * real subtree size can be somewhat larger. Also, it would selfish to
354          * gobble the whole cache for our index build. The safety factor of 4
355          * should account for those effects.
356          *
357          * The other limiting factor for setting levelStep is that while
358          * processing a subtree, we need to hold one page for each buffer at the
359          * next lower buffered level. The max. number of buffers needed for that
360          * is maxIndexTuplesPerPage^levelStep. This is very conservative, but
361          * hopefully maintenance_work_mem is set high enough that you're
362          * constrained by effective_cache_size rather than maintenance_work_mem.
363          *
364          * XXX: the buffer hash table consumes a fair amount of memory too per
365          * buffer, but that is not currently taken into account. That scales on
366          * the total number of buffers used, ie. the index size and on levelStep.
367          * Note that a higher levelStep *reduces* the amount of memory needed for
368          * the hash table.
369          */
370         levelStep = 1;
371         for (;;)
372         {
373                 double          subtreesize;
374                 double          maxlowestlevelpages;
375
376                 /* size of an average subtree at this levelStep (in pages). */
377                 subtreesize =
378                         (1 - pow(avgIndexTuplesPerPage, (double) (levelStep + 1))) /
379                         (1 - avgIndexTuplesPerPage);
380
381                 /* max number of pages at the lowest level of a subtree */
382                 maxlowestlevelpages = pow(maxIndexTuplesPerPage, (double) levelStep);
383
384                 /* subtree must fit in cache (with safety factor of 4) */
385                 if (subtreesize > effective_cache_size / 4)
386                         break;
387
388                 /* each node in the lowest level of a subtree has one page in memory */
389                 if (maxlowestlevelpages > ((double) maintenance_work_mem * 1024) / BLCKSZ)
390                         break;
391
392                 /* Good, we can handle this levelStep. See if we can go one higher. */
393                 levelStep++;
394         }
395
396         /*
397          * We just reached an unacceptable value of levelStep in previous loop.
398          * So, decrease levelStep to get last acceptable value.
399          */
400         levelStep--;
401
402         /*
403          * If there's not enough cache or maintenance_work_mem, fall back to plain
404          * inserts.
405          */
406         if (levelStep <= 0)
407         {
408                 elog(DEBUG1, "failed to switch to buffered GiST build");
409                 buildstate->bufferingMode = GIST_BUFFERING_DISABLED;
410                 return;
411         }
412
413         /*
414          * The second parameter to set is pagesPerBuffer, which determines the
415          * size of each buffer. We adjust pagesPerBuffer also during the build,
416          * which is why this calculation is in a separate function.
417          */
418         pagesPerBuffer = calculatePagesPerBuffer(buildstate, levelStep);
419
420         /* Initialize GISTBuildBuffers with these parameters */
421         buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep,
422                                                                                         gistGetMaxLevel(index));
423
424         gistInitParentMap(buildstate);
425
426         buildstate->bufferingMode = GIST_BUFFERING_ACTIVE;
427
428         elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d",
429                  levelStep, pagesPerBuffer);
430 }
431
432 /*
433  * Calculate pagesPerBuffer parameter for the buffering algorithm.
434  *
435  * Buffer size is chosen so that assuming that tuples are distributed
436  * randomly, emptying half a buffer fills on average one page in every buffer
437  * at the next lower level.
438  */
439 static int
440 calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep)
441 {
442         double          pagesPerBuffer;
443         double          avgIndexTuplesPerPage;
444         double          itupAvgSize;
445         Size            pageFreeSpace;
446
447         /* Calc space of index page which is available for index tuples */
448         pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
449                 - sizeof(ItemIdData)
450                 - buildstate->freespace;
451
452         /*
453          * Calculate average size of already inserted index tuples using gathered
454          * statistics.
455          */
456         itupAvgSize = (double) buildstate->indtuplesSize /
457                 (double) buildstate->indtuples;
458
459         avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
460
461         /*
462          * Recalculate required size of buffers.
463          */
464         pagesPerBuffer = 2 * pow(avgIndexTuplesPerPage, levelStep);
465
466         return (int) rint(pagesPerBuffer);
467 }
468
469 /*
470  * Per-tuple callback from IndexBuildHeapScan.
471  */
472 static void
473 gistBuildCallback(Relation index,
474                                   HeapTuple htup,
475                                   Datum *values,
476                                   bool *isnull,
477                                   bool tupleIsAlive,
478                                   void *state)
479 {
480         GISTBuildState *buildstate = (GISTBuildState *) state;
481         IndexTuple      itup;
482         MemoryContext oldCtx;
483
484         oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
485
486         /* form an index tuple and point it at the heap tuple */
487         itup = gistFormTuple(buildstate->giststate, index, values, isnull, true);
488         itup->t_tid = htup->t_self;
489
490         if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE)
491         {
492                 /* We have buffers, so use them. */
493                 gistBufferingBuildInsert(buildstate, itup);
494         }
495         else
496         {
497                 /*
498                  * There's no buffers (yet). Since we already have the index relation
499                  * locked, we call gistdoinsert directly.
500                  */
501                 gistdoinsert(index, itup, buildstate->freespace,
502                                          buildstate->giststate);
503         }
504
505         /* Update tuple count and total size. */
506         buildstate->indtuples += 1;
507         buildstate->indtuplesSize += IndexTupleSize(itup);
508
509         MemoryContextSwitchTo(oldCtx);
510         MemoryContextReset(buildstate->giststate->tempCxt);
511
512         if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE &&
513                 buildstate->indtuples % BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET == 0)
514         {
515                 /* Adjust the target buffer size now */
516                 buildstate->gfbb->pagesPerBuffer =
517                         calculatePagesPerBuffer(buildstate, buildstate->gfbb->levelStep);
518         }
519
520         /*
521          * In 'auto' mode, check if the index has grown too large to fit in cache,
522          * and switch to buffering mode if it has.
523          *
524          * To avoid excessive calls to smgrnblocks(), only check this every
525          * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples
526          */
527         if ((buildstate->bufferingMode == GIST_BUFFERING_AUTO &&
528                  buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 &&
529                  effective_cache_size < smgrnblocks(index->rd_smgr, MAIN_FORKNUM)) ||
530                 (buildstate->bufferingMode == GIST_BUFFERING_STATS &&
531                  buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET))
532         {
533                 /*
534                  * Index doesn't fit in effective cache anymore. Try to switch to
535                  * buffering build mode.
536                  */
537                 gistInitBuffering(buildstate);
538         }
539 }
540
541 /*
542  * Insert function for buffering index build.
543  */
544 static void
545 gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup)
546 {
547         /* Insert the tuple to buffers. */
548         gistProcessItup(buildstate, itup, 0, buildstate->gfbb->rootlevel);
549
550         /* If we filled up (half of a) buffer, process buffer emptying. */
551         gistProcessEmptyingQueue(buildstate);
552 }
553
554 /*
555  * Process an index tuple. Runs the tuple down the tree until we reach a leaf
556  * page or node buffer, and inserts the tuple there. Returns true if we have
557  * to stop buffer emptying process (because one of child buffers can't take
558  * index tuples anymore).
559  */
560 static bool
561 gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
562                                 BlockNumber startblkno, int startlevel)
563 {
564         GISTSTATE  *giststate = buildstate->giststate;
565         GISTBuildBuffers *gfbb = buildstate->gfbb;
566         Relation        indexrel = buildstate->indexrel;
567         BlockNumber childblkno;
568         Buffer          buffer;
569         bool            result = false;
570         BlockNumber blkno;
571         int                     level;
572         OffsetNumber downlinkoffnum = InvalidOffsetNumber;
573         BlockNumber parentblkno = InvalidBlockNumber;
574
575         CHECK_FOR_INTERRUPTS();
576
577         /*
578          * Loop until we reach a leaf page (level == 0) or a level with buffers
579          * (not including the level we start at, because we would otherwise make
580          * no progress).
581          */
582         blkno = startblkno;
583         level = startlevel;
584         for (;;)
585         {
586                 ItemId          iid;
587                 IndexTuple      idxtuple,
588                                         newtup;
589                 Page            page;
590                 OffsetNumber childoffnum;
591
592                 /* Have we reached a level with buffers? */
593                 if (LEVEL_HAS_BUFFERS(level, gfbb) && level != startlevel)
594                         break;
595
596                 /* Have we reached a leaf page? */
597                 if (level == 0)
598                         break;
599
600                 /*
601                  * Nope. Descend down to the next level then. Choose a child to
602                  * descend down to.
603                  */
604
605                 buffer = ReadBuffer(indexrel, blkno);
606                 LockBuffer(buffer, GIST_EXCLUSIVE);
607
608                 page = (Page) BufferGetPage(buffer);
609                 childoffnum = gistchoose(indexrel, page, itup, giststate);
610                 iid = PageGetItemId(page, childoffnum);
611                 idxtuple = (IndexTuple) PageGetItem(page, iid);
612                 childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
613
614                 if (level > 1)
615                         gistMemorizeParent(buildstate, childblkno, blkno);
616
617                 /*
618                  * Check that the key representing the target child node is consistent
619                  * with the key we're inserting. Update it if it's not.
620                  */
621                 newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate);
622                 if (newtup)
623                 {
624                         gistbufferinginserttuples(buildstate, buffer, level,
625                                                                           &newtup, 1, childoffnum,
626                                                                         InvalidBlockNumber, InvalidOffsetNumber);
627                         /* gistbufferinginserttuples() released the buffer */
628                 }
629                 else
630                         UnlockReleaseBuffer(buffer);
631
632                 /* Descend to the child */
633                 parentblkno = blkno;
634                 blkno = childblkno;
635                 downlinkoffnum = childoffnum;
636                 Assert(level > 0);
637                 level--;
638         }
639
640         if (LEVEL_HAS_BUFFERS(level, gfbb))
641         {
642                 /*
643                  * We've reached level with buffers. Place the index tuple to the
644                  * buffer, and add the buffer to the emptying queue if it overflows.
645                  */
646                 GISTNodeBuffer *childNodeBuffer;
647
648                 /* Find the buffer or create a new one */
649                 childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, blkno, level);
650
651                 /* Add index tuple to it */
652                 gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup);
653
654                 if (BUFFER_OVERFLOWED(childNodeBuffer, gfbb))
655                         result = true;
656         }
657         else
658         {
659                 /*
660                  * We've reached a leaf page. Place the tuple here.
661                  */
662                 Assert(level == 0);
663                 buffer = ReadBuffer(indexrel, blkno);
664                 LockBuffer(buffer, GIST_EXCLUSIVE);
665                 gistbufferinginserttuples(buildstate, buffer, level,
666                                                                   &itup, 1, InvalidOffsetNumber,
667                                                                   parentblkno, downlinkoffnum);
668                 /* gistbufferinginserttuples() released the buffer */
669         }
670
671         return result;
672 }
673
674 /*
675  * Insert tuples to a given page.
676  *
677  * This is analogous with gistinserttuples() in the regular insertion code.
678  *
679  * Caller should hold a lock on 'buffer' on entry. This function will unlock
680  * and unpin it.
681  */
682 static void
683 gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level,
684                                                   IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
685                                                   BlockNumber parentblk, OffsetNumber downlinkoffnum)
686 {
687         GISTBuildBuffers *gfbb = buildstate->gfbb;
688         List       *splitinfo;
689         bool            is_split;
690
691         is_split = gistplacetopage(buildstate->indexrel,
692                                                            buildstate->freespace,
693                                                            buildstate->giststate,
694                                                            buffer,
695                                                            itup, ntup, oldoffnum,
696                                                            InvalidBuffer,
697                                                            &splitinfo,
698                                                            false);
699
700         /*
701          * If this is a root split, update the root path item kept in memory. This
702          * ensures that all path stacks are always complete, including all parent
703          * nodes up to the root. That simplifies the algorithm to re-find correct
704          * parent.
705          */
706         if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO)
707         {
708                 Page            page = BufferGetPage(buffer);
709                 OffsetNumber off;
710                 OffsetNumber maxoff;
711
712                 Assert(level == gfbb->rootlevel);
713                 gfbb->rootlevel++;
714
715                 elog(DEBUG2, "splitting GiST root page, now %d levels deep", gfbb->rootlevel);
716
717                 /*
718                  * All the downlinks on the old root page are now on one of the child
719                  * pages. Visit all the new child pages to memorize the parents of the
720                  * grandchildren.
721                  */
722                 if (gfbb->rootlevel > 1)
723                 {
724                         maxoff = PageGetMaxOffsetNumber(page);
725                         for (off = FirstOffsetNumber; off <= maxoff; off++)
726                         {
727                                 ItemId          iid = PageGetItemId(page, off);
728                                 IndexTuple      idxtuple = (IndexTuple) PageGetItem(page, iid);
729                                 BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
730                                 Buffer          childbuf = ReadBuffer(buildstate->indexrel, childblkno);
731
732                                 LockBuffer(childbuf, GIST_SHARE);
733                                 gistMemorizeAllDownlinks(buildstate, childbuf);
734                                 UnlockReleaseBuffer(childbuf);
735
736                                 /*
737                                  * Also remember that the parent of the new child page is the
738                                  * root block.
739                                  */
740                                 gistMemorizeParent(buildstate, childblkno, GIST_ROOT_BLKNO);
741                         }
742                 }
743         }
744
745         if (splitinfo)
746         {
747                 /*
748                  * Insert the downlinks to the parent. This is analogous with
749                  * gistfinishsplit() in the regular insertion code, but the locking is
750                  * simpler, and we have to maintain the buffers on internal nodes and
751                  * the parent map.
752                  */
753                 IndexTuple *downlinks;
754                 int                     ndownlinks,
755                                         i;
756                 Buffer          parentBuffer;
757                 ListCell   *lc;
758
759                 /* Parent may have changed since we memorized this path. */
760                 parentBuffer =
761                         gistBufferingFindCorrectParent(buildstate,
762                                                                                    BufferGetBlockNumber(buffer),
763                                                                                    level,
764                                                                                    &parentblk,
765                                                                                    &downlinkoffnum);
766
767                 /*
768                  * If there's a buffer associated with this page, that needs to be
769                  * split too. gistRelocateBuildBuffersOnSplit() will also adjust the
770                  * downlinks in 'splitinfo', to make sure they're consistent not only
771                  * with the tuples already on the pages, but also the tuples in the
772                  * buffers that will eventually be inserted to them.
773                  */
774                 gistRelocateBuildBuffersOnSplit(gfbb,
775                                                                                 buildstate->giststate,
776                                                                                 buildstate->indexrel,
777                                                                                 level,
778                                                                                 buffer, splitinfo);
779
780                 /* Create an array of all the downlink tuples */
781                 ndownlinks = list_length(splitinfo);
782                 downlinks = (IndexTuple *) palloc(sizeof(IndexTuple) * ndownlinks);
783                 i = 0;
784                 foreach(lc, splitinfo)
785                 {
786                         GISTPageSplitInfo *splitinfo = lfirst(lc);
787
788                         /*
789                          * Remember the parent of each new child page in our parent map.
790                          * This assumes that the downlinks fit on the parent page. If the
791                          * parent page is split, too, when we recurse up to insert the
792                          * downlinks, the recursive gistbufferinginserttuples() call will
793                          * update the map again.
794                          */
795                         if (level > 0)
796                                 gistMemorizeParent(buildstate,
797                                                                    BufferGetBlockNumber(splitinfo->buf),
798                                                                    BufferGetBlockNumber(parentBuffer));
799
800                         /*
801                          * Also update the parent map for all the downlinks that got moved
802                          * to a different page. (actually this also loops through the
803                          * downlinks that stayed on the original page, but it does no
804                          * harm).
805                          */
806                         if (level > 1)
807                                 gistMemorizeAllDownlinks(buildstate, splitinfo->buf);
808
809                         /*
810                          * Since there's no concurrent access, we can release the lower
811                          * level buffers immediately. This includes the original page.
812                          */
813                         UnlockReleaseBuffer(splitinfo->buf);
814                         downlinks[i++] = splitinfo->downlink;
815                 }
816
817                 /* Insert them into parent. */
818                 gistbufferinginserttuples(buildstate, parentBuffer, level + 1,
819                                                                   downlinks, ndownlinks, downlinkoffnum,
820                                                                   InvalidBlockNumber, InvalidOffsetNumber);
821
822                 list_free_deep(splitinfo);              /* we don't need this anymore */
823         }
824         else
825                 UnlockReleaseBuffer(buffer);
826 }
827
828 /*
829  * Find the downlink pointing to a child page.
830  *
831  * 'childblkno' indicates the child page to find the parent for. 'level' is
832  * the level of the child. On entry, *parentblkno and *downlinkoffnum can
833  * point to a location where the downlink used to be - we will check that
834  * location first, and save some cycles if it hasn't moved. The function
835  * returns a buffer containing the downlink, exclusively-locked, and
836  * *parentblkno and *downlinkoffnum are set to the real location of the
837  * downlink.
838  *
839  * If the child page is a leaf (level == 0), the caller must supply a correct
840  * parentblkno. Otherwise we use the parent map hash table to find the parent
841  * block.
842  *
843  * This function serves the same purpose as gistFindCorrectParent() during
844  * normal index inserts, but this is simpler because we don't need to deal
845  * with concurrent inserts.
846  */
847 static Buffer
848 gistBufferingFindCorrectParent(GISTBuildState *buildstate,
849                                                            BlockNumber childblkno, int level,
850                                                            BlockNumber *parentblkno,
851                                                            OffsetNumber *downlinkoffnum)
852 {
853         BlockNumber parent;
854         Buffer          buffer;
855         Page            page;
856         OffsetNumber maxoff;
857         OffsetNumber off;
858
859         if (level > 0)
860                 parent = gistGetParent(buildstate, childblkno);
861         else
862         {
863                 /*
864                  * For a leaf page, the caller must supply a correct parent block
865                  * number.
866                  */
867                 if (*parentblkno == InvalidBlockNumber)
868                         elog(ERROR, "no parent buffer provided of child %d", childblkno);
869                 parent = *parentblkno;
870         }
871
872         buffer = ReadBuffer(buildstate->indexrel, parent);
873         page = BufferGetPage(buffer);
874         LockBuffer(buffer, GIST_EXCLUSIVE);
875         gistcheckpage(buildstate->indexrel, buffer);
876         maxoff = PageGetMaxOffsetNumber(page);
877
878         /* Check if it was not moved */
879         if (parent == *parentblkno && *parentblkno != InvalidBlockNumber &&
880                 *downlinkoffnum != InvalidOffsetNumber && *downlinkoffnum <= maxoff)
881         {
882                 ItemId          iid = PageGetItemId(page, *downlinkoffnum);
883                 IndexTuple      idxtuple = (IndexTuple) PageGetItem(page, iid);
884
885                 if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
886                 {
887                         /* Still there */
888                         return buffer;
889                 }
890         }
891
892         /*
893          * Downlink was not at the offset where it used to be. Scan the page to
894          * find it. During normal gist insertions, it might've moved to another
895          * page, to the right, but during a buffering build, we keep track of the
896          * parent of each page in the lookup table so we should always know what
897          * page it's on.
898          */
899         for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off))
900         {
901                 ItemId          iid = PageGetItemId(page, off);
902                 IndexTuple      idxtuple = (IndexTuple) PageGetItem(page, iid);
903
904                 if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
905                 {
906                         /* yes!!, found it */
907                         *downlinkoffnum = off;
908                         return buffer;
909                 }
910         }
911
912         elog(ERROR, "failed to re-find parent for block %u", childblkno);
913         return InvalidBuffer;           /* keep compiler quiet */
914 }
915
916 /*
 * Process the buffer-emptying stack. Emptying one buffer can cause emptying
 * of other buffers. This function iterates until this cascading emptying
 * process is finished, i.e. until the buffer-emptying stack is empty.
920  */
921 static void
922 gistProcessEmptyingQueue(GISTBuildState *buildstate)
923 {
924         GISTBuildBuffers *gfbb = buildstate->gfbb;
925
926         /* Iterate while we have elements in buffers emptying stack. */
927         while (gfbb->bufferEmptyingQueue != NIL)
928         {
929                 GISTNodeBuffer *emptyingNodeBuffer;
930
931                 /* Get node buffer from emptying stack. */
932                 emptyingNodeBuffer = (GISTNodeBuffer *) linitial(gfbb->bufferEmptyingQueue);
933                 gfbb->bufferEmptyingQueue = list_delete_first(gfbb->bufferEmptyingQueue);
934                 emptyingNodeBuffer->queuedForEmptying = false;
935
936                 /*
937                  * We are going to load last pages of buffers where emptying will be
938                  * to. So let's unload any previously loaded buffers.
939                  */
940                 gistUnloadNodeBuffers(gfbb);
941
942                 /*
943                  * Pop tuples from the buffer and run them down to the buffers at
944                  * lower level, or leaf pages. We continue until one of the lower
945                  * level buffers fills up, or this buffer runs empty.
946                  *
947                  * In Arge et al's paper, the buffer emptying is stopped after
948                  * processing 1/2 node buffer worth of tuples, to avoid overfilling
949                  * any of the lower level buffers. However, it's more efficient to
950                  * keep going until one of the lower level buffers actually fills up,
951                  * so that's what we do. This doesn't need to be exact, if a buffer
952                  * overfills by a few tuples, there's no harm done.
953                  */
954                 while (true)
955                 {
956                         IndexTuple      itup;
957
958                         /* Get next index tuple from the buffer */
959                         if (!gistPopItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup))
960                                 break;
961
962                         /*
963                          * Run it down to the underlying node buffer or leaf page.
964                          *
965                          * Note: it's possible that the buffer we're emptying splits as a
966                          * result of this call. If that happens, our emptyingNodeBuffer
967                          * points to the left half of the split. After split, it's very
968                          * likely that the new left buffer is no longer over the half-full
969                          * threshold, but we might as well keep flushing tuples from it
970                          * until we fill a lower-level buffer.
971                          */
972                         if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->nodeBlocknum, emptyingNodeBuffer->level))
973                         {
974                                 /*
975                                  * A lower level buffer filled up. Stop emptying this buffer,
976                                  * to avoid overflowing the lower level buffer.
977                                  */
978                                 break;
979                         }
980
981                         /* Free all the memory allocated during index tuple processing */
982                         MemoryContextReset(buildstate->giststate->tempCxt);
983                 }
984         }
985 }
986
987 /*
988  * Empty all node buffers, from top to bottom. This is done at the end of
989  * index build to flush all remaining tuples to the index.
990  *
991  * Note: This destroys the buffersOnLevels lists, so the buffers should not
992  * be inserted to after this call.
993  */
994 static void
995 gistEmptyAllBuffers(GISTBuildState *buildstate)
996 {
997         GISTBuildBuffers *gfbb = buildstate->gfbb;
998         MemoryContext oldCtx;
999         int                     i;
1000
1001         oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
1002
1003         /*
1004          * Iterate through the levels from top to bottom.
1005          */
1006         for (i = gfbb->buffersOnLevelsLen - 1; i >= 0; i--)
1007         {
1008                 /*
1009                  * Empty all buffers on this level. Note that new buffers can pop up
1010                  * in the list during the processing, as a result of page splits, so a
1011                  * simple walk through the list won't work. We remove buffers from the
1012                  * list when we see them empty; a buffer can't become non-empty once
1013                  * it's been fully emptied.
1014                  */
1015                 while (gfbb->buffersOnLevels[i] != NIL)
1016                 {
1017                         GISTNodeBuffer *nodeBuffer;
1018
1019                         nodeBuffer = (GISTNodeBuffer *) linitial(gfbb->buffersOnLevels[i]);
1020
1021                         if (nodeBuffer->blocksCount != 0)
1022                         {
1023                                 /*
1024                                  * Add this buffer to the emptying queue, and proceed to empty
1025                                  * the queue.
1026                                  */
1027                                 if (!nodeBuffer->queuedForEmptying)
1028                                 {
1029                                         MemoryContextSwitchTo(gfbb->context);
1030                                         nodeBuffer->queuedForEmptying = true;
1031                                         gfbb->bufferEmptyingQueue =
1032                                                 lcons(nodeBuffer, gfbb->bufferEmptyingQueue);
1033                                         MemoryContextSwitchTo(buildstate->giststate->tempCxt);
1034                                 }
1035                                 gistProcessEmptyingQueue(buildstate);
1036                         }
1037                         else
1038                                 gfbb->buffersOnLevels[i] =
1039                                         list_delete_first(gfbb->buffersOnLevels[i]);
1040                 }
1041                 elog(DEBUG2, "emptied all buffers at level %d", i);
1042         }
1043         MemoryContextSwitchTo(oldCtx);
1044 }
1045
1046 /*
1047  * Get the depth of the GiST index.
1048  */
1049 static int
1050 gistGetMaxLevel(Relation index)
1051 {
1052         int                     maxLevel;
1053         BlockNumber blkno;
1054
1055         /*
1056          * Traverse down the tree, starting from the root, until we hit the leaf
1057          * level.
1058          */
1059         maxLevel = 0;
1060         blkno = GIST_ROOT_BLKNO;
1061         while (true)
1062         {
1063                 Buffer          buffer;
1064                 Page            page;
1065                 IndexTuple      itup;
1066
1067                 buffer = ReadBuffer(index, blkno);
1068
1069                 /*
1070                  * There's no concurrent access during index build, so locking is just
1071                  * pro forma.
1072                  */
1073                 LockBuffer(buffer, GIST_SHARE);
1074                 page = (Page) BufferGetPage(buffer);
1075
1076                 if (GistPageIsLeaf(page))
1077                 {
1078                         /* We hit the bottom, so we're done. */
1079                         UnlockReleaseBuffer(buffer);
1080                         break;
1081                 }
1082
1083                 /*
1084                  * Pick the first downlink on the page, and follow it. It doesn't
1085                  * matter which downlink we choose, the tree has the same depth
1086                  * everywhere, so we just pick the first one.
1087                  */
1088                 itup = (IndexTuple) PageGetItem(page,
1089                                                                          PageGetItemId(page, FirstOffsetNumber));
1090                 blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
1091                 UnlockReleaseBuffer(buffer);
1092
1093                 /*
1094                  * We're going down on the tree. It means that there is yet one more
1095                  * level in the tree.
1096                  */
1097                 maxLevel++;
1098         }
1099         return maxLevel;
1100 }
1101
1102
1103 /*
1104  * Routines for managing the parent map.
1105  *
1106  * Whenever a page is split, we need to insert the downlinks into the parent.
1107  * We need to somehow find the parent page to do that. In normal insertions,
1108  * we keep a stack of nodes visited when we descend the tree. However, in
1109  * buffering build, we can start descending the tree from any internal node,
1110  * when we empty a buffer by cascading tuples to its children. So we don't
1111  * have a full stack up to the root available at that time.
1112  *
1113  * So instead, we maintain a hash table to track the parent of every internal
1114  * page. We don't need to track the parents of leaf nodes, however. Whenever
1115  * we insert to a leaf, we've just descended down from its parent, so we know
1116  * its immediate parent already. This helps a lot to limit the memory used
1117  * by this hash table.
1118  *
 * Whenever an internal node is split, the parent map needs to be updated:
 * the parent of the new child page needs to be recorded, and the entries
 * for all pages whose downlinks are moved to a new page at the split need
 * to be updated as well.
1123  *
1124  * We also update the parent map whenever we descend the tree. That might seem
1125  * unnecessary, because we maintain the map whenever a downlink is moved or
1126  * created, but it is needed because we switch to buffering mode after
1127  * creating a tree with regular index inserts. Any pages created before
1128  * switching to buffering mode will not be present in the parent map initially,
1129  * but will be added there the first time we visit them.
1130  */
1131
/* One entry in the child-to-parent lookup table (see comments above). */
typedef struct
{
	BlockNumber childblkno;		/* hash key */
	BlockNumber parentblkno;	/* block of the page holding the downlink */
} ParentMapEntry;
1137
1138 static void
1139 gistInitParentMap(GISTBuildState *buildstate)
1140 {
1141         HASHCTL         hashCtl;
1142
1143         hashCtl.keysize = sizeof(BlockNumber);
1144         hashCtl.entrysize = sizeof(ParentMapEntry);
1145         hashCtl.hcxt = CurrentMemoryContext;
1146         hashCtl.hash = oid_hash;
1147         buildstate->parentMap = hash_create("gistbuild parent map",
1148                                                                                 1024,
1149                                                                                 &hashCtl,
1150                                                                                 HASH_ELEM | HASH_CONTEXT
1151                                                                                 | HASH_FUNCTION);
1152 }
1153
1154 static void
1155 gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, BlockNumber parent)
1156 {
1157         ParentMapEntry *entry;
1158         bool            found;
1159
1160         entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
1161                                                                                    (const void *) &child,
1162                                                                                    HASH_ENTER,
1163                                                                                    &found);
1164         entry->parentblkno = parent;
1165 }
1166
1167 /*
1168  * Scan all downlinks on a page, and memorize their parent.
1169  */
1170 static void
1171 gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parentbuf)
1172 {
1173         OffsetNumber maxoff;
1174         OffsetNumber off;
1175         BlockNumber parentblkno = BufferGetBlockNumber(parentbuf);
1176         Page            page = BufferGetPage(parentbuf);
1177
1178         Assert(!GistPageIsLeaf(page));
1179
1180         maxoff = PageGetMaxOffsetNumber(page);
1181         for (off = FirstOffsetNumber; off <= maxoff; off++)
1182         {
1183                 ItemId          iid = PageGetItemId(page, off);
1184                 IndexTuple      idxtuple = (IndexTuple) PageGetItem(page, iid);
1185                 BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
1186
1187                 gistMemorizeParent(buildstate, childblkno, parentblkno);
1188         }
1189 }
1190
1191 static BlockNumber
1192 gistGetParent(GISTBuildState *buildstate, BlockNumber child)
1193 {
1194         ParentMapEntry *entry;
1195         bool            found;
1196
1197         /* Find node buffer in hash table */
1198         entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
1199                                                                                    (const void *) &child,
1200                                                                                    HASH_FIND,
1201                                                                                    &found);
1202         if (!found)
1203                 elog(ERROR, "could not find parent of block %d in lookup table", child);
1204
1205         return entry->parentblkno;
1206 }