/*
 * Source: src/backend/access/gist/gistbuild.c (PostgreSQL)
 * (gitweb page-header residue removed from extracted copy)
 */
1 /*-------------------------------------------------------------------------
2  *
3  * gistbuild.c
4  *        build algorithm for GiST indexes implementation.
5  *
6  *
7  * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * IDENTIFICATION
11  *        src/backend/access/gist/gistbuild.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16
17 #include <math.h>
18
19 #include "access/genam.h"
20 #include "access/gist_private.h"
21 #include "catalog/index.h"
22 #include "miscadmin.h"
23 #include "optimizer/cost.h"
24 #include "storage/bufmgr.h"
25 #include "storage/smgr.h"
26 #include "utils/memutils.h"
27 #include "utils/rel.h"
28
29 /* Step of index tuples for check whether to switch to buffering build mode */
30 #define BUFFERING_MODE_SWITCH_CHECK_STEP 256
31
32 /*
33  * Number of tuples to process in the slow way before switching to buffering
34  * mode, when buffering is explicitly turned on. Also, the number of tuples
35  * to process between readjusting the buffer size parameter, while in
36  * buffering mode.
37  */
38 #define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096
39
/*
 * Buffering state of an index build.  The build starts in DISABLED, AUTO or
 * STATS mode, depending on the "buffering" reloption, and may later move to
 * ACTIVE via gistInitBuffering().
 */
typedef enum
{
	GIST_BUFFERING_DISABLED,	/* in regular build mode and aren't going to
								 * switch */
	GIST_BUFFERING_AUTO,		/* in regular build mode, but will switch to
								 * buffering build mode if the index grows too
								 * big */
	GIST_BUFFERING_STATS,		/* gathering statistics of index tuple size
								 * before switching to the buffering build
								 * mode */
	GIST_BUFFERING_ACTIVE		/* in buffering build mode */
} GistBufferingMode;
52
/* Working state for gistbuild and its callback */
typedef struct
{
	Relation	indexrel;		/* index relation being built */
	GISTSTATE  *giststate;		/* GiST support info; owns tempCxt */

	int64		indtuples;		/* number of tuples indexed */
	int64		indtuplesSize;	/* total size of all indexed tuples */

	Size		freespace;		/* amount of free space to leave on pages */

	/*
	 * Extra data structures used during a buffering build. 'gfbb' contains
	 * information related to managing the build buffers. 'parentMap' is a
	 * lookup table of the parent of each internal page.
	 */
	GISTBuildBuffers *gfbb;
	HTAB	   *parentMap;

	GistBufferingMode bufferingMode;	/* current buffering state */
} GISTBuildState;
74
75 /* prototypes for private functions */
76 static void gistInitBuffering(GISTBuildState *buildstate);
77 static int      calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep);
78 static void gistBuildCallback(Relation index,
79                                   HeapTuple htup,
80                                   Datum *values,
81                                   bool *isnull,
82                                   bool tupleIsAlive,
83                                   void *state);
84 static void gistBufferingBuildInsert(GISTBuildState *buildstate,
85                                                  IndexTuple itup);
86 static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
87                                 BlockNumber startblkno, int startlevel);
88 static void gistbufferinginserttuples(GISTBuildState *buildstate,
89                                                   Buffer buffer, int level,
90                                                   IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
91                                                   BlockNumber parentblk, OffsetNumber downlinkoffnum);
92 static Buffer gistBufferingFindCorrectParent(GISTBuildState *buildstate,
93                                                            BlockNumber childblkno, int level,
94                                                            BlockNumber *parentblk,
95                                                            OffsetNumber *downlinkoffnum);
96 static void gistProcessEmptyingQueue(GISTBuildState *buildstate);
97 static void gistEmptyAllBuffers(GISTBuildState *buildstate);
98 static int      gistGetMaxLevel(Relation index);
99
100 static void gistInitParentMap(GISTBuildState *buildstate);
101 static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child,
102                                    BlockNumber parent);
103 static void gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parent);
104 static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child);
105
106 /*
107  * Main entry point to GiST index build. Initially calls insert over and over,
108  * but switches to more efficient buffering build algorithm after a certain
109  * number of tuples (unless buffering mode is disabled).
110  */
111 Datum
112 gistbuild(PG_FUNCTION_ARGS)
113 {
114         Relation        heap = (Relation) PG_GETARG_POINTER(0);
115         Relation        index = (Relation) PG_GETARG_POINTER(1);
116         IndexInfo  *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
117         IndexBuildResult *result;
118         double          reltuples;
119         GISTBuildState buildstate;
120         Buffer          buffer;
121         Page            page;
122         MemoryContext oldcxt = CurrentMemoryContext;
123         int                     fillfactor;
124
125         buildstate.indexrel = index;
126         if (index->rd_options)
127         {
128                 /* Get buffering mode from the options string */
129                 GiSTOptions *options = (GiSTOptions *) index->rd_options;
130                 char       *bufferingMode = (char *) options + options->bufferingModeOffset;
131
132                 if (strcmp(bufferingMode, "on") == 0)
133                         buildstate.bufferingMode = GIST_BUFFERING_STATS;
134                 else if (strcmp(bufferingMode, "off") == 0)
135                         buildstate.bufferingMode = GIST_BUFFERING_DISABLED;
136                 else
137                         buildstate.bufferingMode = GIST_BUFFERING_AUTO;
138
139                 fillfactor = options->fillfactor;
140         }
141         else
142         {
143                 /*
144                  * By default, switch to buffering mode when the index grows too large
145                  * to fit in cache.
146                  */
147                 buildstate.bufferingMode = GIST_BUFFERING_AUTO;
148                 fillfactor = GIST_DEFAULT_FILLFACTOR;
149         }
150         /* Calculate target amount of free space to leave on pages */
151         buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100;
152
153         /*
154          * We expect to be called exactly once for any index relation. If that's
155          * not the case, big trouble's what we have.
156          */
157         if (RelationGetNumberOfBlocks(index) != 0)
158                 elog(ERROR, "index \"%s\" already contains data",
159                          RelationGetRelationName(index));
160
161         /*
162          * We can't yet handle unlogged GiST indexes, because we depend on LSNs.
163          * This is duplicative of an error in gistbuildempty, but we want to check
164          * here so as to throw error before doing all the index-build work.
165          */
166         if (heap->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED)
167                 ereport(ERROR,
168                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
169                                  errmsg("unlogged GiST indexes are not supported")));
170
171         /* no locking is needed */
172         buildstate.giststate = initGISTstate(index);
173
174         /*
175          * Create a temporary memory context that is reset once for each tuple
176          * processed.  (Note: we don't bother to make this a child of the
177          * giststate's scanCxt, so we have to delete it separately at the end.)
178          */
179         buildstate.giststate->tempCxt = createTempGistContext();
180
181         /* initialize the root page */
182         buffer = gistNewBuffer(index);
183         Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
184         page = BufferGetPage(buffer);
185
186         START_CRIT_SECTION();
187
188         GISTInitBuffer(buffer, F_LEAF);
189
190         MarkBufferDirty(buffer);
191
192         if (RelationNeedsWAL(index))
193         {
194                 XLogRecPtr      recptr;
195                 XLogRecData rdata;
196
197                 rdata.data = (char *) &(index->rd_node);
198                 rdata.len = sizeof(RelFileNode);
199                 rdata.buffer = InvalidBuffer;
200                 rdata.next = NULL;
201
202                 recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX, &rdata);
203                 PageSetLSN(page, recptr);
204                 PageSetTLI(page, ThisTimeLineID);
205         }
206         else
207                 PageSetLSN(page, GetXLogRecPtrForTemp());
208
209         UnlockReleaseBuffer(buffer);
210
211         END_CRIT_SECTION();
212
213         /* build the index */
214         buildstate.indtuples = 0;
215         buildstate.indtuplesSize = 0;
216
217         /*
218          * Do the heap scan.
219          */
220         reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
221                                                                    gistBuildCallback, (void *) &buildstate);
222
223         /*
224          * If buffering was used, flush out all the tuples that are still in the
225          * buffers.
226          */
227         if (buildstate.bufferingMode == GIST_BUFFERING_ACTIVE)
228         {
229                 elog(DEBUG1, "all tuples processed, emptying buffers");
230                 gistEmptyAllBuffers(&buildstate);
231                 gistFreeBuildBuffers(buildstate.gfbb);
232         }
233
234         /* okay, all heap tuples are indexed */
235         MemoryContextSwitchTo(oldcxt);
236         MemoryContextDelete(buildstate.giststate->tempCxt);
237
238         freeGISTstate(buildstate.giststate);
239
240         /*
241          * Return statistics
242          */
243         result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
244
245         result->heap_tuples = reltuples;
246         result->index_tuples = (double) buildstate.indtuples;
247
248         PG_RETURN_POINTER(result);
249 }
250
251 /*
252  * Validator for "buffering" reloption on GiST indexes. Allows "on", "off"
253  * and "auto" values.
254  */
255 void
256 gistValidateBufferingOption(char *value)
257 {
258         if (value == NULL ||
259                 (strcmp(value, "on") != 0 &&
260                  strcmp(value, "off") != 0 &&
261                  strcmp(value, "auto") != 0))
262         {
263                 ereport(ERROR,
264                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
265                                  errmsg("invalid value for \"buffering\" option"),
266                           errdetail("Valid values are \"on\", \"off\", and \"auto\".")));
267         }
268 }
269
270 /*
271  * Attempt to switch to buffering mode.
272  *
273  * If there is not enough memory for buffering build, sets bufferingMode
274  * to GIST_BUFFERING_DISABLED, so that we don't bother to try the switch
275  * anymore. Otherwise initializes the build buffers, and sets bufferingMode to
276  * GIST_BUFFERING_ACTIVE.
277  */
278 static void
279 gistInitBuffering(GISTBuildState *buildstate)
280 {
281         Relation        index = buildstate->indexrel;
282         int                     pagesPerBuffer;
283         Size            pageFreeSpace;
284         Size            itupAvgSize,
285                                 itupMinSize;
286         double          avgIndexTuplesPerPage,
287                                 maxIndexTuplesPerPage;
288         int                     i;
289         int                     levelStep;
290
291         /* Calc space of index page which is available for index tuples */
292         pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
293                 - sizeof(ItemIdData)
294                 - buildstate->freespace;
295
296         /*
297          * Calculate average size of already inserted index tuples using gathered
298          * statistics.
299          */
300         itupAvgSize = (double) buildstate->indtuplesSize /
301                 (double) buildstate->indtuples;
302
303         /*
304          * Calculate minimal possible size of index tuple by index metadata.
305          * Minimal possible size of varlena is VARHDRSZ.
306          *
307          * XXX: that's not actually true, as a short varlen can be just 2 bytes.
308          * And we should take padding into account here.
309          */
310         itupMinSize = (Size) MAXALIGN(sizeof(IndexTupleData));
311         for (i = 0; i < index->rd_att->natts; i++)
312         {
313                 if (index->rd_att->attrs[i]->attlen < 0)
314                         itupMinSize += VARHDRSZ;
315                 else
316                         itupMinSize += index->rd_att->attrs[i]->attlen;
317         }
318
319         /* Calculate average and maximal number of index tuples which fit to page */
320         avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
321         maxIndexTuplesPerPage = pageFreeSpace / itupMinSize;
322
323         /*
324          * We need to calculate two parameters for the buffering algorithm:
325          * levelStep and pagesPerBuffer.
326          *
327          * levelStep determines the size of subtree that we operate on, while
328          * emptying a buffer. A higher value is better, as you need fewer buffer
329          * emptying steps to build the index. However, if you set it too high, the
330          * subtree doesn't fit in cache anymore, and you quickly lose the benefit
331          * of the buffers.
332          *
333          * In Arge et al's paper, levelStep is chosen as logB(M/4B), where B is
334          * the number of tuples on page (ie. fanout), and M is the amount of
335          * internal memory available. Curiously, they doesn't explain *why* that
336          * setting is optimal. We calculate it by taking the highest levelStep so
337          * that a subtree still fits in cache. For a small B, our way of
338          * calculating levelStep is very close to Arge et al's formula. For a
339          * large B, our formula gives a value that is 2x higher.
340          *
341          * The average size (in pages) of a subtree of depth n can be calculated
342          * as a geometric series:
343          *
344          * B^0 + B^1 + B^2 + ... + B^n = (1 - B^(n + 1)) / (1 - B)
345          *
346          * where B is the average number of index tuples on page. The subtree is
347          * cached in the shared buffer cache and the OS cache, so we choose
348          * levelStep so that the subtree size is comfortably smaller than
349          * effective_cache_size, with a safety factor of 4.
350          *
351          * The estimate on the average number of index tuples on page is based on
352          * average tuple sizes observed before switching to buffered build, so the
353          * real subtree size can be somewhat larger. Also, it would selfish to
354          * gobble the whole cache for our index build. The safety factor of 4
355          * should account for those effects.
356          *
357          * The other limiting factor for setting levelStep is that while
358          * processing a subtree, we need to hold one page for each buffer at the
359          * next lower buffered level. The max. number of buffers needed for that
360          * is maxIndexTuplesPerPage^levelStep. This is very conservative, but
361          * hopefully maintenance_work_mem is set high enough that you're
362          * constrained by effective_cache_size rather than maintenance_work_mem.
363          *
364          * XXX: the buffer hash table consumes a fair amount of memory too per
365          * buffer, but that is not currently taken into account. That scales on
366          * the total number of buffers used, ie. the index size and on levelStep.
367          * Note that a higher levelStep *reduces* the amount of memory needed for
368          * the hash table.
369          */
370         levelStep = 1;
371         for (;;)
372         {
373                 double          subtreesize;
374                 double          maxlowestlevelpages;
375
376                 /* size of an average subtree at this levelStep (in pages). */
377                 subtreesize =
378                         (1 - pow(avgIndexTuplesPerPage, (double) (levelStep + 1))) /
379                         (1 - avgIndexTuplesPerPage);
380
381                 /* max number of pages at the lowest level of a subtree */
382                 maxlowestlevelpages = pow(maxIndexTuplesPerPage, (double) levelStep);
383
384                 /* subtree must fit in cache (with safety factor of 4) */
385                 if (subtreesize > effective_cache_size / 4)
386                         break;
387
388                 /* each node in the lowest level of a subtree has one page in memory */
389                 if (maxlowestlevelpages > ((double) maintenance_work_mem * 1024) / BLCKSZ)
390                         break;
391
392                 /* Good, we can handle this levelStep. See if we can go one higher. */
393                 levelStep++;
394         }
395
396         /*
397          * We just reached an unacceptable value of levelStep in previous loop.
398          * So, decrease levelStep to get last acceptable value.
399          */
400         levelStep--;
401
402         /*
403          * If there's not enough cache or maintenance_work_mem, fall back to plain
404          * inserts.
405          */
406         if (levelStep <= 0)
407         {
408                 elog(DEBUG1, "failed to switch to buffered GiST build");
409                 buildstate->bufferingMode = GIST_BUFFERING_DISABLED;
410                 return;
411         }
412
413         /*
414          * The second parameter to set is pagesPerBuffer, which determines the
415          * size of each buffer. We adjust pagesPerBuffer also during the build,
416          * which is why this calculation is in a separate function.
417          */
418         pagesPerBuffer = calculatePagesPerBuffer(buildstate, levelStep);
419
420         /* Initialize GISTBuildBuffers with these parameters */
421         buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep,
422                                                                                         gistGetMaxLevel(index));
423
424         gistInitParentMap(buildstate);
425
426         buildstate->bufferingMode = GIST_BUFFERING_ACTIVE;
427
428         elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d",
429                  levelStep, pagesPerBuffer);
430 }
431
432 /*
433  * Calculate pagesPerBuffer parameter for the buffering algorithm.
434  *
435  * Buffer size is chosen so that assuming that tuples are distributed
436  * randomly, emptying half a buffer fills on average one page in every buffer
437  * at the next lower level.
438  */
439 static int
440 calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep)
441 {
442         double          pagesPerBuffer;
443         double          avgIndexTuplesPerPage;
444         double          itupAvgSize;
445         Size            pageFreeSpace;
446
447         /* Calc space of index page which is available for index tuples */
448         pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
449                 - sizeof(ItemIdData)
450                 - buildstate->freespace;
451
452         /*
453          * Calculate average size of already inserted index tuples using gathered
454          * statistics.
455          */
456         itupAvgSize = (double) buildstate->indtuplesSize /
457                 (double) buildstate->indtuples;
458
459         avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
460
461         /*
462          * Recalculate required size of buffers.
463          */
464         pagesPerBuffer = 2 * pow(avgIndexTuplesPerPage, levelStep);
465
466         return (int) rint(pagesPerBuffer);
467 }
468
469 /*
470  * Per-tuple callback from IndexBuildHeapScan.
471  */
472 static void
473 gistBuildCallback(Relation index,
474                                   HeapTuple htup,
475                                   Datum *values,
476                                   bool *isnull,
477                                   bool tupleIsAlive,
478                                   void *state)
479 {
480         GISTBuildState *buildstate = (GISTBuildState *) state;
481         IndexTuple      itup;
482         MemoryContext oldCtx;
483
484         oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
485
486         /* form an index tuple and point it at the heap tuple */
487         itup = gistFormTuple(buildstate->giststate, index, values, isnull, true);
488         itup->t_tid = htup->t_self;
489
490         if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE)
491         {
492                 /* We have buffers, so use them. */
493                 gistBufferingBuildInsert(buildstate, itup);
494         }
495         else
496         {
497                 /*
498                  * There's no buffers (yet). Since we already have the index relation
499                  * locked, we call gistdoinsert directly.
500                  */
501                 gistdoinsert(index, itup, buildstate->freespace,
502                                          buildstate->giststate);
503         }
504
505         /* Update tuple count and total size. */
506         buildstate->indtuples += 1;
507         buildstate->indtuplesSize += IndexTupleSize(itup);
508
509         MemoryContextSwitchTo(oldCtx);
510         MemoryContextReset(buildstate->giststate->tempCxt);
511
512         if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE &&
513                 buildstate->indtuples % BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET == 0)
514         {
515                 /* Adjust the target buffer size now */
516                 buildstate->gfbb->pagesPerBuffer =
517                         calculatePagesPerBuffer(buildstate, buildstate->gfbb->levelStep);
518         }
519
520         /*
521          * In 'auto' mode, check if the index has grown too large to fit in cache,
522          * and switch to buffering mode if it has.
523          *
524          * To avoid excessive calls to smgrnblocks(), only check this every
525          * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples
526          */
527         if ((buildstate->bufferingMode == GIST_BUFFERING_AUTO &&
528                  buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 &&
529                  effective_cache_size < smgrnblocks(index->rd_smgr, MAIN_FORKNUM)) ||
530                 (buildstate->bufferingMode == GIST_BUFFERING_STATS &&
531                  buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET))
532         {
533                 /*
534                  * Index doesn't fit in effective cache anymore. Try to switch to
535                  * buffering build mode.
536                  */
537                 gistInitBuffering(buildstate);
538         }
539 }
540
541 /*
542  * Insert function for buffering index build.
543  */
544 static void
545 gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup)
546 {
547         /* Insert the tuple to buffers. */
548         gistProcessItup(buildstate, itup, 0, buildstate->gfbb->rootlevel);
549
550         /* If we filled up (half of a) buffer, process buffer emptying. */
551         gistProcessEmptyingQueue(buildstate);
552 }
553
554 /*
555  * Process an index tuple. Runs the tuple down the tree until we reach a leaf
556  * page or node buffer, and inserts the tuple there. Returns true if we have
557  * to stop buffer emptying process (because one of child buffers can't take
558  * index tuples anymore).
559  */
560 static bool
561 gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
562                                 BlockNumber startblkno, int startlevel)
563 {
564         GISTSTATE  *giststate = buildstate->giststate;
565         GISTBuildBuffers *gfbb = buildstate->gfbb;
566         Relation        indexrel = buildstate->indexrel;
567         BlockNumber childblkno;
568         Buffer          buffer;
569         bool            result = false;
570         BlockNumber blkno;
571         int                     level;
572         OffsetNumber downlinkoffnum = InvalidOffsetNumber;
573         BlockNumber parentblkno = InvalidBlockNumber;
574
575         CHECK_FOR_INTERRUPTS();
576
577         /*
578          * Loop until we reach a leaf page (level == 0) or a level with buffers
579          * (not including the level we start at, because we would otherwise make
580          * no progress).
581          */
582         blkno = startblkno;
583         level = startlevel;
584         for (;;)
585         {
586                 ItemId          iid;
587                 IndexTuple      idxtuple,
588                                         newtup;
589                 Page            page;
590                 OffsetNumber childoffnum;
591
592                 /* Have we reached a level with buffers? */
593                 if (LEVEL_HAS_BUFFERS(level, gfbb) && level != startlevel)
594                         break;
595
596                 /* Have we reached a leaf page? */
597                 if (level == 0)
598                         break;
599
600                 /*
601                  * Nope. Descend down to the next level then. Choose a child to
602                  * descend down to.
603                  */
604
605                 buffer = ReadBuffer(indexrel, blkno);
606                 LockBuffer(buffer, GIST_EXCLUSIVE);
607
608                 page = (Page) BufferGetPage(buffer);
609                 childoffnum = gistchoose(indexrel, page, itup, giststate);
610                 iid = PageGetItemId(page, childoffnum);
611                 idxtuple = (IndexTuple) PageGetItem(page, iid);
612                 childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
613
614                 if (level > 1)
615                         gistMemorizeParent(buildstate, childblkno, blkno);
616
617                 /*
618                  * Check that the key representing the target child node is consistent
619                  * with the key we're inserting. Update it if it's not.
620                  */
621                 newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate);
622                 if (newtup)
623                 {
624                         gistbufferinginserttuples(buildstate, buffer, level,
625                                                                           &newtup, 1, childoffnum,
626                                                                         InvalidBlockNumber, InvalidOffsetNumber);
627                         /* gistbufferinginserttuples() released the buffer */
628                 }
629                 else
630                         UnlockReleaseBuffer(buffer);
631
632                 /* Descend to the child */
633                 parentblkno = blkno;
634                 blkno = childblkno;
635                 downlinkoffnum = childoffnum;
636                 Assert(level > 0);
637                 level--;
638         }
639
640         if (LEVEL_HAS_BUFFERS(level, gfbb))
641         {
642                 /*
643                  * We've reached level with buffers. Place the index tuple to the
644                  * buffer, and add the buffer to the emptying queue if it overflows.
645                  */
646                 GISTNodeBuffer *childNodeBuffer;
647
648                 /* Find the buffer or create a new one */
649                 childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, blkno, level);
650
651                 /* Add index tuple to it */
652                 gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup);
653
654                 if (BUFFER_OVERFLOWED(childNodeBuffer, gfbb))
655                         result = true;
656         }
657         else
658         {
659                 /*
660                  * We've reached a leaf page. Place the tuple here.
661                  */
662                 Assert(level == 0);
663                 buffer = ReadBuffer(indexrel, blkno);
664                 LockBuffer(buffer, GIST_EXCLUSIVE);
665                 gistbufferinginserttuples(buildstate, buffer, level,
666                                                                   &itup, 1, InvalidOffsetNumber,
667                                                                   parentblkno, downlinkoffnum);
668                 /* gistbufferinginserttuples() released the buffer */
669         }
670
671         return result;
672 }
673
674 /*
675  * Insert tuples to a given page.
676  *
677  * This is analogous with gistinserttuples() in the regular insertion code.
678  *
679  * Caller should hold a lock on 'buffer' on entry. This function will unlock
680  * and unpin it.
681  */
682 static void
683 gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level,
684                                                   IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
685                                                   BlockNumber parentblk, OffsetNumber downlinkoffnum)
686 {
687         GISTBuildBuffers *gfbb = buildstate->gfbb;
688         List       *splitinfo;
689         bool            is_split;
690
691         is_split = gistplacetopage(buildstate->indexrel,
692                                                            buildstate->freespace,
693                                                            buildstate->giststate,
694                                                            buffer,
695                                                            itup, ntup, oldoffnum,
696                                                            InvalidBuffer,
697                                                            &splitinfo,
698                                                            false);
699
700         /*
701          * If this is a root split, update the root path item kept in memory. This
702          * ensures that all path stacks are always complete, including all parent
703          * nodes up to the root. That simplifies the algorithm to re-find correct
704          * parent.
705          */
706         if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO)
707         {
708                 Page            page = BufferGetPage(buffer);
709                 OffsetNumber off;
710                 OffsetNumber maxoff;
711
712                 Assert(level == gfbb->rootlevel);
713                 gfbb->rootlevel++;
714
715                 elog(DEBUG2, "splitting GiST root page, now %d levels deep", gfbb->rootlevel);
716
717                 /*
718                  * All the downlinks on the old root page are now on one of the child
719                  * pages. Visit all the new child pages to memorize the parents of the
720                  * grandchildren.
721                  */
722                 if (gfbb->rootlevel > 1)
723                 {
724                         maxoff = PageGetMaxOffsetNumber(page);
725                         for (off = FirstOffsetNumber; off <= maxoff; off++)
726                         {
727                                 ItemId          iid = PageGetItemId(page, off);
728                                 IndexTuple      idxtuple = (IndexTuple) PageGetItem(page, iid);
729                                 BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
730                                 Buffer          childbuf = ReadBuffer(buildstate->indexrel, childblkno);
731
732                                 LockBuffer(childbuf, GIST_SHARE);
733                                 gistMemorizeAllDownlinks(buildstate, childbuf);
734                                 UnlockReleaseBuffer(childbuf);
735
736                                 /*
737                                  * Also remember that the parent of the new child page is the
738                                  * root block.
739                                  */
740                                 gistMemorizeParent(buildstate, childblkno, GIST_ROOT_BLKNO);
741                         }
742                 }
743         }
744
745         if (splitinfo)
746         {
747                 /*
748                  * Insert the downlinks to the parent. This is analogous with
749                  * gistfinishsplit() in the regular insertion code, but the locking is
750                  * simpler, and we have to maintain the buffers on internal nodes and
751                  * the parent map.
752                  */
753                 IndexTuple *downlinks;
754                 int                     ndownlinks,
755                                         i;
756                 Buffer          parentBuffer;
757                 ListCell   *lc;
758
759                 /* Parent may have changed since we memorized this path. */
760                 parentBuffer =
761                         gistBufferingFindCorrectParent(buildstate,
762                                                                                    BufferGetBlockNumber(buffer),
763                                                                                    level,
764                                                                                    &parentblk,
765                                                                                    &downlinkoffnum);
766
767                 /*
768                  * If there's a buffer associated with this page, that needs to be
769                  * split too. gistRelocateBuildBuffersOnSplit() will also adjust the
770                  * downlinks in 'splitinfo', to make sure they're consistent not only
771                  * with the tuples already on the pages, but also the tuples in the
772                  * buffers that will eventually be inserted to them.
773                  */
774                 gistRelocateBuildBuffersOnSplit(gfbb,
775                                                                                 buildstate->giststate,
776                                                                                 buildstate->indexrel,
777                                                                                 level,
778                                                                                 buffer, splitinfo);
779
780                 /* Create an array of all the downlink tuples */
781                 ndownlinks = list_length(splitinfo);
782                 downlinks = (IndexTuple *) palloc(sizeof(IndexTuple) * ndownlinks);
783                 i = 0;
784                 foreach(lc, splitinfo)
785                 {
786                         GISTPageSplitInfo *splitinfo = lfirst(lc);
787
788                         /*
789                          * Remember the parent of each new child page in our parent map.
790                          * This assumes that the downlinks fit on the parent page. If the
791                          * parent page is split, too, when we recurse up to insert the
792                          * downlinks, the recursive gistbufferinginserttuples() call will
793                          * update the map again.
794                          */
795                         if (level > 0)
796                                 gistMemorizeParent(buildstate,
797                                                                    BufferGetBlockNumber(splitinfo->buf),
798                                                                    BufferGetBlockNumber(parentBuffer));
799
800                         /*
801                          * Also update the parent map for all the downlinks that got moved
802                          * to a different page. (actually this also loops through the
803                          * downlinks that stayed on the original page, but it does no
804                          * harm).
805                          */
806                         if (level > 1)
807                                 gistMemorizeAllDownlinks(buildstate, splitinfo->buf);
808
809                         /*
810                          * Since there's no concurrent access, we can release the lower
811                          * level buffers immediately. This includes the original page.
812                          */
813                         UnlockReleaseBuffer(splitinfo->buf);
814                         downlinks[i++] = splitinfo->downlink;
815                 }
816
817                 /* Insert them into parent. */
818                 gistbufferinginserttuples(buildstate, parentBuffer, level + 1,
819                                                                   downlinks, ndownlinks, downlinkoffnum,
820                                                                   InvalidBlockNumber, InvalidOffsetNumber);
821
822                 list_free_deep(splitinfo);              /* we don't need this anymore */
823         }
824         else
825                 UnlockReleaseBuffer(buffer);
826 }
827
828 /*
829  * Find the downlink pointing to a child page.
830  *
831  * 'childblkno' indicates the child page to find the parent for. 'level' is
832  * the level of the child. On entry, *parentblkno and *downlinkoffnum can
833  * point to a location where the downlink used to be - we will check that
834  * location first, and save some cycles if it hasn't moved. The function
835  * returns a buffer containing the downlink, exclusively-locked, and
836  * *parentblkno and *downlinkoffnum are set to the real location of the
837  * downlink.
838  *
839  * If the child page is a leaf (level == 0), the caller must supply a correct
840  * parentblkno. Otherwise we use the parent map hash table to find the parent
841  * block.
842  *
843  * This function serves the same purpose as gistFindCorrectParent() during
844  * normal index inserts, but this is simpler because we don't need to deal
845  * with concurrent inserts.
846  */
847 static Buffer
848 gistBufferingFindCorrectParent(GISTBuildState *buildstate,
849                                                            BlockNumber childblkno, int level,
850                                                            BlockNumber *parentblkno,
851                                                            OffsetNumber *downlinkoffnum)
852 {
853         BlockNumber parent;
854         Buffer          buffer;
855         Page            page;
856         OffsetNumber maxoff;
857         OffsetNumber off;
858
859         if (level > 0)
860                 parent = gistGetParent(buildstate, childblkno);
861         else
862         {
863                 /*
864                  * For a leaf page, the caller must supply a correct parent block
865                  * number.
866                  */
867                 if (*parentblkno == InvalidBlockNumber)
868                         elog(ERROR, "no parent buffer provided of child %d", childblkno);
869                 parent = *parentblkno;
870         }
871
872         buffer = ReadBuffer(buildstate->indexrel, parent);
873         page = BufferGetPage(buffer);
874         LockBuffer(buffer, GIST_EXCLUSIVE);
875         gistcheckpage(buildstate->indexrel, buffer);
876         maxoff = PageGetMaxOffsetNumber(page);
877
878         /* Check if it was not moved */
879         if (parent == *parentblkno && *parentblkno != InvalidBlockNumber &&
880                 *downlinkoffnum != InvalidOffsetNumber && *downlinkoffnum <= maxoff)
881         {
882                 ItemId          iid = PageGetItemId(page, *downlinkoffnum);
883                 IndexTuple      idxtuple = (IndexTuple) PageGetItem(page, iid);
884
885                 if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
886                 {
887                         /* Still there */
888                         return buffer;
889                 }
890         }
891
892         /*
893          * Downlink was not at the offset where it used to be. Scan the page to
894          * find it. During normal gist insertions, it might've moved to another
895          * page, to the right, but during a buffering build, we keep track of the
896          * parent of each page in the lookup table so we should always know what
897          * page it's on.
898          */
899         for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off))
900         {
901                 ItemId          iid = PageGetItemId(page, off);
902                 IndexTuple      idxtuple = (IndexTuple) PageGetItem(page, iid);
903
904                 if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
905                 {
906                         /* yes!!, found it */
907                         *downlinkoffnum = off;
908                         return buffer;
909                 }
910         }
911
912         elog(ERROR, "failed to re-find parent for block %u", childblkno);
913         return InvalidBuffer;           /* keep compiler quiet */
914 }
915
916 /*
 * Process the buffer-emptying stack. Emptying one buffer can cause emptying
 * of other buffers. This function iterates until this cascading emptying
 * process is finished, i.e. until the buffer-emptying stack is empty.
920  */
921 static void
922 gistProcessEmptyingQueue(GISTBuildState *buildstate)
923 {
924         GISTBuildBuffers *gfbb = buildstate->gfbb;
925
926         /* Iterate while we have elements in buffers emptying stack. */
927         while (gfbb->bufferEmptyingQueue != NIL)
928         {
929                 GISTNodeBuffer *emptyingNodeBuffer;
930
931                 /* Get node buffer from emptying stack. */
932                 emptyingNodeBuffer = (GISTNodeBuffer *) linitial(gfbb->bufferEmptyingQueue);
933                 gfbb->bufferEmptyingQueue = list_delete_first(gfbb->bufferEmptyingQueue);
934                 emptyingNodeBuffer->queuedForEmptying = false;
935
936                 /*
937                  * We are going to load last pages of buffers where emptying will be
938                  * to. So let's unload any previously loaded buffers.
939                  */
940                 gistUnloadNodeBuffers(gfbb);
941
942                 /*
943                  * Pop tuples from the buffer and run them down to the buffers at
944                  * lower level, or leaf pages. We continue until one of the lower
945                  * level buffers fills up, or this buffer runs empty.
946                  *
947                  * In Arge et al's paper, the buffer emptying is stopped after
948                  * processing 1/2 node buffer worth of tuples, to avoid overfilling
949                  * any of the lower level buffers. However, it's more efficient to
950                  * keep going until one of the lower level buffers actually fills up,
951                  * so that's what we do. This doesn't need to be exact, if a buffer
952                  * overfills by a few tuples, there's no harm done.
953                  */
954                 while (true)
955                 {
956                         IndexTuple      itup;
957
958                         /* Get next index tuple from the buffer */
959                         if (!gistPopItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup))
960                                 break;
961
962                         /*
963                          * Run it down to the underlying node buffer or leaf page.
964                          *
965                          * Note: it's possible that the buffer we're emptying splits as a
966                          * result of this call. If that happens, our emptyingNodeBuffer
967                          * points to the left half of the split. After split, it's very
968                          * likely that the new left buffer is no longer over the half-full
969                          * threshold, but we might as well keep flushing tuples from it
970                          * until we fill a lower-level buffer.
971                          */
972                         if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->nodeBlocknum, emptyingNodeBuffer->level))
973                         {
974                                 /*
975                                  * A lower level buffer filled up. Stop emptying this buffer,
976                                  * to avoid overflowing the lower level buffer.
977                                  */
978                                 break;
979                         }
980
981                         /* Free all the memory allocated during index tuple processing */
982                         MemoryContextReset(buildstate->giststate->tempCxt);
983                 }
984         }
985 }
986
987 /*
988  * Empty all node buffers, from top to bottom. This is done at the end of
989  * index build to flush all remaining tuples to the index.
990  *
991  * Note: This destroys the buffersOnLevels lists, so the buffers should not
992  * be inserted to after this call.
993  */
994 static void
995 gistEmptyAllBuffers(GISTBuildState *buildstate)
996 {
997         GISTBuildBuffers *gfbb = buildstate->gfbb;
998         MemoryContext oldCtx;
999         int                     i;
1000
1001         oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
1002
1003         /*
1004          * Iterate through the levels from top to bottom.
1005          */
1006         for (i = gfbb->buffersOnLevelsLen - 1; i >= 0; i--)
1007         {
1008                 /*
1009                  * Empty all buffers on this level. Note that new buffers can pop up
1010                  * in the list during the processing, as a result of page splits, so a
1011                  * simple walk through the list won't work. We remove buffers from the
1012                  * list when we see them empty; a buffer can't become non-empty once
1013                  * it's been fully emptied.
1014                  */
1015                 while (gfbb->buffersOnLevels[i] != NIL)
1016                 {
1017                         GISTNodeBuffer *nodeBuffer;
1018
1019                         nodeBuffer = (GISTNodeBuffer *) linitial(gfbb->buffersOnLevels[i]);
1020
1021                         if (nodeBuffer->blocksCount != 0)
1022                         {
1023                                 /*
1024                                  * Add this buffer to the emptying queue, and proceed to empty
1025                                  * the queue.
1026                                  */
1027                                 if (!nodeBuffer->queuedForEmptying)
1028                                 {
1029                                         MemoryContextSwitchTo(gfbb->context);
1030                                         nodeBuffer->queuedForEmptying = true;
1031                                         gfbb->bufferEmptyingQueue =
1032                                                 lcons(nodeBuffer, gfbb->bufferEmptyingQueue);
1033                                         MemoryContextSwitchTo(buildstate->giststate->tempCxt);
1034                                 }
1035                                 gistProcessEmptyingQueue(buildstate);
1036                         }
1037                         else
1038                                 gfbb->buffersOnLevels[i] =
1039                                         list_delete_first(gfbb->buffersOnLevels[i]);
1040                 }
1041                 elog(DEBUG2, "emptied all buffers at level %d", i);
1042         }
1043         MemoryContextSwitchTo(oldCtx);
1044 }
1045
1046 /*
1047  * Get the depth of the GiST index.
1048  */
1049 static int
1050 gistGetMaxLevel(Relation index)
1051 {
1052         int                     maxLevel;
1053         BlockNumber blkno;
1054
1055         /*
1056          * Traverse down the tree, starting from the root, until we hit the leaf
1057          * level.
1058          */
1059         maxLevel = 0;
1060         blkno = GIST_ROOT_BLKNO;
1061         while (true)
1062         {
1063                 Buffer          buffer;
1064                 Page            page;
1065                 IndexTuple      itup;
1066
1067                 buffer = ReadBuffer(index, blkno);
1068
1069                 /*
1070                  * There's no concurrent access during index build, so locking is just
1071                  * pro forma.
1072                  */
1073                 LockBuffer(buffer, GIST_SHARE);
1074                 page = (Page) BufferGetPage(buffer);
1075
1076                 if (GistPageIsLeaf(page))
1077                 {
1078                         /* We hit the bottom, so we're done. */
1079                         UnlockReleaseBuffer(buffer);
1080                         break;
1081                 }
1082
1083                 /*
1084                  * Pick the first downlink on the page, and follow it. It doesn't
1085                  * matter which downlink we choose, the tree has the same depth
1086                  * everywhere, so we just pick the first one.
1087                  */
1088                 itup = (IndexTuple) PageGetItem(page,
1089                                                                          PageGetItemId(page, FirstOffsetNumber));
1090                 blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
1091                 UnlockReleaseBuffer(buffer);
1092
1093                 /*
1094                  * We're going down on the tree. It means that there is yet one more
1095                  * level in the tree.
1096                  */
1097                 maxLevel++;
1098         }
1099         return maxLevel;
1100 }
1101
1102
1103 /*
1104  * Routines for managing the parent map.
1105  *
1106  * Whenever a page is split, we need to insert the downlinks into the parent.
1107  * We need to somehow find the parent page to do that. In normal insertions,
1108  * we keep a stack of nodes visited when we descend the tree. However, in
1109  * buffering build, we can start descending the tree from any internal node,
1110  * when we empty a buffer by cascading tuples to its children. So we don't
1111  * have a full stack up to the root available at that time.
1112  *
1113  * So instead, we maintain a hash table to track the parent of every internal
1114  * page. We don't need to track the parents of leaf nodes, however. Whenever
1115  * we insert to a leaf, we've just descended down from its parent, so we know
1116  * its immediate parent already. This helps a lot to limit the memory used
1117  * by this hash table.
1118  *
 * Whenever an internal node is split, the parent map needs to be updated:
 * the parent of the new child page needs to be recorded, and the entries
 * for all pages whose downlinks are moved to a new page at the split need
 * to be updated as well.
1123  *
1124  * We also update the parent map whenever we descend the tree. That might seem
1125  * unnecessary, because we maintain the map whenever a downlink is moved or
1126  * created, but it is needed because we switch to buffering mode after
1127  * creating a tree with regular index inserts. Any pages created before
1128  * switching to buffering mode will not be present in the parent map initially,
1129  * but will be added there the first time we visit them.
1130  */
1131
/* One entry in the child-to-parent lookup table (see comments above). */
typedef struct
{
	BlockNumber childblkno;		/* hash key */
	BlockNumber parentblkno;	/* block of the page holding the downlink */
} ParentMapEntry;
1137
1138 static void
1139 gistInitParentMap(GISTBuildState *buildstate)
1140 {
1141         HASHCTL         hashCtl;
1142
1143         hashCtl.keysize = sizeof(BlockNumber);
1144         hashCtl.entrysize = sizeof(ParentMapEntry);
1145         hashCtl.hcxt = CurrentMemoryContext;
1146         hashCtl.hash = oid_hash;
1147         buildstate->parentMap = hash_create("gistbuild parent map",
1148                                                                                 1024,
1149                                                                                 &hashCtl,
1150                                                                                 HASH_ELEM | HASH_CONTEXT
1151                                                                                 | HASH_FUNCTION);
1152 }
1153
1154 static void
1155 gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, BlockNumber parent)
1156 {
1157         ParentMapEntry *entry;
1158         bool            found;
1159
1160         entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
1161                                                                                    (const void *) &child,
1162                                                                                    HASH_ENTER,
1163                                                                                    &found);
1164         entry->parentblkno = parent;
1165 }
1166
1167 /*
1168  * Scan all downlinks on a page, and memorize their parent.
1169  */
1170 static void
1171 gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parentbuf)
1172 {
1173         OffsetNumber maxoff;
1174         OffsetNumber off;
1175         BlockNumber parentblkno = BufferGetBlockNumber(parentbuf);
1176         Page            page = BufferGetPage(parentbuf);
1177
1178         Assert(!GistPageIsLeaf(page));
1179
1180         maxoff = PageGetMaxOffsetNumber(page);
1181         for (off = FirstOffsetNumber; off <= maxoff; off++)
1182         {
1183                 ItemId          iid = PageGetItemId(page, off);
1184                 IndexTuple      idxtuple = (IndexTuple) PageGetItem(page, iid);
1185                 BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
1186
1187                 gistMemorizeParent(buildstate, childblkno, parentblkno);
1188         }
1189 }
1190
1191 static BlockNumber
1192 gistGetParent(GISTBuildState *buildstate, BlockNumber child)
1193 {
1194         ParentMapEntry *entry;
1195         bool            found;
1196
1197         /* Find node buffer in hash table */
1198         entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
1199                                                                                    (const void *) &child,
1200                                                                                    HASH_FIND,
1201                                                                                    &found);
1202         if (!found)
1203                 elog(ERROR, "could not find parent of block %d in lookup table", child);
1204
1205         return entry->parentblkno;
1206 }