]> granicus.if.org Git - postgresql/blob - src/backend/access/gin/gindatapage.c
Reduce pinning and buffer content locking for btree scans.
[postgresql] / src / backend / access / gin / gindatapage.c
1 /*-------------------------------------------------------------------------
2  *
3  * gindatapage.c
4  *        routines for handling GIN posting tree pages.
5  *
6  *
7  * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * IDENTIFICATION
11  *                      src/backend/access/gin/gindatapage.c
12  *-------------------------------------------------------------------------
13  */
14
15 #include "postgres.h"
16
17 #include "access/gin_private.h"
18 #include "access/xloginsert.h"
19 #include "lib/ilist.h"
20 #include "miscadmin.h"
21 #include "utils/memutils.h"
22 #include "utils/rel.h"
23
24 /*
25  * Min, Max and Target size of posting lists stored on leaf pages, in bytes.
26  *
27  * The code can deal with any size, but random access is more efficient when
28  * a number of smaller lists are stored, rather than one big list. If a
29  * posting list would become larger than Max size as a result of insertions,
30  * it is split into two. If a posting list would be smaller than minimum
31  * size, it is merged with the next posting list.
32  */
33 #define GinPostingListSegmentMaxSize 384
34 #define GinPostingListSegmentTargetSize 256
35 #define GinPostingListSegmentMinSize 128
36
37 /*
38  * At least this many items fit in a GinPostingListSegmentMaxSize-bytes
39  * long segment. This is used when estimating how much space is required
40  * for N items, at minimum.
41  */
42 #define MinTuplesPerSegment ((GinPostingListSegmentMaxSize - 2) / 6)
43
44 /*
45  * A working struct for manipulating a posting tree leaf page.
46  */
47 typedef struct
48 {
49         dlist_head      segments;               /* a list of leafSegmentInfos */
50
51         /*
52          * The following fields represent how the segments are split across pages,
53          * if a page split is required. Filled in by leafRepackItems.
54          */
55         dlist_node *lastleft;           /* last segment on left page */
56         int                     lsize;                  /* total size on left page */
57         int                     rsize;                  /* total size on right page */
58
59         bool            oldformat;              /* page is in pre-9.4 format on disk */
60 } disassembledLeaf;
61
62 typedef struct
63 {
64         dlist_node      node;                   /* linked list pointers */
65
66         /*-------------
67          * 'action' indicates the status of this in-memory segment, compared to
68          * what's on disk. It is one of the GIN_SEGMENT_* action codes:
69          *
70          * UNMODIFIED   no changes
71          * DELETE               the segment is to be removed. 'seg' and 'items' are
72          *                              ignored
73          * INSERT               this is a completely new segment
74          * REPLACE              this replaces an existing segment with new content
75          * ADDITEMS             like REPLACE, but no items have been removed, and we track
76          *                              in detail what items have been added to this segment, in
77          *                              'modifieditems'
78          *-------------
79          */
80         char            action;
81
82         ItemPointerData *modifieditems;
83         int                     nmodifieditems;
84
85         /*
86          * The following fields represent the items in this segment. If 'items' is
87          * not NULL, it contains a palloc'd array of the itemsin this segment. If
88          * 'seg' is not NULL, it contains the items in an already-compressed
89          * format. It can point to an on-disk page (!modified), or a palloc'd
90          * segment in memory. If both are set, they must represent the same items.
91          */
92         GinPostingList *seg;
93         ItemPointer items;
94         int                     nitems;                 /* # of items in 'items', if items != NULL */
95 } leafSegmentInfo;
96
97 static ItemPointer dataLeafPageGetUncompressed(Page page, int *nitems);
98 static void dataSplitPageInternal(GinBtree btree, Buffer origbuf,
99                                           GinBtreeStack *stack,
100                                           void *insertdata, BlockNumber updateblkno,
101                                           Page *newlpage, Page *newrpage);
102
103 static disassembledLeaf *disassembleLeaf(Page page);
104 static bool leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining);
105 static bool addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems,
106                            int nNewItems);
107
108 static void registerLeafRecompressWALData(Buffer buf, disassembledLeaf *leaf);
109 static void dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf);
110 static void dataPlaceToPageLeafSplit(Buffer buf,
111                                                  disassembledLeaf *leaf,
112                                                  ItemPointerData lbound, ItemPointerData rbound,
113                                                  Page lpage, Page rpage);
114
115 /*
116  * Read TIDs from leaf data page to single uncompressed array. The TIDs are
117  * returned in ascending order.
118  *
119  * advancePast is a hint, indicating that the caller is only interested in
120  * TIDs > advancePast. To return all items, use ItemPointerSetMin.
121  *
122  * Note: This function can still return items smaller than advancePast that
123  * are in the same posting list as the items of interest, so the caller must
124  * still check all the returned items. But passing it allows this function to
125  * skip whole posting lists.
126  */
127 ItemPointer
128 GinDataLeafPageGetItems(Page page, int *nitems, ItemPointerData advancePast)
129 {
130         ItemPointer result;
131
132         if (GinPageIsCompressed(page))
133         {
134                 GinPostingList *seg = GinDataLeafPageGetPostingList(page);
135                 Size            len = GinDataLeafPageGetPostingListSize(page);
136                 Pointer         endptr = ((Pointer) seg) + len;
137                 GinPostingList *next;
138
139                 /* Skip to the segment containing advancePast+1 */
140                 if (ItemPointerIsValid(&advancePast))
141                 {
142                         next = GinNextPostingListSegment(seg);
143                         while ((Pointer) next < endptr &&
144                                    ginCompareItemPointers(&next->first, &advancePast) <= 0)
145                         {
146                                 seg = next;
147                                 next = GinNextPostingListSegment(seg);
148                         }
149                         len = endptr - (Pointer) seg;
150                 }
151
152                 if (len > 0)
153                         result = ginPostingListDecodeAllSegments(seg, len, nitems);
154                 else
155                 {
156                         result = NULL;
157                         *nitems = 0;
158                 }
159         }
160         else
161         {
162                 ItemPointer tmp = dataLeafPageGetUncompressed(page, nitems);
163
164                 result = palloc((*nitems) * sizeof(ItemPointerData));
165                 memcpy(result, tmp, (*nitems) * sizeof(ItemPointerData));
166         }
167
168         return result;
169 }
170
171 /*
172  * Places all TIDs from leaf data page to bitmap.
173  */
174 int
175 GinDataLeafPageGetItemsToTbm(Page page, TIDBitmap *tbm)
176 {
177         ItemPointer uncompressed;
178         int                     nitems;
179
180         if (GinPageIsCompressed(page))
181         {
182                 GinPostingList *segment = GinDataLeafPageGetPostingList(page);
183                 Size            len = GinDataLeafPageGetPostingListSize(page);
184
185                 nitems = ginPostingListDecodeAllSegmentsToTbm(segment, len, tbm);
186         }
187         else
188         {
189                 uncompressed = dataLeafPageGetUncompressed(page, &nitems);
190
191                 if (nitems > 0)
192                         tbm_add_tuples(tbm, uncompressed, nitems, false);
193         }
194
195         return nitems;
196 }
197
198 /*
199  * Get pointer to the uncompressed array of items on a pre-9.4 format
200  * uncompressed leaf page. The number of items in the array is returned in
201  * *nitems.
202  */
203 static ItemPointer
204 dataLeafPageGetUncompressed(Page page, int *nitems)
205 {
206         ItemPointer items;
207
208         Assert(!GinPageIsCompressed(page));
209
210         /*
211          * In the old pre-9.4 page format, the whole page content is used for
212          * uncompressed items, and the number of items is stored in 'maxoff'
213          */
214         items = (ItemPointer) GinDataPageGetData(page);
215         *nitems = GinPageGetOpaque(page)->maxoff;
216
217         return items;
218 }
219
220 /*
221  * Check if we should follow the right link to find the item we're searching
222  * for.
223  *
224  * Compares inserting item pointer with the right bound of the current page.
225  */
226 static bool
227 dataIsMoveRight(GinBtree btree, Page page)
228 {
229         ItemPointer iptr = GinDataPageGetRightBound(page);
230
231         if (GinPageRightMost(page))
232                 return FALSE;
233
234         return (ginCompareItemPointers(&btree->itemptr, iptr) > 0) ? TRUE : FALSE;
235 }
236
237 /*
238  * Find correct PostingItem in non-leaf page. It is assumed that this is
239  * the correct page, and the searched value SHOULD be on the page.
240  */
241 static BlockNumber
242 dataLocateItem(GinBtree btree, GinBtreeStack *stack)
243 {
244         OffsetNumber low,
245                                 high,
246                                 maxoff;
247         PostingItem *pitem = NULL;
248         int                     result;
249         Page            page = BufferGetPage(stack->buffer);
250
251         Assert(!GinPageIsLeaf(page));
252         Assert(GinPageIsData(page));
253
254         if (btree->fullScan)
255         {
256                 stack->off = FirstOffsetNumber;
257                 stack->predictNumber *= GinPageGetOpaque(page)->maxoff;
258                 return btree->getLeftMostChild(btree, page);
259         }
260
261         low = FirstOffsetNumber;
262         maxoff = high = GinPageGetOpaque(page)->maxoff;
263         Assert(high >= low);
264
265         high++;
266
267         while (high > low)
268         {
269                 OffsetNumber mid = low + ((high - low) / 2);
270
271                 pitem = GinDataPageGetPostingItem(page, mid);
272
273                 if (mid == maxoff)
274                 {
275                         /*
276                          * Right infinity, page already correctly chosen with a help of
277                          * dataIsMoveRight
278                          */
279                         result = -1;
280                 }
281                 else
282                 {
283                         pitem = GinDataPageGetPostingItem(page, mid);
284                         result = ginCompareItemPointers(&btree->itemptr, &(pitem->key));
285                 }
286
287                 if (result == 0)
288                 {
289                         stack->off = mid;
290                         return PostingItemGetBlockNumber(pitem);
291                 }
292                 else if (result > 0)
293                         low = mid + 1;
294                 else
295                         high = mid;
296         }
297
298         Assert(high >= FirstOffsetNumber && high <= maxoff);
299
300         stack->off = high;
301         pitem = GinDataPageGetPostingItem(page, high);
302         return PostingItemGetBlockNumber(pitem);
303 }
304
305 /*
306  * Find link to blkno on non-leaf page, returns offset of PostingItem
307  */
308 static OffsetNumber
309 dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber storedOff)
310 {
311         OffsetNumber i,
312                                 maxoff = GinPageGetOpaque(page)->maxoff;
313         PostingItem *pitem;
314
315         Assert(!GinPageIsLeaf(page));
316         Assert(GinPageIsData(page));
317
318         /* if page isn't changed, we return storedOff */
319         if (storedOff >= FirstOffsetNumber && storedOff <= maxoff)
320         {
321                 pitem = GinDataPageGetPostingItem(page, storedOff);
322                 if (PostingItemGetBlockNumber(pitem) == blkno)
323                         return storedOff;
324
325                 /*
326                  * we hope, that needed pointer goes to right. It's true if there
327                  * wasn't a deletion
328                  */
329                 for (i = storedOff + 1; i <= maxoff; i++)
330                 {
331                         pitem = GinDataPageGetPostingItem(page, i);
332                         if (PostingItemGetBlockNumber(pitem) == blkno)
333                                 return i;
334                 }
335
336                 maxoff = storedOff - 1;
337         }
338
339         /* last chance */
340         for (i = FirstOffsetNumber; i <= maxoff; i++)
341         {
342                 pitem = GinDataPageGetPostingItem(page, i);
343                 if (PostingItemGetBlockNumber(pitem) == blkno)
344                         return i;
345         }
346
347         return InvalidOffsetNumber;
348 }
349
350 /*
351  * Return blkno of leftmost child
352  */
353 static BlockNumber
354 dataGetLeftMostPage(GinBtree btree, Page page)
355 {
356         PostingItem *pitem;
357
358         Assert(!GinPageIsLeaf(page));
359         Assert(GinPageIsData(page));
360         Assert(GinPageGetOpaque(page)->maxoff >= FirstOffsetNumber);
361
362         pitem = GinDataPageGetPostingItem(page, FirstOffsetNumber);
363         return PostingItemGetBlockNumber(pitem);
364 }
365
366 /*
367  * Add PostingItem to a non-leaf page.
368  */
369 void
370 GinDataPageAddPostingItem(Page page, PostingItem *data, OffsetNumber offset)
371 {
372         OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
373         char       *ptr;
374
375         Assert(PostingItemGetBlockNumber(data) != InvalidBlockNumber);
376         Assert(!GinPageIsLeaf(page));
377
378         if (offset == InvalidOffsetNumber)
379         {
380                 ptr = (char *) GinDataPageGetPostingItem(page, maxoff + 1);
381         }
382         else
383         {
384                 ptr = (char *) GinDataPageGetPostingItem(page, offset);
385                 if (offset != maxoff + 1)
386                         memmove(ptr + sizeof(PostingItem),
387                                         ptr,
388                                         (maxoff - offset + 1) *sizeof(PostingItem));
389         }
390         memcpy(ptr, data, sizeof(PostingItem));
391
392         maxoff++;
393         GinPageGetOpaque(page)->maxoff = maxoff;
394
395         /*
396          * Also set pd_lower to the end of the posting items, to follow the
397          * "standard" page layout, so that we can squeeze out the unused space
398          * from full-page images.
399          */
400         GinDataPageSetDataSize(page, maxoff * sizeof(PostingItem));
401 }
402
403 /*
404  * Delete posting item from non-leaf page
405  */
406 void
407 GinPageDeletePostingItem(Page page, OffsetNumber offset)
408 {
409         OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
410
411         Assert(!GinPageIsLeaf(page));
412         Assert(offset >= FirstOffsetNumber && offset <= maxoff);
413
414         if (offset != maxoff)
415                 memmove(GinDataPageGetPostingItem(page, offset),
416                                 GinDataPageGetPostingItem(page, offset + 1),
417                                 sizeof(PostingItem) * (maxoff - offset));
418
419         maxoff--;
420         GinPageGetOpaque(page)->maxoff = maxoff;
421
422         GinDataPageSetDataSize(page, maxoff * sizeof(PostingItem));
423 }
424
425 /*
426  * Places keys to leaf data page and fills WAL record.
427  */
428 static GinPlaceToPageRC
429 dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack,
430                                         void *insertdata, Page *newlpage, Page *newrpage)
431 {
432         GinBtreeDataLeafInsertData *items = insertdata;
433         ItemPointer newItems = &items->items[items->curitem];
434         int                     maxitems = items->nitem - items->curitem;
435         Page            page = BufferGetPage(buf);
436         int                     i;
437         ItemPointerData rbound;
438         ItemPointerData lbound;
439         bool            needsplit;
440         bool            append;
441         int                     segsize;
442         Size            freespace;
443         MemoryContext tmpCxt;
444         MemoryContext oldCxt;
445         disassembledLeaf *leaf;
446         leafSegmentInfo *lastleftinfo;
447         ItemPointerData maxOldItem;
448         ItemPointerData remaining;
449
450         Assert(GinPageIsData(page));
451
452         rbound = *GinDataPageGetRightBound(page);
453
454         /*
455          * Count how many of the new items belong to this page.
456          */
457         if (!GinPageRightMost(page))
458         {
459                 for (i = 0; i < maxitems; i++)
460                 {
461                         if (ginCompareItemPointers(&newItems[i], &rbound) > 0)
462                         {
463                                 /*
464                                  * This needs to go to some other location in the tree. (The
465                                  * caller should've chosen the insert location so that at
466                                  * least the first item goes here.)
467                                  */
468                                 Assert(i > 0);
469                                 break;
470                         }
471                 }
472                 maxitems = i;
473         }
474
475         /*
476          * The following operations do quite a lot of small memory allocations,
477          * create a temporary memory context so that we don't need to keep track
478          * of them individually.
479          */
480         tmpCxt = AllocSetContextCreate(CurrentMemoryContext,
481                                                                    "Gin split temporary context",
482                                                                    ALLOCSET_DEFAULT_MINSIZE,
483                                                                    ALLOCSET_DEFAULT_INITSIZE,
484                                                                    ALLOCSET_DEFAULT_MAXSIZE);
485         oldCxt = MemoryContextSwitchTo(tmpCxt);
486
487         leaf = disassembleLeaf(page);
488
489         /*
490          * Are we appending to the end of the page? IOW, are all the new items
491          * larger than any of the existing items.
492          */
493         if (!dlist_is_empty(&leaf->segments))
494         {
495                 lastleftinfo = dlist_container(leafSegmentInfo, node,
496                                                                            dlist_tail_node(&leaf->segments));
497                 if (!lastleftinfo->items)
498                         lastleftinfo->items = ginPostingListDecode(lastleftinfo->seg,
499                                                                                                            &lastleftinfo->nitems);
500                 maxOldItem = lastleftinfo->items[lastleftinfo->nitems - 1];
501                 if (ginCompareItemPointers(&newItems[0], &maxOldItem) >= 0)
502                         append = true;
503                 else
504                         append = false;
505         }
506         else
507         {
508                 ItemPointerSetMin(&maxOldItem);
509                 append = true;
510         }
511
512         /*
513          * If we're appending to the end of the page, we will append as many items
514          * as we can fit (after splitting), and stop when the pages becomes full.
515          * Otherwise we have to limit the number of new items to insert, because
516          * once we start packing we can't just stop when we run out of space,
517          * because we must make sure that all the old items still fit.
518          */
519         if (GinPageIsCompressed(page))
520                 freespace = GinDataLeafPageGetFreeSpace(page);
521         else
522                 freespace = 0;
523         if (append)
524         {
525                 /*
526                  * Even when appending, trying to append more items than will fit is
527                  * not completely free, because we will merge the new items and old
528                  * items into an array below. In the best case, every new item fits in
529                  * a single byte, and we can use all the free space on the old page as
530                  * well as the new page. For simplicity, ignore segment overhead etc.
531                  */
532                 maxitems = Min(maxitems, freespace + GinDataPageMaxDataSize);
533         }
534         else
535         {
536                 /*
537                  * Calculate a conservative estimate of how many new items we can fit
538                  * on the two pages after splitting.
539                  *
540                  * We can use any remaining free space on the old page to store full
541                  * segments, as well as the new page. Each full-sized segment can hold
542                  * at least MinTuplesPerSegment items
543                  */
544                 int                     nnewsegments;
545
546                 nnewsegments = freespace / GinPostingListSegmentMaxSize;
547                 nnewsegments += GinDataPageMaxDataSize / GinPostingListSegmentMaxSize;
548                 maxitems = Min(maxitems, nnewsegments * MinTuplesPerSegment);
549         }
550
551         /* Add the new items to the segments */
552         if (!addItemsToLeaf(leaf, newItems, maxitems))
553         {
554                 /* all items were duplicates, we have nothing to do */
555                 items->curitem += maxitems;
556
557                 MemoryContextSwitchTo(oldCxt);
558                 MemoryContextDelete(tmpCxt);
559
560                 return UNMODIFIED;
561         }
562
563         /*
564          * Pack the items back to compressed segments, ready for writing to disk.
565          */
566         needsplit = leafRepackItems(leaf, &remaining);
567
568         /*
569          * Did all the new items fit?
570          *
571          * If we're appending, it's OK if they didn't. But as a sanity check,
572          * verify that all the old items fit.
573          */
574         if (ItemPointerIsValid(&remaining))
575         {
576                 if (!append || ItemPointerCompare(&maxOldItem, &remaining) >= 0)
577                         elog(ERROR, "could not split GIN page; all old items didn't fit");
578
579                 /* Count how many of the new items did fit. */
580                 for (i = 0; i < maxitems; i++)
581                 {
582                         if (ginCompareItemPointers(&newItems[i], &remaining) >= 0)
583                                 break;
584                 }
585                 if (i == 0)
586                         elog(ERROR, "could not split GIN page; no new items fit");
587                 maxitems = i;
588         }
589
590         if (!needsplit)
591         {
592                 /*
593                  * Great, all the items fit on a single page. Construct a WAL record
594                  * describing the changes we made, and write the segments back to the
595                  * page.
596                  *
597                  * Once we start modifying the page, there's no turning back. The
598                  * caller is responsible for calling END_CRIT_SECTION() after writing
599                  * the WAL record.
600                  */
601                 MemoryContextSwitchTo(oldCxt);
602                 if (RelationNeedsWAL(btree->index))
603                         registerLeafRecompressWALData(buf, leaf);
604                 START_CRIT_SECTION();
605                 dataPlaceToPageLeafRecompress(buf, leaf);
606
607                 if (append)
608                         elog(DEBUG2, "appended %d new items to block %u; %d bytes (%d to go)",
609                                  maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize,
610                                  items->nitem - items->curitem - maxitems);
611                 else
612                         elog(DEBUG2, "inserted %d new items to block %u; %d bytes (%d to go)",
613                                  maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize,
614                                  items->nitem - items->curitem - maxitems);
615         }
616         else
617         {
618                 /*
619                  * Had to split.
620                  *
621                  * leafRepackItems already divided the segments between the left and
622                  * the right page. It filled the left page as full as possible, and
623                  * put the rest to the right page. When building a new index, that's
624                  * good, because the table is scanned from beginning to end and there
625                  * won't be any more insertions to the left page during the build.
626                  * This packs the index as tight as possible. But otherwise, split
627                  * 50/50, by moving segments from the left page to the right page
628                  * until they're balanced.
629                  *
630                  * As a further heuristic, when appending items to the end of the
631                  * page, try make the left page 75% full, one the assumption that
632                  * subsequent insertions will probably also go to the end. This packs
633                  * the index somewhat tighter when appending to a table, which is very
634                  * common.
635                  */
636                 if (!btree->isBuild)
637                 {
638                         while (dlist_has_prev(&leaf->segments, leaf->lastleft))
639                         {
640                                 lastleftinfo = dlist_container(leafSegmentInfo, node, leaf->lastleft);
641
642                                 /* ignore deleted segments */
643                                 if (lastleftinfo->action != GIN_SEGMENT_DELETE)
644                                 {
645                                         segsize = SizeOfGinPostingList(lastleftinfo->seg);
646
647                                         /*
648                                          * Note that we check that the right page doesn't become
649                                          * more full than the left page even when appending. It's
650                                          * possible that we added enough items to make both pages
651                                          * more than 75% full.
652                                          */
653                                         if ((leaf->lsize - segsize) - (leaf->rsize + segsize) < 0)
654                                                 break;
655                                         if (append)
656                                         {
657                                                 if ((leaf->lsize - segsize) < (BLCKSZ * 3) / 4)
658                                                         break;
659                                         }
660
661                                         leaf->lsize -= segsize;
662                                         leaf->rsize += segsize;
663                                 }
664                                 leaf->lastleft = dlist_prev_node(&leaf->segments, leaf->lastleft);
665                         }
666                 }
667                 Assert(leaf->lsize <= GinDataPageMaxDataSize);
668                 Assert(leaf->rsize <= GinDataPageMaxDataSize);
669
670                 /*
671                  * Fetch the max item in the left page's last segment; it becomes the
672                  * right bound of the page.
673                  */
674                 lastleftinfo = dlist_container(leafSegmentInfo, node, leaf->lastleft);
675                 if (!lastleftinfo->items)
676                         lastleftinfo->items = ginPostingListDecode(lastleftinfo->seg,
677                                                                                                            &lastleftinfo->nitems);
678                 lbound = lastleftinfo->items[lastleftinfo->nitems - 1];
679
680                 *newlpage = MemoryContextAlloc(oldCxt, BLCKSZ);
681                 *newrpage = MemoryContextAlloc(oldCxt, BLCKSZ);
682
683                 dataPlaceToPageLeafSplit(buf, leaf, lbound, rbound,
684                                                                  *newlpage, *newrpage);
685
686                 Assert(GinPageRightMost(page) ||
687                            ginCompareItemPointers(GinDataPageGetRightBound(*newlpage),
688                                                                    GinDataPageGetRightBound(*newrpage)) < 0);
689
690                 if (append)
691                         elog(DEBUG2, "appended %d items to block %u; split %d/%d (%d to go)",
692                                  maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, (int) leaf->rsize,
693                                  items->nitem - items->curitem - maxitems);
694                 else
695                         elog(DEBUG2, "inserted %d items to block %u; split %d/%d (%d to go)",
696                                  maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, (int) leaf->rsize,
697                                  items->nitem - items->curitem - maxitems);
698         }
699
700         MemoryContextSwitchTo(oldCxt);
701         MemoryContextDelete(tmpCxt);
702
703         items->curitem += maxitems;
704
705         return needsplit ? SPLIT : INSERTED;
706 }
707
708 /*
709  * Vacuum a posting tree leaf page.
710  */
711 void
712 ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs)
713 {
714         Page            page = BufferGetPage(buffer);
715         disassembledLeaf *leaf;
716         bool            removedsomething = false;
717         dlist_iter      iter;
718
719         leaf = disassembleLeaf(page);
720
721         /* Vacuum each segment. */
722         dlist_foreach(iter, &leaf->segments)
723         {
724                 leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur);
725                 int                     oldsegsize;
726                 ItemPointer cleaned;
727                 int                     ncleaned;
728
729                 if (!seginfo->items)
730                         seginfo->items = ginPostingListDecode(seginfo->seg,
731                                                                                                   &seginfo->nitems);
732                 if (seginfo->seg)
733                         oldsegsize = SizeOfGinPostingList(seginfo->seg);
734                 else
735                         oldsegsize = GinDataPageMaxDataSize;
736
737                 cleaned = ginVacuumItemPointers(gvs,
738                                                                                 seginfo->items,
739                                                                                 seginfo->nitems,
740                                                                                 &ncleaned);
741                 pfree(seginfo->items);
742                 seginfo->items = NULL;
743                 seginfo->nitems = 0;
744                 if (cleaned)
745                 {
746                         if (ncleaned > 0)
747                         {
748                                 int                     npacked;
749
750                                 seginfo->seg = ginCompressPostingList(cleaned,
751                                                                                                           ncleaned,
752                                                                                                           oldsegsize,
753                                                                                                           &npacked);
754                                 /* Removing an item never increases the size of the segment */
755                                 if (npacked != ncleaned)
756                                         elog(ERROR, "could not fit vacuumed posting list");
757                                 seginfo->action = GIN_SEGMENT_REPLACE;
758                         }
759                         else
760                         {
761                                 seginfo->seg = NULL;
762                                 seginfo->items = NULL;
763                                 seginfo->action = GIN_SEGMENT_DELETE;
764                         }
765                         seginfo->nitems = ncleaned;
766
767                         removedsomething = true;
768                 }
769         }
770
771         /*
772          * If we removed any items, reconstruct the page from the pieces.
773          *
774          * We don't try to re-encode the segments here, even though some of them
775          * might be really small now that we've removed some items from them. It
776          * seems like a waste of effort, as there isn't really any benefit from
777          * larger segments per se; larger segments only help to pack more items in
778          * the same space. We might as well delay doing that until the next
779          * insertion, which will need to re-encode at least part of the page
780          * anyway.
781          *
782          * Also note if the page was in uncompressed, pre-9.4 format before, it is
783          * now represented as one huge segment that contains all the items. It
784          * might make sense to split that, to speed up random access, but we don't
785          * bother. You'll have to REINDEX anyway if you want the full gain of the
786          * new tighter index format.
787          */
788         if (removedsomething)
789         {
790                 bool            modified;
791
792                 /*
793                  * Make sure we have a palloc'd copy of all segments, after the first
794                  * segment that is modified. (dataPlaceToPageLeafRecompress requires
795                  * this).
796                  */
797                 modified = false;
798                 dlist_foreach(iter, &leaf->segments)
799                 {
800                         leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
801                                                                                                            iter.cur);
802
803                         if (seginfo->action != GIN_SEGMENT_UNMODIFIED)
804                                 modified = true;
805                         if (modified && seginfo->action != GIN_SEGMENT_DELETE)
806                         {
807                                 int                     segsize = SizeOfGinPostingList(seginfo->seg);
808                                 GinPostingList *tmp = (GinPostingList *) palloc(segsize);
809
810                                 memcpy(tmp, seginfo->seg, segsize);
811                                 seginfo->seg = tmp;
812                         }
813                 }
814
815                 if (RelationNeedsWAL(indexrel))
816                 {
817                         XLogBeginInsert();
818                         registerLeafRecompressWALData(buffer, leaf);
819                 }
820                 START_CRIT_SECTION();
821                 dataPlaceToPageLeafRecompress(buffer, leaf);
822
823                 MarkBufferDirty(buffer);
824
825                 if (RelationNeedsWAL(indexrel))
826                 {
827                         XLogRecPtr      recptr;
828
829                         recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_VACUUM_DATA_LEAF_PAGE);
830                         PageSetLSN(page, recptr);
831                 }
832
833                 END_CRIT_SECTION();
834         }
835 }
836
837 /*
838  * Construct a ginxlogRecompressDataLeaf record representing the changes
839  * in *leaf.
840  */
841 static void
842 registerLeafRecompressWALData(Buffer buf, disassembledLeaf *leaf)
843 {
844         int                     nmodified = 0;
845         char       *walbufbegin;
846         char       *walbufend;
847         dlist_iter      iter;
848         int                     segno;
849         ginxlogRecompressDataLeaf *recompress_xlog;
850
851         /* Count the modified segments */
852         dlist_foreach(iter, &leaf->segments)
853         {
854                 leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
855                                                                                                    iter.cur);
856
857                 if (seginfo->action != GIN_SEGMENT_UNMODIFIED)
858                         nmodified++;
859         }
860
861         walbufbegin =
862                 palloc(sizeof(ginxlogRecompressDataLeaf) +
863                            BLCKSZ +                     /* max size needed to hold the segment data */
864                            nmodified * 2        /* (segno + action) per action */
865                 );
866         walbufend = walbufbegin;
867
868         recompress_xlog = (ginxlogRecompressDataLeaf *) walbufend;
869         walbufend += sizeof(ginxlogRecompressDataLeaf);
870
871         recompress_xlog->nactions = nmodified;
872
873         segno = 0;
874         dlist_foreach(iter, &leaf->segments)
875         {
876                 leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
877                                                                                                    iter.cur);
878                 int                     segsize = 0;
879                 int                     datalen;
880                 uint8           action = seginfo->action;
881
882                 if (action == GIN_SEGMENT_UNMODIFIED)
883                 {
884                         segno++;
885                         continue;
886                 }
887
888                 if (action != GIN_SEGMENT_DELETE)
889                         segsize = SizeOfGinPostingList(seginfo->seg);
890
891                 /*
892                  * If storing the uncompressed list of added item pointers would take
893                  * more space than storing the compressed segment as is, do that
894                  * instead.
895                  */
896                 if (action == GIN_SEGMENT_ADDITEMS &&
897                         seginfo->nmodifieditems * sizeof(ItemPointerData) > segsize)
898                 {
899                         action = GIN_SEGMENT_REPLACE;
900                 }
901
902                 *((uint8 *) (walbufend++)) = segno;
903                 *(walbufend++) = action;
904
905                 switch (action)
906                 {
907                         case GIN_SEGMENT_DELETE:
908                                 datalen = 0;
909                                 break;
910
911                         case GIN_SEGMENT_ADDITEMS:
912                                 datalen = seginfo->nmodifieditems * sizeof(ItemPointerData);
913                                 memcpy(walbufend, &seginfo->nmodifieditems, sizeof(uint16));
914                                 memcpy(walbufend + sizeof(uint16), seginfo->modifieditems, datalen);
915                                 datalen += sizeof(uint16);
916                                 break;
917
918                         case GIN_SEGMENT_INSERT:
919                         case GIN_SEGMENT_REPLACE:
920                                 datalen = SHORTALIGN(segsize);
921                                 memcpy(walbufend, seginfo->seg, segsize);
922                                 break;
923
924                         default:
925                                 elog(ERROR, "unexpected GIN leaf action %d", action);
926                 }
927                 walbufend += datalen;
928
929                 if (action != GIN_SEGMENT_INSERT)
930                         segno++;
931         }
932
933
934         XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
935         XLogRegisterBufData(0, walbufbegin, walbufend - walbufbegin);
936
937 }
938
939 /*
940  * Assemble a disassembled posting tree leaf page back to a buffer.
941  *
942  * *prdata is filled with WAL information about this operation. The caller
943  * is responsible for inserting to the WAL, along with any other information
944  * about the operation that triggered this recompression.
945  *
946  * NOTE: The segment pointers must not point directly to the same buffer,
947  * except for segments that have not been modified and whose preceding
948  * segments have not been modified either.
949  */
950 static void
951 dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf)
952 {
953         Page            page = BufferGetPage(buf);
954         char       *ptr;
955         int                     newsize;
956         bool            modified = false;
957         dlist_iter      iter;
958         int                     segsize;
959
960         /*
961          * If the page was in pre-9.4 format before, convert the header, and force
962          * all segments to be copied to the page whether they were modified or
963          * not.
964          */
965         if (!GinPageIsCompressed(page))
966         {
967                 Assert(leaf->oldformat);
968                 GinPageSetCompressed(page);
969                 GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber;
970                 modified = true;
971         }
972
973         ptr = (char *) GinDataLeafPageGetPostingList(page);
974         newsize = 0;
975         dlist_foreach(iter, &leaf->segments)
976         {
977                 leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur);
978
979                 if (seginfo->action != GIN_SEGMENT_UNMODIFIED)
980                         modified = true;
981
982                 if (seginfo->action != GIN_SEGMENT_DELETE)
983                 {
984                         segsize = SizeOfGinPostingList(seginfo->seg);
985
986                         if (modified)
987                                 memcpy(ptr, seginfo->seg, segsize);
988
989                         ptr += segsize;
990                         newsize += segsize;
991                 }
992         }
993
994         Assert(newsize <= GinDataPageMaxDataSize);
995         GinDataPageSetDataSize(page, newsize);
996 }
997
998 /*
999  * Like dataPlaceToPageLeafRecompress, but writes the disassembled leaf
1000  * segments to two pages instead of one.
1001  *
1002  * This is different from the non-split cases in that this does not modify
1003  * the original page directly, but to temporary in-memory copies of the new
1004  * left and right pages.
1005  */
1006 static void
1007 dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf,
1008                                                  ItemPointerData lbound, ItemPointerData rbound,
1009                                                  Page lpage, Page rpage)
1010 {
1011         char       *ptr;
1012         int                     segsize;
1013         int                     lsize;
1014         int                     rsize;
1015         dlist_node *node;
1016         dlist_node *firstright;
1017         leafSegmentInfo *seginfo;
1018
1019         /* Initialize temporary pages to hold the new left and right pages */
1020         GinInitPage(lpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ);
1021         GinInitPage(rpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ);
1022
1023         /*
1024          * Copy the segments that go to the left page.
1025          *
1026          * XXX: We should skip copying the unmodified part of the left page, like
1027          * we do when recompressing.
1028          */
1029         lsize = 0;
1030         ptr = (char *) GinDataLeafPageGetPostingList(lpage);
1031         firstright = dlist_next_node(&leaf->segments, leaf->lastleft);
1032         for (node = dlist_head_node(&leaf->segments);
1033                  node != firstright;
1034                  node = dlist_next_node(&leaf->segments, node))
1035         {
1036                 seginfo = dlist_container(leafSegmentInfo, node, node);
1037
1038                 if (seginfo->action != GIN_SEGMENT_DELETE)
1039                 {
1040                         segsize = SizeOfGinPostingList(seginfo->seg);
1041                         memcpy(ptr, seginfo->seg, segsize);
1042                         ptr += segsize;
1043                         lsize += segsize;
1044                 }
1045         }
1046         Assert(lsize == leaf->lsize);
1047         GinDataPageSetDataSize(lpage, lsize);
1048         *GinDataPageGetRightBound(lpage) = lbound;
1049
1050         /* Copy the segments that go to the right page */
1051         ptr = (char *) GinDataLeafPageGetPostingList(rpage);
1052         rsize = 0;
1053         for (node = firstright;
1054                  ;
1055                  node = dlist_next_node(&leaf->segments, node))
1056         {
1057                 seginfo = dlist_container(leafSegmentInfo, node, node);
1058
1059                 if (seginfo->action != GIN_SEGMENT_DELETE)
1060                 {
1061                         segsize = SizeOfGinPostingList(seginfo->seg);
1062                         memcpy(ptr, seginfo->seg, segsize);
1063                         ptr += segsize;
1064                         rsize += segsize;
1065                 }
1066
1067                 if (!dlist_has_next(&leaf->segments, node))
1068                         break;
1069         }
1070         Assert(rsize == leaf->rsize);
1071         GinDataPageSetDataSize(rpage, rsize);
1072         *GinDataPageGetRightBound(rpage) = rbound;
1073 }
1074
1075 /*
1076  * Place a PostingItem to page, and fill a WAL record.
1077  *
1078  * If the item doesn't fit, returns false without modifying the page.
1079  *
1080  * In addition to inserting the given item, the downlink of the existing item
1081  * at 'off' is updated to point to 'updateblkno'.
1082  *
1083  * On INSERTED, registers the buffer as buffer ID 0, with data.
1084  * On SPLIT, returns rdata that represents the split pages in *prdata.
1085  */
1086 static GinPlaceToPageRC
1087 dataPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack,
1088                                                 void *insertdata, BlockNumber updateblkno,
1089                                                 Page *newlpage, Page *newrpage)
1090 {
1091         Page            page = BufferGetPage(buf);
1092         OffsetNumber off = stack->off;
1093         PostingItem *pitem;
1094
1095         /* this must be static so it can be returned to caller */
1096         static ginxlogInsertDataInternal data;
1097
1098         /* split if we have to */
1099         if (GinNonLeafDataPageGetFreeSpace(page) < sizeof(PostingItem))
1100         {
1101                 dataSplitPageInternal(btree, buf, stack, insertdata, updateblkno,
1102                                                           newlpage, newrpage);
1103                 return SPLIT;
1104         }
1105
1106         Assert(GinPageIsData(page));
1107
1108         START_CRIT_SECTION();
1109
1110         /* Update existing downlink to point to next page (on internal page) */
1111         pitem = GinDataPageGetPostingItem(page, off);
1112         PostingItemSetBlockNumber(pitem, updateblkno);
1113
1114         /* Add new item */
1115         pitem = (PostingItem *) insertdata;
1116         GinDataPageAddPostingItem(page, pitem, off);
1117
1118         if (RelationNeedsWAL(btree->index))
1119         {
1120                 data.offset = off;
1121                 data.newitem = *pitem;
1122
1123                 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1124                 XLogRegisterBufData(0, (char *) &data,
1125                                                         sizeof(ginxlogInsertDataInternal));
1126         }
1127
1128         return INSERTED;
1129 }
1130
1131 /*
1132  * Places an item (or items) to a posting tree. Calls relevant function of
1133  * internal of leaf page because they are handled very differently.
1134  */
1135 static GinPlaceToPageRC
1136 dataPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack,
1137                                 void *insertdata, BlockNumber updateblkno,
1138                                 Page *newlpage, Page *newrpage)
1139 {
1140         Page            page = BufferGetPage(buf);
1141
1142         Assert(GinPageIsData(page));
1143
1144         if (GinPageIsLeaf(page))
1145                 return dataPlaceToPageLeaf(btree, buf, stack, insertdata,
1146                                                                    newlpage, newrpage);
1147         else
1148                 return dataPlaceToPageInternal(btree, buf, stack,
1149                                                                            insertdata, updateblkno,
1150                                                                            newlpage, newrpage);
1151 }
1152
1153 /*
1154  * Split page and fill WAL record. Returns a new temp buffer filled with data
1155  * that should go to the left page. The original buffer is left untouched.
1156  */
1157 static void
1158 dataSplitPageInternal(GinBtree btree, Buffer origbuf,
1159                                           GinBtreeStack *stack,
1160                                           void *insertdata, BlockNumber updateblkno,
1161                                           Page *newlpage, Page *newrpage)
1162 {
1163         Page            oldpage = BufferGetPage(origbuf);
1164         OffsetNumber off = stack->off;
1165         int                     nitems = GinPageGetOpaque(oldpage)->maxoff;
1166         int                     nleftitems;
1167         int                     nrightitems;
1168         Size            pageSize = PageGetPageSize(oldpage);
1169         ItemPointerData oldbound = *GinDataPageGetRightBound(oldpage);
1170         ItemPointer bound;
1171         Page            lpage;
1172         Page            rpage;
1173         OffsetNumber separator;
1174         PostingItem allitems[(BLCKSZ / sizeof(PostingItem)) + 1];
1175
1176         lpage = PageGetTempPage(oldpage);
1177         rpage = PageGetTempPage(oldpage);
1178         GinInitPage(lpage, GinPageGetOpaque(oldpage)->flags, pageSize);
1179         GinInitPage(rpage, GinPageGetOpaque(oldpage)->flags, pageSize);
1180
1181         /*
1182          * First construct a new list of PostingItems, which includes all the old
1183          * items, and the new item.
1184          */
1185         memcpy(allitems, GinDataPageGetPostingItem(oldpage, FirstOffsetNumber),
1186                    (off - 1) * sizeof(PostingItem));
1187
1188         allitems[off - 1] = *((PostingItem *) insertdata);
1189         memcpy(&allitems[off], GinDataPageGetPostingItem(oldpage, off),
1190                    (nitems - (off - 1)) * sizeof(PostingItem));
1191         nitems++;
1192
1193         /* Update existing downlink to point to next page */
1194         PostingItemSetBlockNumber(&allitems[off], updateblkno);
1195
1196         /*
1197          * When creating a new index, fit as many tuples as possible on the left
1198          * page, on the assumption that the table is scanned from beginning to
1199          * end. This packs the index as tight as possible.
1200          */
1201         if (btree->isBuild && GinPageRightMost(oldpage))
1202                 separator = GinNonLeafDataPageGetFreeSpace(rpage) / sizeof(PostingItem);
1203         else
1204                 separator = nitems / 2;
1205         nleftitems = separator;
1206         nrightitems = nitems - separator;
1207
1208         memcpy(GinDataPageGetPostingItem(lpage, FirstOffsetNumber),
1209                    allitems,
1210                    nleftitems * sizeof(PostingItem));
1211         GinPageGetOpaque(lpage)->maxoff = nleftitems;
1212         memcpy(GinDataPageGetPostingItem(rpage, FirstOffsetNumber),
1213                    &allitems[separator],
1214                    nrightitems * sizeof(PostingItem));
1215         GinPageGetOpaque(rpage)->maxoff = nrightitems;
1216
1217         /*
1218          * Also set pd_lower for both pages, like GinDataPageAddPostingItem does.
1219          */
1220         GinDataPageSetDataSize(lpage, nleftitems * sizeof(PostingItem));
1221         GinDataPageSetDataSize(rpage, nrightitems * sizeof(PostingItem));
1222
1223         /* set up right bound for left page */
1224         bound = GinDataPageGetRightBound(lpage);
1225         *bound = GinDataPageGetPostingItem(lpage, nleftitems)->key;
1226
1227         /* set up right bound for right page */
1228         *GinDataPageGetRightBound(rpage) = oldbound;
1229
1230         *newlpage = lpage;
1231         *newrpage = rpage;
1232 }
1233
1234 /*
1235  * Construct insertion payload for inserting the downlink for given buffer.
1236  */
1237 static void *
1238 dataPrepareDownlink(GinBtree btree, Buffer lbuf)
1239 {
1240         PostingItem *pitem = palloc(sizeof(PostingItem));
1241         Page            lpage = BufferGetPage(lbuf);
1242
1243         PostingItemSetBlockNumber(pitem, BufferGetBlockNumber(lbuf));
1244         pitem->key = *GinDataPageGetRightBound(lpage);
1245
1246         return pitem;
1247 }
1248
1249 /*
1250  * Fills new root by right bound values from child.
1251  * Also called from ginxlog, should not use btree
1252  */
1253 void
1254 ginDataFillRoot(GinBtree btree, Page root, BlockNumber lblkno, Page lpage, BlockNumber rblkno, Page rpage)
1255 {
1256         PostingItem li,
1257                                 ri;
1258
1259         li.key = *GinDataPageGetRightBound(lpage);
1260         PostingItemSetBlockNumber(&li, lblkno);
1261         GinDataPageAddPostingItem(root, &li, InvalidOffsetNumber);
1262
1263         ri.key = *GinDataPageGetRightBound(rpage);
1264         PostingItemSetBlockNumber(&ri, rblkno);
1265         GinDataPageAddPostingItem(root, &ri, InvalidOffsetNumber);
1266 }
1267
1268
1269 /*** Functions to work with disassembled leaf pages ***/
1270
1271 /*
1272  * Disassemble page into a disassembledLeaf struct.
1273  */
1274 static disassembledLeaf *
1275 disassembleLeaf(Page page)
1276 {
1277         disassembledLeaf *leaf;
1278         GinPostingList *seg;
1279         Pointer         segbegin;
1280         Pointer         segend;
1281
1282         leaf = palloc0(sizeof(disassembledLeaf));
1283         dlist_init(&leaf->segments);
1284
1285         if (GinPageIsCompressed(page))
1286         {
1287                 /*
1288                  * Create a leafSegment entry for each segment.
1289                  */
1290                 seg = GinDataLeafPageGetPostingList(page);
1291                 segbegin = (Pointer) seg;
1292                 segend = segbegin + GinDataLeafPageGetPostingListSize(page);
1293                 while ((Pointer) seg < segend)
1294                 {
1295                         leafSegmentInfo *seginfo = palloc(sizeof(leafSegmentInfo));
1296
1297                         seginfo->action = GIN_SEGMENT_UNMODIFIED;
1298                         seginfo->seg = seg;
1299                         seginfo->items = NULL;
1300                         seginfo->nitems = 0;
1301                         dlist_push_tail(&leaf->segments, &seginfo->node);
1302
1303                         seg = GinNextPostingListSegment(seg);
1304                 }
1305                 leaf->oldformat = false;
1306         }
1307         else
1308         {
1309                 /*
1310                  * A pre-9.4 format uncompressed page is represented by a single
1311                  * segment, with an array of items.
1312                  */
1313                 ItemPointer uncompressed;
1314                 int                     nuncompressed;
1315                 leafSegmentInfo *seginfo;
1316
1317                 uncompressed = dataLeafPageGetUncompressed(page, &nuncompressed);
1318
1319                 seginfo = palloc(sizeof(leafSegmentInfo));
1320
1321                 seginfo->action = GIN_SEGMENT_REPLACE;
1322                 seginfo->seg = NULL;
1323                 seginfo->items = palloc(nuncompressed * sizeof(ItemPointerData));
1324                 memcpy(seginfo->items, uncompressed, nuncompressed * sizeof(ItemPointerData));
1325                 seginfo->nitems = nuncompressed;
1326
1327                 dlist_push_tail(&leaf->segments, &seginfo->node);
1328
1329                 leaf->oldformat = true;
1330         }
1331
1332         return leaf;
1333 }
1334
1335 /*
1336  * Distribute newItems to the segments.
1337  *
1338  * Any segments that acquire new items are decoded, and the new items are
1339  * merged with the old items.
1340  *
1341  * Returns true if any new items were added. False means they were all
1342  * duplicates of existing items on the page.
1343  */
1344 static bool
1345 addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems)
1346 {
1347         dlist_iter      iter;
1348         ItemPointer nextnew = newItems;
1349         int                     newleft = nNewItems;
1350         bool            modified = false;
1351         leafSegmentInfo *newseg;
1352
1353         /*
1354          * If the page is completely empty, just construct one new segment to hold
1355          * all the new items.
1356          */
1357         if (dlist_is_empty(&leaf->segments))
1358         {
1359                 newseg = palloc(sizeof(leafSegmentInfo));
1360                 newseg->seg = NULL;
1361                 newseg->items = newItems;
1362                 newseg->nitems = nNewItems;
1363                 newseg->action = GIN_SEGMENT_INSERT;
1364                 dlist_push_tail(&leaf->segments, &newseg->node);
1365                 return true;
1366         }
1367
1368         dlist_foreach(iter, &leaf->segments)
1369         {
1370                 leafSegmentInfo *cur = (leafSegmentInfo *) dlist_container(leafSegmentInfo, node, iter.cur);
1371                 int                     nthis;
1372                 ItemPointer tmpitems;
1373                 int                     ntmpitems;
1374
1375                 /*
1376                  * How many of the new items fall into this segment?
1377                  */
1378                 if (!dlist_has_next(&leaf->segments, iter.cur))
1379                         nthis = newleft;
1380                 else
1381                 {
1382                         leafSegmentInfo *next;
1383                         ItemPointerData next_first;
1384
1385                         next = (leafSegmentInfo *) dlist_container(leafSegmentInfo, node,
1386                                                                  dlist_next_node(&leaf->segments, iter.cur));
1387                         if (next->items)
1388                                 next_first = next->items[0];
1389                         else
1390                         {
1391                                 Assert(next->seg != NULL);
1392                                 next_first = next->seg->first;
1393                         }
1394
1395                         nthis = 0;
1396                         while (nthis < newleft && ginCompareItemPointers(&nextnew[nthis], &next_first) < 0)
1397                                 nthis++;
1398                 }
1399                 if (nthis == 0)
1400                         continue;
1401
1402                 /* Merge the new items with the existing items. */
1403                 if (!cur->items)
1404                         cur->items = ginPostingListDecode(cur->seg, &cur->nitems);
1405
1406                 /*
1407                  * Fast path for the important special case that we're appending to
1408                  * the end of the page: don't let the last segment on the page grow
1409                  * larger than the target, create a new segment before that happens.
1410                  */
1411                 if (!dlist_has_next(&leaf->segments, iter.cur) &&
1412                         ginCompareItemPointers(&cur->items[cur->nitems - 1], &nextnew[0]) < 0 &&
1413                         cur->seg != NULL &&
1414                         SizeOfGinPostingList(cur->seg) >= GinPostingListSegmentTargetSize)
1415                 {
1416                         newseg = palloc(sizeof(leafSegmentInfo));
1417                         newseg->seg = NULL;
1418                         newseg->items = nextnew;
1419                         newseg->nitems = nthis;
1420                         newseg->action = GIN_SEGMENT_INSERT;
1421                         dlist_push_tail(&leaf->segments, &newseg->node);
1422                         modified = true;
1423                         break;
1424                 }
1425
1426                 tmpitems = ginMergeItemPointers(cur->items, cur->nitems,
1427                                                                                 nextnew, nthis,
1428                                                                                 &ntmpitems);
1429                 if (ntmpitems != cur->nitems)
1430                 {
1431                         /*
1432                          * If there are no duplicates, track the added items so that we
1433                          * can emit a compact ADDITEMS WAL record later on. (it doesn't
1434                          * seem worth re-checking which items were duplicates, if there
1435                          * were any)
1436                          */
1437                         if (ntmpitems == nthis + cur->nitems &&
1438                                 cur->action == GIN_SEGMENT_UNMODIFIED)
1439                         {
1440                                 cur->action = GIN_SEGMENT_ADDITEMS;
1441                                 cur->modifieditems = nextnew;
1442                                 cur->nmodifieditems = nthis;
1443                         }
1444                         else
1445                                 cur->action = GIN_SEGMENT_REPLACE;
1446
1447                         cur->items = tmpitems;
1448                         cur->nitems = ntmpitems;
1449                         cur->seg = NULL;
1450                         modified = true;
1451                 }
1452
1453                 nextnew += nthis;
1454                 newleft -= nthis;
1455                 if (newleft == 0)
1456                         break;
1457         }
1458
1459         return modified;
1460 }
1461
1462 /*
1463  * Recompresses all segments that have been modified.
1464  *
1465  * If not all the items fit on two pages (ie. after split), we store as
1466  * many items as fit, and set *remaining to the first item that didn't fit.
1467  * If all items fit, *remaining is set to invalid.
1468  *
1469  * Returns true if the page has to be split.
1470  */
1471 static bool
1472 leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining)
1473 {
1474         int                     pgused = 0;
1475         bool            needsplit = false;
1476         dlist_iter      iter;
1477         int                     segsize;
1478         leafSegmentInfo *nextseg;
1479         int                     npacked;
1480         bool            modified;
1481         dlist_node *cur_node;
1482         dlist_node *next_node;
1483
1484         ItemPointerSetInvalid(remaining);
1485
1486         /*
1487          * cannot use dlist_foreach_modify here because we insert adjacent items
1488          * while iterating.
1489          */
1490         for (cur_node = dlist_head_node(&leaf->segments);
1491                  cur_node != NULL;
1492                  cur_node = next_node)
1493         {
1494                 leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
1495                                                                                                    cur_node);
1496
1497                 if (dlist_has_next(&leaf->segments, cur_node))
1498                         next_node = dlist_next_node(&leaf->segments, cur_node);
1499                 else
1500                         next_node = NULL;
1501
1502                 /* Compress the posting list, if necessary */
1503                 if (seginfo->action != GIN_SEGMENT_DELETE)
1504                 {
1505                         if (seginfo->seg == NULL)
1506                         {
1507                                 if (seginfo->nitems > GinPostingListSegmentMaxSize)
1508                                         npacked = 0;    /* no chance that it would fit. */
1509                                 else
1510                                 {
1511                                         seginfo->seg = ginCompressPostingList(seginfo->items,
1512                                                                                                                   seginfo->nitems,
1513                                                                                                 GinPostingListSegmentMaxSize,
1514                                                                                                                   &npacked);
1515                                 }
1516                                 if (npacked != seginfo->nitems)
1517                                 {
1518                                         /*
1519                                          * Too large. Compress again to the target size, and
1520                                          * create a new segment to represent the remaining items.
1521                                          * The new segment is inserted after this one, so it will
1522                                          * be processed in the next iteration of this loop.
1523                                          */
1524                                         if (seginfo->seg)
1525                                                 pfree(seginfo->seg);
1526                                         seginfo->seg = ginCompressPostingList(seginfo->items,
1527                                                                                                                   seginfo->nitems,
1528                                                                                          GinPostingListSegmentTargetSize,
1529                                                                                                                   &npacked);
1530                                         if (seginfo->action != GIN_SEGMENT_INSERT)
1531                                                 seginfo->action = GIN_SEGMENT_REPLACE;
1532
1533                                         nextseg = palloc(sizeof(leafSegmentInfo));
1534                                         nextseg->action = GIN_SEGMENT_INSERT;
1535                                         nextseg->seg = NULL;
1536                                         nextseg->items = &seginfo->items[npacked];
1537                                         nextseg->nitems = seginfo->nitems - npacked;
1538                                         next_node = &nextseg->node;
1539                                         dlist_insert_after(cur_node, next_node);
1540                                 }
1541                         }
1542
1543                         /*
1544                          * If the segment is very small, merge it with the next segment.
1545                          */
1546                         if (SizeOfGinPostingList(seginfo->seg) < GinPostingListSegmentMinSize && next_node)
1547                         {
1548                                 int                     nmerged;
1549
1550                                 nextseg = dlist_container(leafSegmentInfo, node, next_node);
1551
1552                                 if (seginfo->items == NULL)
1553                                         seginfo->items = ginPostingListDecode(seginfo->seg,
1554                                                                                                                   &seginfo->nitems);
1555                                 if (nextseg->items == NULL)
1556                                         nextseg->items = ginPostingListDecode(nextseg->seg,
1557                                                                                                                   &nextseg->nitems);
1558                                 nextseg->items =
1559                                         ginMergeItemPointers(seginfo->items, seginfo->nitems,
1560                                                                                  nextseg->items, nextseg->nitems,
1561                                                                                  &nmerged);
1562                                 Assert(nmerged == seginfo->nitems + nextseg->nitems);
1563                                 nextseg->nitems = nmerged;
1564                                 nextseg->seg = NULL;
1565
1566                                 nextseg->action = GIN_SEGMENT_REPLACE;
1567                                 nextseg->modifieditems = NULL;
1568                                 nextseg->nmodifieditems = 0;
1569
1570                                 if (seginfo->action == GIN_SEGMENT_INSERT)
1571                                 {
1572                                         dlist_delete(cur_node);
1573                                         continue;
1574                                 }
1575                                 else
1576                                 {
1577                                         seginfo->action = GIN_SEGMENT_DELETE;
1578                                         seginfo->seg = NULL;
1579                                 }
1580                         }
1581
1582                         seginfo->items = NULL;
1583                         seginfo->nitems = 0;
1584                 }
1585
1586                 if (seginfo->action == GIN_SEGMENT_DELETE)
1587                         continue;
1588
1589                 /*
1590                  * OK, we now have a compressed version of this segment ready for
1591                  * copying to the page. Did we exceed the size that fits on one page?
1592                  */
1593                 segsize = SizeOfGinPostingList(seginfo->seg);
1594                 if (pgused + segsize > GinDataPageMaxDataSize)
1595                 {
1596                         if (!needsplit)
1597                         {
1598                                 /* switch to right page */
1599                                 Assert(pgused > 0);
1600                                 leaf->lastleft = dlist_prev_node(&leaf->segments, cur_node);
1601                                 needsplit = true;
1602                                 leaf->lsize = pgused;
1603                                 pgused = 0;
1604                         }
1605                         else
1606                         {
1607                                 /*
1608                                  * Filled both pages. The last segment we constructed did not
1609                                  * fit.
1610                                  */
1611                                 *remaining = seginfo->seg->first;
1612
1613                                 /*
1614                                  * remove all segments that did not fit from the list.
1615                                  */
1616                                 while (dlist_has_next(&leaf->segments, cur_node))
1617                                         dlist_delete(dlist_next_node(&leaf->segments, cur_node));
1618                                 dlist_delete(cur_node);
1619                                 break;
1620                         }
1621                 }
1622
1623                 pgused += segsize;
1624         }
1625
1626         if (!needsplit)
1627         {
1628                 leaf->lsize = pgused;
1629                 leaf->rsize = 0;
1630         }
1631         else
1632                 leaf->rsize = pgused;
1633
1634         Assert(leaf->lsize <= GinDataPageMaxDataSize);
1635         Assert(leaf->rsize <= GinDataPageMaxDataSize);
1636
1637         /*
1638          * Make a palloc'd copy of every segment after the first modified one,
1639          * because as we start copying items to the original page, we might
1640          * overwrite an existing segment.
1641          */
1642         modified = false;
1643         dlist_foreach(iter, &leaf->segments)
1644         {
1645                 leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
1646                                                                                                    iter.cur);
1647
1648                 if (!modified && seginfo->action != GIN_SEGMENT_UNMODIFIED)
1649                 {
1650                         modified = true;
1651                 }
1652                 else if (modified && seginfo->action == GIN_SEGMENT_UNMODIFIED)
1653                 {
1654                         GinPostingList *tmp;
1655
1656                         segsize = SizeOfGinPostingList(seginfo->seg);
1657                         tmp = palloc(segsize);
1658                         memcpy(tmp, seginfo->seg, segsize);
1659                         seginfo->seg = tmp;
1660                 }
1661         }
1662
1663         return needsplit;
1664 }
1665
1666
1667 /*** Functions that are exported to the rest of the GIN code ***/
1668
1669 /*
1670  * Creates new posting tree containing the given TIDs. Returns the page
1671  * number of the root of the new posting tree.
1672  *
1673  * items[] must be in sorted order with no duplicates.
1674  */
1675 BlockNumber
1676 createPostingTree(Relation index, ItemPointerData *items, uint32 nitems,
1677                                   GinStatsData *buildStats)
1678 {
1679         BlockNumber blkno;
1680         Buffer          buffer;
1681         Page            tmppage;
1682         Page            page;
1683         Pointer         ptr;
1684         int                     nrootitems;
1685         int                     rootsize;
1686
1687         /* Construct the new root page in memory first. */
1688         tmppage = (Page) palloc(BLCKSZ);
1689         GinInitPage(tmppage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ);
1690         GinPageGetOpaque(tmppage)->rightlink = InvalidBlockNumber;
1691
1692         /*
1693          * Write as many of the items to the root page as fit. In segments of max
1694          * GinPostingListSegmentMaxSize bytes each.
1695          */
1696         nrootitems = 0;
1697         rootsize = 0;
1698         ptr = (Pointer) GinDataLeafPageGetPostingList(tmppage);
1699         while (nrootitems < nitems)
1700         {
1701                 GinPostingList *segment;
1702                 int                     npacked;
1703                 int                     segsize;
1704
1705                 segment = ginCompressPostingList(&items[nrootitems],
1706                                                                                  nitems - nrootitems,
1707                                                                                  GinPostingListSegmentMaxSize,
1708                                                                                  &npacked);
1709                 segsize = SizeOfGinPostingList(segment);
1710                 if (rootsize + segsize > GinDataPageMaxDataSize)
1711                         break;
1712
1713                 memcpy(ptr, segment, segsize);
1714                 ptr += segsize;
1715                 rootsize += segsize;
1716                 nrootitems += npacked;
1717                 pfree(segment);
1718         }
1719         GinDataPageSetDataSize(tmppage, rootsize);
1720
1721         /*
1722          * All set. Get a new physical page, and copy the in-memory page to it.
1723          */
1724         buffer = GinNewBuffer(index);
1725         page = BufferGetPage(buffer);
1726         blkno = BufferGetBlockNumber(buffer);
1727
1728         START_CRIT_SECTION();
1729
1730         PageRestoreTempPage(tmppage, page);
1731         MarkBufferDirty(buffer);
1732
1733         if (RelationNeedsWAL(index))
1734         {
1735                 XLogRecPtr      recptr;
1736                 ginxlogCreatePostingTree data;
1737
1738                 data.size = rootsize;
1739
1740                 XLogBeginInsert();
1741                 XLogRegisterData((char *) &data, sizeof(ginxlogCreatePostingTree));
1742
1743                 XLogRegisterData((char *) GinDataLeafPageGetPostingList(page),
1744                                                  rootsize);
1745                 XLogRegisterBuffer(0, buffer, REGBUF_WILL_INIT);
1746
1747                 recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_PTREE);
1748                 PageSetLSN(page, recptr);
1749         }
1750
1751         UnlockReleaseBuffer(buffer);
1752
1753         END_CRIT_SECTION();
1754
1755         /* During index build, count the newly-added data page */
1756         if (buildStats)
1757                 buildStats->nDataPages++;
1758
1759         elog(DEBUG2, "created GIN posting tree with %d items", nrootitems);
1760
1761         /*
1762          * Add any remaining TIDs to the newly-created posting tree.
1763          */
1764         if (nitems > nrootitems)
1765         {
1766                 ginInsertItemPointers(index, blkno,
1767                                                           items + nrootitems,
1768                                                           nitems - nrootitems,
1769                                                           buildStats);
1770         }
1771
1772         return blkno;
1773 }
1774
1775 void
1776 ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno)
1777 {
1778         memset(btree, 0, sizeof(GinBtreeData));
1779
1780         btree->index = index;
1781         btree->rootBlkno = rootBlkno;
1782
1783         btree->findChildPage = dataLocateItem;
1784         btree->getLeftMostChild = dataGetLeftMostPage;
1785         btree->isMoveRight = dataIsMoveRight;
1786         btree->findItem = NULL;
1787         btree->findChildPtr = dataFindChildPtr;
1788         btree->placeToPage = dataPlaceToPage;
1789         btree->fillRoot = ginDataFillRoot;
1790         btree->prepareDownlink = dataPrepareDownlink;
1791
1792         btree->isData = TRUE;
1793         btree->fullScan = FALSE;
1794         btree->isBuild = FALSE;
1795 }
1796
1797 /*
1798  * Inserts array of item pointers, may execute several tree scan (very rare)
1799  */
1800 void
1801 ginInsertItemPointers(Relation index, BlockNumber rootBlkno,
1802                                           ItemPointerData *items, uint32 nitem,
1803                                           GinStatsData *buildStats)
1804 {
1805         GinBtreeData btree;
1806         GinBtreeDataLeafInsertData insertdata;
1807         GinBtreeStack *stack;
1808
1809         ginPrepareDataScan(&btree, index, rootBlkno);
1810         btree.isBuild = (buildStats != NULL);
1811         insertdata.items = items;
1812         insertdata.nitem = nitem;
1813         insertdata.curitem = 0;
1814
1815         while (insertdata.curitem < insertdata.nitem)
1816         {
1817                 /* search for the leaf page where the first item should go to */
1818                 btree.itemptr = insertdata.items[insertdata.curitem];
1819                 stack = ginFindLeafPage(&btree, false);
1820
1821                 ginInsertValue(&btree, stack, &insertdata, buildStats);
1822         }
1823 }
1824
1825 /*
1826  * Starts a new scan on a posting tree.
1827  */
1828 GinBtreeStack *
1829 ginScanBeginPostingTree(GinBtree btree, Relation index, BlockNumber rootBlkno)
1830 {
1831         GinBtreeStack *stack;
1832
1833         ginPrepareDataScan(btree, index, rootBlkno);
1834
1835         btree->fullScan = TRUE;
1836
1837         stack = ginFindLeafPage(btree, TRUE);
1838
1839         return stack;
1840 }