1 /*-------------------------------------------------------------------------
4 * WAL replay logic for inverted index.
7 * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
11 * src/backend/access/gin/ginxlog.c
12 *-------------------------------------------------------------------------
16 #include "access/gin_private.h"
17 #include "access/xlogutils.h"
18 #include "utils/memutils.h"
/*
 * File-scope memory context handle for replay work.  gin_xlog_startup()
 * creates the context, gin_redo() switches into it while a record is being
 * replayed and resets it afterwards, and gin_xlog_cleanup() deletes it.
 */
20 static MemoryContext opCtx; /* working memory for operations */
23 ginRedoClearIncompleteSplit(XLogReaderState *record, uint8 block_id)
25 XLogRecPtr lsn = record->EndRecPtr;
29 if (XLogReadBufferForRedo(record, block_id, &buffer) == BLK_NEEDS_REDO)
31 page = (Page) BufferGetPage(buffer);
32 GinPageGetOpaque(page)->flags &= ~GIN_INCOMPLETE_SPLIT;
34 PageSetLSN(page, lsn);
35 MarkBufferDirty(buffer);
37 if (BufferIsValid(buffer))
38 UnlockReleaseBuffer(buffer);
42 ginRedoCreateIndex(XLogReaderState *record)
44 XLogRecPtr lsn = record->EndRecPtr;
49 MetaBuffer = XLogInitBufferForRedo(record, 0);
50 Assert(BufferGetBlockNumber(MetaBuffer) == GIN_METAPAGE_BLKNO);
51 page = (Page) BufferGetPage(MetaBuffer);
53 GinInitMetabuffer(MetaBuffer);
55 PageSetLSN(page, lsn);
56 MarkBufferDirty(MetaBuffer);
58 RootBuffer = XLogInitBufferForRedo(record, 1);
59 Assert(BufferGetBlockNumber(RootBuffer) == GIN_ROOT_BLKNO);
60 page = (Page) BufferGetPage(RootBuffer);
62 GinInitBuffer(RootBuffer, GIN_LEAF);
64 PageSetLSN(page, lsn);
65 MarkBufferDirty(RootBuffer);
67 UnlockReleaseBuffer(RootBuffer);
68 UnlockReleaseBuffer(MetaBuffer);
72 ginRedoCreatePTree(XLogReaderState *record)
74 XLogRecPtr lsn = record->EndRecPtr;
75 ginxlogCreatePostingTree *data = (ginxlogCreatePostingTree *) XLogRecGetData(record);
80 buffer = XLogInitBufferForRedo(record, 0);
81 page = (Page) BufferGetPage(buffer);
83 GinInitBuffer(buffer, GIN_DATA | GIN_LEAF | GIN_COMPRESSED);
85 ptr = XLogRecGetData(record) + sizeof(ginxlogCreatePostingTree);
88 memcpy(GinDataLeafPageGetPostingList(page), ptr, data->size);
90 GinDataPageSetDataSize(page, data->size);
92 PageSetLSN(page, lsn);
94 MarkBufferDirty(buffer);
95 UnlockReleaseBuffer(buffer);
/*
 * ginRedoInsertEntry: redo the entry-tree part of an insert on the page held
 * in 'buffer'.
 *
 * 'rdata' is interpreted as a ginxlogInsertEntry; its 'offset' field selects
 * the slot that is worked on.  'rightblkno' is the new right sibling of a
 * split child, or InvalidBlockNumber.  'isLeaf' is not referenced by any line
 * visible here.
 *
 * NOTE(review): this listing appears to have lost its short lines (braces,
 * return type, some declarations and at least one condition).  The comments
 * below describe only what the visible statements do.
 */
99 ginRedoInsertEntry(Buffer buffer, bool isLeaf, BlockNumber rightblkno, void *rdata)
101 Page page = BufferGetPage(buffer);
102 ginxlogInsertEntry *data = (ginxlogInsertEntry *) rdata;
103 OffsetNumber offset = data->offset;
/*
 * A valid rightblkno: fetch the existing tuple at 'offset' on this (non-leaf)
 * page and repoint its downlink to rightblkno.
 */
106 if (rightblkno != InvalidBlockNumber)
108 /* update link to right page after split */
109 Assert(!GinPageIsLeaf(page));
110 Assert(offset >= FirstOffsetNumber && offset <= PageGetMaxOffsetNumber(page));
111 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offset));
112 GinSetDownlink(itup, rightblkno);
/*
 * NOTE(review): the next three statements assert a leaf page and delete the
 * tuple at 'offset'.  They contradict the Assert(!GinPageIsLeaf) above, so
 * they presumably sit under a separate condition that is not visible in this
 * view -- confirm against the full file.
 */
117 Assert(GinPageIsLeaf(page));
118 Assert(offset >= FirstOffsetNumber && offset <= PageGetMaxOffsetNumber(page));
119 PageIndexTupleDelete(page, offset);
/*
 * Add the tuple at 'offset'.  NOTE(review): the statement that points 'itup'
 * at the tuple to insert is not visible here.  On failure the error message
 * reports the spcNode/dbNode/relNode obtained from the buffer tag.
 */
124 if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), offset, false, false) == InvalidOffsetNumber)
130 BufferGetTag(buffer, &node, &forknum, &blknum);
131 elog(ERROR, "failed to add item to index page in %u/%u/%u",
132 node.spcNode, node.dbNode, node.relNode);
/*
 * ginRedoRecompress: apply the segment-level actions stored after the
 * ginxlogRecompressDataLeaf header 'data' to the posting-list area of 'page'.
 *
 * Visible flow:
 *	- A page not yet flagged as compressed is converted first: its item
 *	  pointers are compressed into one posting list which is copied over the
 *	  data area, the data size is set and the page is flagged compressed.
 *	- data->nactions actions are then read from the byte stream.  Each starts
 *	  with a one-byte segment number and a one-byte action code.  INSERT and
 *	  REPLACE are followed by a posting-list segment (the stream pointer is
 *	  advanced by its SHORTALIGN'ed size); ADDITEMS is followed by a uint16
 *	  count and that many item pointers.
 *	- ADDITEMS is converted into REPLACE by decoding the old segment, merging
 *	  in the new items and compressing the result.
 *	- Segments are moved with memmove/memcpy while 'segmentend' tracks the new
 *	  end of the posting-list area; the page's data size is finally derived
 *	  from it.
 *
 * NOTE(review): this listing appears to have lost its short lines.  Not
 * visible here: any statement that initializes or advances 'segno', the
 * switch header with its break/default lines, the value given to 'segsize'
 * when positioned after the last segment, several declarations, and the
 * trailing arguments of some multi-line calls.  Confirm against the full
 * file before changing anything.
 */
137 ginRedoRecompress(Page page, ginxlogRecompressDataLeaf *data)
141 GinPostingList *oldseg;
147 * If the page is in pre-9.4 format, convert to new format first.
149 if (!GinPageIsCompressed(page))
151 ItemPointer uncompressed = (ItemPointer) GinDataPageGetData(page);
152 int nuncompressed = GinPageGetOpaque(page)->maxoff;
154 GinPostingList *plist;
156 plist = ginCompressPostingList(uncompressed, nuncompressed,
158 Assert(npacked == nuncompressed);
160 totalsize = SizeOfGinPostingList(plist);
162 memcpy(GinDataLeafPageGetPostingList(page), plist, totalsize);
163 GinDataPageSetDataSize(page, totalsize);
164 GinPageSetCompressed(page);
165 GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber;
/* oldseg = first segment on the page; segmentend = one past the last one */
168 oldseg = GinDataLeafPageGetPostingList(page);
169 segmentend = (Pointer) oldseg + GinDataLeafPageGetPostingListSize(page);
/* the action stream starts right behind the fixed-size record header */
172 walbuf = ((char *) data) + sizeof(ginxlogRecompressDataLeaf);
173 for (actionno = 0; actionno < data->nactions; actionno++)
175 uint8 a_segno = *((uint8 *) (walbuf++));
176 uint8 a_action = *((uint8 *) (walbuf++));
177 GinPostingList *newseg = NULL;
179 ItemPointerData *items = NULL;
181 ItemPointerData *olditems;
183 ItemPointerData *newitems;
189 /* Extract all the information we need from the WAL record */
190 if (a_action == GIN_SEGMENT_INSERT ||
191 a_action == GIN_SEGMENT_REPLACE)
193 newseg = (GinPostingList *) walbuf;
194 newsegsize = SizeOfGinPostingList(newseg);
195 walbuf += SHORTALIGN(newsegsize);
198 if (a_action == GIN_SEGMENT_ADDITEMS)
/* nitems is fetched with memcpy rather than through a cast of walbuf */
200 memcpy(&nitems, walbuf, sizeof(uint16));
201 walbuf += sizeof(uint16);
202 items = (ItemPointerData *) walbuf;
203 walbuf += nitems * sizeof(ItemPointerData);
206 /* Skip to the segment that this action concerns */
207 Assert(segno <= a_segno);
208 while (segno < a_segno)
210 oldseg = GinNextPostingListSegment(oldseg);
215 * ADDITEMS action is handled like REPLACE, but the new segment to
216 * replace the old one is reconstructed using the old segment from
217 * disk and the new items from the WAL record.
219 if (a_action == GIN_SEGMENT_ADDITEMS)
223 olditems = ginPostingListDecode(oldseg, &nolditems);
225 newitems = ginMergeItemPointers(items, nitems,
228 Assert(nnewitems == nolditems + nitems);
230 newseg = ginCompressPostingList(newitems, nnewitems,
232 Assert(npacked == nnewitems);
234 newsegsize = SizeOfGinPostingList(newseg);
235 a_action = GIN_SEGMENT_REPLACE;
/*
 * segptr = byte address of the segment being acted on; szleft = number of
 * bytes from there to the end of the posting-list area.
 */
238 segptr = (Pointer) oldseg;
239 if (segptr != segmentend)
240 segsize = SizeOfGinPostingList(oldseg);
244 * Positioned after the last existing segment. Only INSERTs
247 Assert(a_action == GIN_SEGMENT_INSERT);
250 szleft = segmentend - segptr;
/* DELETE: close the gap left by the old segment */
254 case GIN_SEGMENT_DELETE:
255 memmove(segptr, segptr + segsize, szleft - segsize);
256 segmentend -= segsize;
261 case GIN_SEGMENT_INSERT:
262 /* make room for the new segment */
263 memmove(segptr + newsegsize, segptr, szleft);
264 /* copy the new segment in place */
265 memcpy(segptr, newseg, newsegsize);
266 segmentend += newsegsize;
267 segptr += newsegsize;
270 case GIN_SEGMENT_REPLACE:
271 /* shift the segments that follow */
272 memmove(segptr + newsegsize,
275 /* copy the replacement segment in place */
276 memcpy(segptr, newseg, newsegsize);
277 segmentend -= segsize;
278 segmentend += newsegsize;
279 segptr += newsegsize;
284 elog(ERROR, "unexpected GIN leaf action: %u", a_action);
/* the next action continues from wherever segptr ended up */
286 oldseg = (GinPostingList *) segptr;
/* publish the new length of the posting-list area on the page */
289 totalsize = segmentend - (Pointer) GinDataLeafPageGetPostingList(page);
290 GinDataPageSetDataSize(page, totalsize);
/*
 * ginRedoInsertData: redo the posting-tree part of an insert on the page held
 * in 'buffer'.
 *
 * Two treatments of 'rdata' are visible:
 *	- as a ginxlogRecompressDataLeaf, handed to ginRedoRecompress() after
 *	  asserting that the page is a leaf;
 *	- as a ginxlogInsertDataInternal, after asserting that the page is not a
 *	  leaf: the posting item at data->offset is pointed at 'rightblkno' and
 *	  data->newitem is added at the same offset.
 *
 * NOTE(review): the condition choosing between the two is not visible in
 * this listing (short lines appear to be missing); presumably it tests
 * 'isLeaf', which is otherwise unused -- confirm against the full file.
 */
294 ginRedoInsertData(Buffer buffer, bool isLeaf, BlockNumber rightblkno, void *rdata)
296 Page page = BufferGetPage(buffer);
300 ginxlogRecompressDataLeaf *data = (ginxlogRecompressDataLeaf *) rdata;
302 Assert(GinPageIsLeaf(page));
304 ginRedoRecompress(page, data);
308 ginxlogInsertDataInternal *data = (ginxlogInsertDataInternal *) rdata;
309 PostingItem *oldpitem;
311 Assert(!GinPageIsLeaf(page));
313 /* update link to right page after split */
314 oldpitem = GinDataPageGetPostingItem(page, data->offset);
315 PostingItemSetBlockNumber(oldpitem, rightblkno);
317 GinDataPageAddPostingItem(page, &data->newitem, data->offset);
/*
 * ginRedoInsert: replay an insert record.
 *
 * Visible flow:
 *	- isLeaf is taken from GIN_INSERT_ISLEAF in the record's flags.
 *	- Two block numbers (left and right child) are read from the main data
 *	  directly behind the ginxlogInsert header, and the incomplete-split flag
 *	  is cleared on block 1 via ginRedoClearIncompleteSplit().
 *	- If block 0 needs redo, its block data is handed to ginRedoInsertData()
 *	  when GIN_INSERT_ISDATA is set and to ginRedoInsertEntry() otherwise;
 *	  the page then gets the record's end LSN and is marked dirty.
 *	- The block-0 buffer is released if valid.
 *
 * NOTE(review): short lines appear to be missing from this listing.  The
 * guard around the child-page handling, the 'else' between the two tree-type
 * branches and some declarations are not visible.  Per the comment fragment
 * below, the child handling applies only when the insert finishes a split --
 * confirm the exact condition against the full file.
 */
322 ginRedoInsert(XLogReaderState *record)
324 XLogRecPtr lsn = record->EndRecPtr;
325 ginxlogInsert *data = (ginxlogInsert *) XLogRecGetData(record);
328 BlockNumber leftChildBlkno = InvalidBlockNumber;
330 BlockNumber rightChildBlkno = InvalidBlockNumber;
331 bool isLeaf = (data->flags & GIN_INSERT_ISLEAF) != 0;
334 * First clear incomplete-split flag on child page if this finishes a
339 char *payload = XLogRecGetData(record) + sizeof(ginxlogInsert);
/* leftChildBlkno is assigned here but not read by any visible line */
342 leftChildBlkno = BlockIdGetBlockNumber((BlockId) payload);
344 payload += sizeof(BlockIdData);
345 rightChildBlkno = BlockIdGetBlockNumber((BlockId) payload);
346 payload += sizeof(BlockIdData);
348 ginRedoClearIncompleteSplit(record, 1);
351 if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
353 Page page = BufferGetPage(buffer);
/* this payload is the data attached to block 0, not the main data above */
355 char *payload = XLogRecGetBlockData(record, 0, &len);
357 /* How to insert the payload is tree-type specific */
358 if (data->flags & GIN_INSERT_ISDATA)
360 Assert(GinPageIsData(page));
361 ginRedoInsertData(buffer, isLeaf, rightChildBlkno, payload);
365 Assert(!GinPageIsData(page));
366 ginRedoInsertEntry(buffer, isLeaf, rightChildBlkno, payload);
369 PageSetLSN(page, lsn);
370 MarkBufferDirty(buffer);
372 if (BufferIsValid(buffer))
373 UnlockReleaseBuffer(buffer);
/*
 * ginRedoSplit: replay a page split.
 *
 * Visible flow: the incomplete-split flag is cleared on block 3, then blocks
 * 0 (left page), 1 (right page) and 2 (root page) are read.  Each of them
 * must come back as BLK_RESTORED, i.e. from a full-page image; anything else
 * raises an ERROR.  The root buffer is released right after it is read, the
 * right and left buffers at the end.
 *
 * NOTE(review): short lines appear to be missing from this listing.  The
 * conditions that use 'isLeaf' and 'isRoot' are not visible; presumably the
 * block-3 call and the block-2 handling are each guarded by one of them --
 * confirm against the full file.
 */
377 ginRedoSplit(XLogReaderState *record)
379 ginxlogSplit *data = (ginxlogSplit *) XLogRecGetData(record);
383 bool isLeaf = (data->flags & GIN_INSERT_ISLEAF) != 0;
384 bool isRoot = (data->flags & GIN_SPLIT_ROOT) != 0;
387 * First clear incomplete-split flag on child page if this finishes a
391 ginRedoClearIncompleteSplit(record, 3);
393 if (XLogReadBufferForRedo(record, 0, &lbuffer) != BLK_RESTORED)
394 elog(ERROR, "GIN split record did not contain a full-page image of left page");
396 if (XLogReadBufferForRedo(record, 1, &rbuffer) != BLK_RESTORED)
397 elog(ERROR, "GIN split record did not contain a full-page image of right page");
401 if (XLogReadBufferForRedo(record, 2, &rootbuf) != BLK_RESTORED)
402 elog(ERROR, "GIN split record did not contain a full-page image of root page");
403 UnlockReleaseBuffer(rootbuf);
406 UnlockReleaseBuffer(rbuffer);
407 UnlockReleaseBuffer(lbuffer);
411 * VACUUM_PAGE record contains simply a full image of the page, similar to
415 ginRedoVacuumPage(XLogReaderState *record)
419 if (XLogReadBufferForRedo(record, 0, &buffer) != BLK_RESTORED)
421 elog(ERROR, "replay of gin entry tree page vacuum did not restore the page");
423 UnlockReleaseBuffer(buffer);
427 ginRedoVacuumDataLeafPage(XLogReaderState *record)
429 XLogRecPtr lsn = record->EndRecPtr;
432 if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
434 Page page = BufferGetPage(buffer);
436 ginxlogVacuumDataLeafPage *xlrec;
438 xlrec = (ginxlogVacuumDataLeafPage *) XLogRecGetBlockData(record, 0, &len);
440 Assert(GinPageIsLeaf(page));
441 Assert(GinPageIsData(page));
443 ginRedoRecompress(page, &xlrec->data);
444 PageSetLSN(page, lsn);
445 MarkBufferDirty(buffer);
447 if (BufferIsValid(buffer))
448 UnlockReleaseBuffer(buffer);
452 ginRedoDeletePage(XLogReaderState *record)
454 XLogRecPtr lsn = record->EndRecPtr;
455 ginxlogDeletePage *data = (ginxlogDeletePage *) XLogRecGetData(record);
461 if (XLogReadBufferForRedo(record, 0, &dbuffer) == BLK_NEEDS_REDO)
463 page = BufferGetPage(dbuffer);
464 Assert(GinPageIsData(page));
465 GinPageGetOpaque(page)->flags = GIN_DELETED;
466 PageSetLSN(page, lsn);
467 MarkBufferDirty(dbuffer);
470 if (XLogReadBufferForRedo(record, 1, &pbuffer) == BLK_NEEDS_REDO)
472 page = BufferGetPage(pbuffer);
473 Assert(GinPageIsData(page));
474 Assert(!GinPageIsLeaf(page));
475 GinPageDeletePostingItem(page, data->parentOffset);
476 PageSetLSN(page, lsn);
477 MarkBufferDirty(pbuffer);
480 if (XLogReadBufferForRedo(record, 2, &lbuffer) == BLK_NEEDS_REDO)
482 page = BufferGetPage(lbuffer);
483 Assert(GinPageIsData(page));
484 GinPageGetOpaque(page)->rightlink = data->rightLink;
485 PageSetLSN(page, lsn);
486 MarkBufferDirty(lbuffer);
489 if (BufferIsValid(lbuffer))
490 UnlockReleaseBuffer(lbuffer);
491 if (BufferIsValid(pbuffer))
492 UnlockReleaseBuffer(pbuffer);
493 if (BufferIsValid(dbuffer))
494 UnlockReleaseBuffer(dbuffer);
498 ginRedoUpdateMetapage(XLogReaderState *record)
500 XLogRecPtr lsn = record->EndRecPtr;
501 ginxlogUpdateMeta *data = (ginxlogUpdateMeta *) XLogRecGetData(record);
507 * Restore the metapage. This is essentially the same as a full-page
508 * image, so restore the metapage unconditionally without looking at the
509 * LSN, to avoid torn page hazards.
511 metabuffer = XLogInitBufferForRedo(record, 0);
512 Assert(BufferGetBlockNumber(metabuffer) == GIN_METAPAGE_BLKNO);
513 metapage = BufferGetPage(metabuffer);
515 memcpy(GinPageGetMeta(metapage), &data->metadata, sizeof(GinMetaPageData));
516 PageSetLSN(metapage, lsn);
517 MarkBufferDirty(metabuffer);
519 if (data->ntuples > 0)
522 * insert into tail page
524 if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
526 Page page = BufferGetPage(buffer);
534 payload = XLogRecGetBlockData(record, 1, &totaltupsize);
535 tuples = (IndexTuple) payload;
537 if (PageIsEmpty(page))
538 off = FirstOffsetNumber;
540 off = OffsetNumberNext(PageGetMaxOffsetNumber(page));
542 for (i = 0; i < data->ntuples; i++)
544 tupsize = IndexTupleSize(tuples);
546 if (PageAddItem(page, (Item) tuples, tupsize, off,
547 false, false) == InvalidOffsetNumber)
548 elog(ERROR, "failed to add item to index page");
550 tuples = (IndexTuple) (((char *) tuples) + tupsize);
554 Assert(payload + totaltupsize == (char *) tuples);
557 * Increase counter of heap tuples
559 GinPageGetOpaque(page)->maxoff++;
561 PageSetLSN(page, lsn);
562 MarkBufferDirty(buffer);
564 if (BufferIsValid(buffer))
565 UnlockReleaseBuffer(buffer);
567 else if (data->prevTail != InvalidBlockNumber)
572 if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
574 Page page = BufferGetPage(buffer);
576 GinPageGetOpaque(page)->rightlink = data->newRightlink;
578 PageSetLSN(page, lsn);
579 MarkBufferDirty(buffer);
581 if (BufferIsValid(buffer))
582 UnlockReleaseBuffer(buffer);
585 UnlockReleaseBuffer(metabuffer);
/*
 * ginRedoInsertListPage: replay the write of one GIN_LIST page.
 *
 * Visible flow: block 0 is always re-initialized as a GIN_LIST page and its
 * rightlink is set from the record.  When the rightlink is
 * InvalidBlockNumber the page is additionally flagged "full row".  The
 * data->ntuples index tuples packed back to back in the block data are then
 * added with PageAddItem(), and the page is stamped with the record's end
 * LSN, marked dirty and released.
 *
 * NOTE(review): short lines appear to be missing from this listing.  Both
 * "maxoff = 1" and "maxoff = 0" are visible without the 'else' that
 * presumably separates them, and no statement advancing 'off' inside the
 * loop is visible.  Confirm against the full file.
 */
589 ginRedoInsertListPage(XLogReaderState *record)
591 XLogRecPtr lsn = record->EndRecPtr;
592 ginxlogInsertListPage *data = (ginxlogInsertListPage *) XLogRecGetData(record);
596 off = FirstOffsetNumber;
603 /* We always re-initialize the page. */
604 buffer = XLogInitBufferForRedo(record, 0);
605 page = BufferGetPage(buffer);
607 GinInitBuffer(buffer, GIN_LIST);
608 GinPageGetOpaque(page)->rightlink = data->rightlink;
609 if (data->rightlink == InvalidBlockNumber)
611 /* tail of sublist */
612 GinPageSetFullRow(page);
613 GinPageGetOpaque(page)->maxoff = 1;
617 GinPageGetOpaque(page)->maxoff = 0;
/* the tuples to add are the data attached to block 0 */
620 payload = XLogRecGetBlockData(record, 0, &totaltupsize);
622 tuples = (IndexTuple) payload;
623 for (i = 0; i < data->ntuples; i++)
625 tupsize = IndexTupleSize(tuples);
627 l = PageAddItem(page, (Item) tuples, tupsize, off, false, false);
629 if (l == InvalidOffsetNumber)
630 elog(ERROR, "failed to add item to index page");
/* step to the next tuple, which starts right behind this one */
632 tuples = (IndexTuple) (((char *) tuples) + tupsize);
/* sanity check: exactly the whole block data must have been consumed */
635 Assert((char *) tuples == payload + totaltupsize);
637 PageSetLSN(page, lsn);
638 MarkBufferDirty(buffer);
640 UnlockReleaseBuffer(buffer);
644 ginRedoDeleteListPages(XLogReaderState *record)
646 XLogRecPtr lsn = record->EndRecPtr;
647 ginxlogDeleteListPages *data = (ginxlogDeleteListPages *) XLogRecGetData(record);
652 metabuffer = XLogInitBufferForRedo(record, 0);
653 Assert(BufferGetBlockNumber(metabuffer) == GIN_METAPAGE_BLKNO);
654 metapage = BufferGetPage(metabuffer);
656 GinInitPage(metapage, GIN_META, BufferGetPageSize(metabuffer));
658 memcpy(GinPageGetMeta(metapage), &data->metadata, sizeof(GinMetaPageData));
659 PageSetLSN(metapage, lsn);
660 MarkBufferDirty(metabuffer);
663 * In normal operation, shiftList() takes exclusive lock on all the
664 * pages-to-be-deleted simultaneously. During replay, however, it should
665 * be all right to lock them one at a time. This is dependent on the fact
666 * that we are deleting pages from the head of the list, and that readers
667 * share-lock the next page before releasing the one they are on. So we
668 * cannot get past a reader that is on, or due to visit, any page we are
669 * going to delete. New incoming readers will block behind our metapage
670 * lock and then see a fully updated page list.
672 * No full-page images are taken of the deleted pages. Instead, they are
673 * re-initialized as empty, deleted pages. Their right-links don't need to
674 * be preserved, because no new readers can see the pages, as explained
677 for (i = 0; i < data->ndeleted; i++)
682 buffer = XLogInitBufferForRedo(record, i + 1);
683 page = BufferGetPage(buffer);
684 GinInitBuffer(buffer, GIN_DELETED);
686 PageSetLSN(page, lsn);
687 MarkBufferDirty(buffer);
689 UnlockReleaseBuffer(buffer);
691 UnlockReleaseBuffer(metabuffer);
/*
 * gin_redo: replay one GIN WAL record.
 *
 * The record type is the info byte with the XLR_INFO_MASK bits removed.  The
 * matching ginRedo* routine runs with opCtx as the current memory context;
 * afterwards the previous context is restored and opCtx is reset.  An
 * unrecognized type is reported with elog(PANIC).
 *
 * NOTE(review): short lines appear to be missing from this listing.  The
 * switch header, the break statements, the default label and the case label
 * in front of the ginRedoSplit() call are not visible.  Confirm against the
 * full file.
 */
695 gin_redo(XLogReaderState *record)
697 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
698 MemoryContext oldCtx;
701 * GIN indexes do not require any conflict processing. NB: If we ever
702 * implement a similar optimization as we have in b-tree, and remove
703 * killed tuples outside VACUUM, we'll need to handle that here.
/* allocations made during replay of this record land in opCtx */
706 oldCtx = MemoryContextSwitchTo(opCtx);
709 case XLOG_GIN_CREATE_INDEX:
710 ginRedoCreateIndex(record);
712 case XLOG_GIN_CREATE_PTREE:
713 ginRedoCreatePTree(record);
715 case XLOG_GIN_INSERT:
716 ginRedoInsert(record);
719 ginRedoSplit(record);
721 case XLOG_GIN_VACUUM_PAGE:
722 ginRedoVacuumPage(record);
724 case XLOG_GIN_VACUUM_DATA_LEAF_PAGE:
725 ginRedoVacuumDataLeafPage(record);
727 case XLOG_GIN_DELETE_PAGE:
728 ginRedoDeletePage(record);
730 case XLOG_GIN_UPDATE_META_PAGE:
731 ginRedoUpdateMetapage(record);
733 case XLOG_GIN_INSERT_LISTPAGE:
734 ginRedoInsertListPage(record);
736 case XLOG_GIN_DELETE_LISTPAGE:
737 ginRedoDeleteListPages(record);
740 elog(PANIC, "gin_redo: unknown op code %u", info);
/* restore the caller's context, then drop everything allocated in opCtx */
742 MemoryContextSwitchTo(oldCtx);
743 MemoryContextReset(opCtx);
747 gin_xlog_startup(void)
749 opCtx = AllocSetContextCreate(CurrentMemoryContext,
750 "GIN recovery temporary context",
751 ALLOCSET_DEFAULT_MINSIZE,
752 ALLOCSET_DEFAULT_INITSIZE,
753 ALLOCSET_DEFAULT_MAXSIZE);
757 gin_xlog_cleanup(void)
759 MemoryContextDelete(opCtx);