/*-------------------------------------------------------------------------
 *
 * nbtxlog.c
 *	  WAL replay logic for btrees.
 *
 * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtxlog.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/heapam_xlog.h"
#include "access/nbtree.h"
#include "access/transam.h"
#include "miscadmin.h"
#include "storage/procarray.h"
/*
 * _bt_restore_page -- re-enter all the index tuples on a page
 *
 * The page is freshly init'd, and *from (length len) is a copy of what
 * had been its upper part (pd_upper to pd_special).  We assume that the
 * tuples had been added to the page in item-number order, and therefore
 * the one with highest item number appears first (lowest on the page).
 */
static void
_bt_restore_page(Page page, char *from, int len)
{
	IndexTupleData itupdata;
	Size		itemsz;
	char	   *end = from + len;
	Item		items[MaxIndexTuplesPerPage];
	uint16		itemsizes[MaxIndexTuplesPerPage];
	int			i;
	int			nitems;

	/*
	 * To get the items back in the original order, we add them to the page
	 * in reverse.  To figure out where one tuple ends and another begins,
	 * we have to scan them in forward order first.
	 */
	i = 0;
	while (from < end)
	{
		/* Need to copy tuple header due to alignment considerations */
		memcpy(&itupdata, from, sizeof(IndexTupleData));
		itemsz = IndexTupleDSize(itupdata);
		itemsz = MAXALIGN(itemsz);

		items[i] = (Item) from;
		itemsizes[i] = itemsz;
		i++;

		from += itemsz;
	}
	nitems = i;

	for (i = nitems - 1; i >= 0; i--)
	{
		if (PageAddItem(page, items[i], itemsizes[i], nitems - i,
						false, false) == InvalidOffsetNumber)
			elog(PANIC, "_bt_restore_page: cannot add item to page");
	}
}
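/*
 * Illustrative example for _bt_restore_page() (hypothetical tuple names,
 * not from the original comments): if tuples t1, t2, t3 were added to the
 * original page in that item-number order, the copied upper part begins
 * with t3 (highest item number, lowest address).  The forward scan records
 * the three tuple boundaries, and the backward PageAddItem loop then
 * re-adds t1 at offset 1, t2 at offset 2, and t3 at offset 3, reproducing
 * both the item numbers and the physical layout.
 */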
static void
_bt_restore_meta(RelFileNode rnode, XLogRecPtr lsn,
				 BlockNumber root, uint32 level,
				 BlockNumber fastroot, uint32 fastlevel)
{
	Buffer		metabuf;
	Page		metapg;
	BTMetaPageData *md;
	BTPageOpaque pageop;

	metabuf = XLogReadBuffer(rnode, BTREE_METAPAGE, true);
	Assert(BufferIsValid(metabuf));
	metapg = BufferGetPage(metabuf);

	_bt_pageinit(metapg, BufferGetPageSize(metabuf));

	md = BTPageGetMeta(metapg);
	md->btm_magic = BTREE_MAGIC;
	md->btm_version = BTREE_VERSION;
	md->btm_root = root;
	md->btm_level = level;
	md->btm_fastroot = fastroot;
	md->btm_fastlevel = fastlevel;

	pageop = (BTPageOpaque) PageGetSpecialPointer(metapg);
	pageop->btpo_flags = BTP_META;

	/*
	 * Set pd_lower just past the end of the metadata.  This is not
	 * essential but it makes the page look compressible to xlog.c.
	 */
	((PageHeader) metapg)->pd_lower =
		((char *) md + sizeof(BTMetaPageData)) - (char *) metapg;

	PageSetLSN(metapg, lsn);
	MarkBufferDirty(metabuf);
	UnlockReleaseBuffer(metabuf);
}
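/*
 * Note that _bt_restore_meta() rebuilds the metapage from scratch
 * (XLogReadBuffer with init = true), so no LSN interlock against the old
 * contents is needed.  For example, btree_xlog_newroot() below restores
 * the metapage with the new root block as both root and fastroot.
 */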
/*
 * _bt_clear_incomplete_split -- clear INCOMPLETE_SPLIT flag on a page
 *
 * This is a common subroutine of the redo functions of all the WAL record
 * types that can insert a downlink: insert, split, and newroot.
 */
static void
_bt_clear_incomplete_split(XLogRecPtr lsn, XLogRecord *record,
						   RelFileNode rnode, BlockNumber cblock)
{
	Buffer		buf;

	buf = XLogReadBuffer(rnode, cblock, false);
	if (BufferIsValid(buf))
	{
		Page		page = (Page) BufferGetPage(buf);

		if (lsn > PageGetLSN(page))
		{
			BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);

			Assert((pageop->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0);
			pageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;

			PageSetLSN(page, lsn);
			MarkBufferDirty(buf);
		}
		UnlockReleaseBuffer(buf);
	}
}
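/*
 * The lsn > PageGetLSN(page) test in _bt_clear_incomplete_split() is the
 * usual replay idempotency check: if the child page's LSN already shows
 * that this record has been applied (for instance because replay restarted
 * after a crash), the flag has already been cleared and the update is
 * skipped.
 */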
static void
btree_xlog_insert(bool isleaf, bool ismeta,
				  XLogRecPtr lsn, XLogRecord *record)
{
	xl_btree_insert *xlrec = (xl_btree_insert *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;
	char	   *datapos;
	int			datalen;
	xl_btree_metadata md;
	BlockNumber cblkno = 0;
	int			main_blk_index;

	datapos = (char *) xlrec + SizeOfBtreeInsert;
	datalen = record->xl_len - SizeOfBtreeInsert;

	/*
	 * If this insert finishes a split at a lower level, extract the block
	 * number of the (left) child.
	 */
	if (!isleaf && (record->xl_info & XLR_BKP_BLOCK(0)) == 0)
	{
		memcpy(&cblkno, datapos, sizeof(BlockNumber));
		datapos += sizeof(BlockNumber);
		datalen -= sizeof(BlockNumber);
	}
	if (ismeta)
	{
		memcpy(&md, datapos, sizeof(xl_btree_metadata));
		datapos += sizeof(xl_btree_metadata);
		datalen -= sizeof(xl_btree_metadata);
	}
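	/*
	 * At this point the record body has been consumed as:
	 *
	 *		xl_btree_insert header
	 *		[left child's BlockNumber, if non-leaf and block 0 not backed up]
	 *		[xl_btree_metadata, if ismeta]
	 *		the index tuple to insert (what datapos/datalen now cover)
	 */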
	/*
	 * Insertion to an internal page finishes an incomplete split at the
	 * child level.  Clear the incomplete-split flag in the child.  Note:
	 * during normal operation, the child and parent pages are locked at the
	 * same time, so that clearing the flag and inserting the downlink
	 * appear atomic to other backends.  We don't bother with that during
	 * replay, because readers don't care about the incomplete-split flag
	 * and there cannot be updates happening.
	 */
	if (!isleaf)
	{
		if (record->xl_info & XLR_BKP_BLOCK(0))
			(void) RestoreBackupBlock(lsn, record, 0, false, false);
		else
			_bt_clear_incomplete_split(lsn, record, xlrec->target.node, cblkno);
		main_blk_index = 1;
	}
	else
		main_blk_index = 0;

	if (record->xl_info & XLR_BKP_BLOCK(main_blk_index))
		(void) RestoreBackupBlock(lsn, record, main_blk_index, false, false);
	else
	{
		buffer = XLogReadBuffer(xlrec->target.node,
							 ItemPointerGetBlockNumber(&(xlrec->target.tid)),
								false);
		if (BufferIsValid(buffer))
		{
			page = (Page) BufferGetPage(buffer);

			if (lsn > PageGetLSN(page))
			{
				if (PageAddItem(page, (Item) datapos, datalen,
							ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
								false, false) == InvalidOffsetNumber)
					elog(PANIC, "btree_insert_redo: failed to add item");

				PageSetLSN(page, lsn);
				MarkBufferDirty(buffer);
			}
			UnlockReleaseBuffer(buffer);
		}
	}

	/*
	 * Note: in normal operation, we'd update the metapage while still
	 * holding lock on the page we inserted into.  But during replay it's
	 * not necessary to hold that lock, since no other index updates can be
	 * happening concurrently, and readers will cope fine with following an
	 * obsolete link from the metapage.
	 */
	if (ismeta)
		_bt_restore_meta(xlrec->target.node, lsn,
						 md.root, md.level,
						 md.fastroot, md.fastlevel);
}
static void
btree_xlog_split(bool onleft, bool isroot,
				 XLogRecPtr lsn, XLogRecord *record)
{
	xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record);
	bool		isleaf = (xlrec->level == 0);
	Buffer		lbuf;
	Buffer		rbuf;
	Page		rpage;
	BTPageOpaque ropaque;
	char	   *datapos;
	int			datalen;
	OffsetNumber newitemoff = 0;
	Item		newitem = NULL;
	Size		newitemsz = 0;
	Item		left_hikey = NULL;
	Size		left_hikeysz = 0;
	BlockNumber cblkno = InvalidBlockNumber;

	datapos = (char *) xlrec + SizeOfBtreeSplit;
	datalen = record->xl_len - SizeOfBtreeSplit;

	/* Extract newitemoff and newitem, if present */
	if (onleft)
	{
		memcpy(&newitemoff, datapos, sizeof(OffsetNumber));
		datapos += sizeof(OffsetNumber);
		datalen -= sizeof(OffsetNumber);
	}
	if (onleft && !(record->xl_info & XLR_BKP_BLOCK(0)))
	{
		/*
		 * We assume that 16-bit alignment is enough to apply IndexTupleSize
		 * (since it's fetching from a uint16 field) and also enough for
		 * PageAddItem to insert the tuple.
		 */
		newitem = (Item) datapos;
		newitemsz = MAXALIGN(IndexTupleSize(newitem));
		datapos += newitemsz;
		datalen -= newitemsz;
	}

	/* Extract left hikey and its size (still assuming 16-bit alignment) */
	if (!isleaf && !(record->xl_info & XLR_BKP_BLOCK(0)))
	{
		left_hikey = (Item) datapos;
		left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey));
		datapos += left_hikeysz;
		datalen -= left_hikeysz;
	}

	/*
	 * If this insertion finishes an incomplete split, get the block number
	 * of the child.
	 */
	if (!isleaf && !(record->xl_info & XLR_BKP_BLOCK(1)))
	{
		memcpy(&cblkno, datapos, sizeof(BlockNumber));
		datapos += sizeof(BlockNumber);
		datalen -= sizeof(BlockNumber);
	}
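	/*
	 * So the variable-length body of a split record, as consumed above, is:
	 *
	 *		[newitemoff, if the new item went to the left page]
	 *		[new item, if on left and block 0 not backed up]
	 *		[left page's high key, if non-leaf and block 0 not backed up]
	 *		[finished-split child's BlockNumber, if non-leaf and block 1
	 *		 not backed up]
	 *		the right page's tuples, consumed by _bt_restore_page() below
	 */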
	/*
	 * Clear the incomplete split flag on the left sibling of the child page
	 * this is a downlink for.  (Like in btree_xlog_insert, this can be done
	 * before locking the other pages.)
	 */
	if (!isleaf)
	{
		if (record->xl_info & XLR_BKP_BLOCK(1))
			(void) RestoreBackupBlock(lsn, record, 1, false, false);
		else
			_bt_clear_incomplete_split(lsn, record, xlrec->node, cblkno);
	}

	/* Reconstruct right (new) sibling page from scratch */
	rbuf = XLogReadBuffer(xlrec->node, xlrec->rightsib, true);
	Assert(BufferIsValid(rbuf));
	rpage = (Page) BufferGetPage(rbuf);

	_bt_pageinit(rpage, BufferGetPageSize(rbuf));
	ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage);

	ropaque->btpo_prev = xlrec->leftsib;
	ropaque->btpo_next = xlrec->rnext;
	ropaque->btpo.level = xlrec->level;
	ropaque->btpo_flags = isleaf ? BTP_LEAF : 0;
	ropaque->btpo_cycleid = 0;

	_bt_restore_page(rpage, datapos, datalen);

	/*
	 * On leaf level, the high key of the left page is equal to the first
	 * key on the right page.
	 */
	if (isleaf)
	{
		ItemId		hiItemId = PageGetItemId(rpage, P_FIRSTDATAKEY(ropaque));

		left_hikey = PageGetItem(rpage, hiItemId);
		left_hikeysz = ItemIdGetLength(hiItemId);
	}

	PageSetLSN(rpage, lsn);
	MarkBufferDirty(rbuf);

	/* don't release the buffer yet; we touch right page's first item below */

	/* Now reconstruct left (original) sibling page */
	if (record->xl_info & XLR_BKP_BLOCK(0))
		lbuf = RestoreBackupBlock(lsn, record, 0, false, true);
	else
	{
		lbuf = XLogReadBuffer(xlrec->node, xlrec->leftsib, false);

		if (BufferIsValid(lbuf))
		{
			/*
			 * To retain the same physical order of the tuples that they had,
			 * we initialize a temporary empty page for the left page and add
			 * all the items to that in item number order.  This mirrors how
			 * _bt_split() works.  It's not strictly required to retain the
			 * same physical order, as long as the items are in the correct
			 * item number order, but it helps debugging.  See also
			 * _bt_restore_page(), which does the same for the right page.
			 */
			Page		lpage = (Page) BufferGetPage(lbuf);
			BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);

			if (lsn > PageGetLSN(lpage))
			{
				OffsetNumber off;
				Page		newlpage;
				OffsetNumber leftoff;

				newlpage = PageGetTempPageCopySpecial(lpage);

				/* Set high key */
				leftoff = P_HIKEY;
				if (PageAddItem(newlpage, left_hikey, left_hikeysz,
								P_HIKEY, false, false) == InvalidOffsetNumber)
					elog(PANIC, "failed to add high key to left page after split");
				leftoff = OffsetNumberNext(leftoff);

				for (off = P_FIRSTDATAKEY(lopaque); off < xlrec->firstright; off++)
				{
					ItemId		itemid;
					Size		itemsz;
					Item		item;

					/* add the new item if it was inserted on left page */
					if (onleft && off == newitemoff)
					{
						if (PageAddItem(newlpage, newitem, newitemsz, leftoff,
										false, false) == InvalidOffsetNumber)
							elog(ERROR, "failed to add new item to left page after split");
						leftoff = OffsetNumberNext(leftoff);
					}

					itemid = PageGetItemId(lpage, off);
					itemsz = ItemIdGetLength(itemid);
					item = PageGetItem(lpage, itemid);
					if (PageAddItem(newlpage, item, itemsz, leftoff,
									false, false) == InvalidOffsetNumber)
						elog(ERROR, "failed to add old item to left page after split");
					leftoff = OffsetNumberNext(leftoff);
				}

				/* cope with possibility that newitem goes at the end */
				if (onleft && off == newitemoff)
				{
					if (PageAddItem(newlpage, newitem, newitemsz, leftoff,
									false, false) == InvalidOffsetNumber)
						elog(ERROR, "failed to add new item to left page after split");
					leftoff = OffsetNumberNext(leftoff);
				}

				PageRestoreTempPage(newlpage, lpage);

				/* Fix opaque fields */
				lopaque->btpo_flags = BTP_INCOMPLETE_SPLIT;
				if (isleaf)
					lopaque->btpo_flags |= BTP_LEAF;
				lopaque->btpo_next = xlrec->rightsib;
				lopaque->btpo_cycleid = 0;

				PageSetLSN(lpage, lsn);
				MarkBufferDirty(lbuf);
			}
		}
	}

	/* We no longer need the buffers */
	if (BufferIsValid(lbuf))
		UnlockReleaseBuffer(lbuf);
	UnlockReleaseBuffer(rbuf);

	/*
	 * Fix left-link of the page to the right of the new right sibling.
	 *
	 * Note: in normal operation, we do this while still holding lock on the
	 * two split pages.  However, that's not necessary for correctness in
	 * WAL replay, because no other index update can be in progress, and
	 * readers will cope properly when following an obsolete left-link.
	 */
	if (xlrec->rnext != P_NONE)
	{
		/*
		 * The backup block containing the right sibling is 1 or 2,
		 * depending on whether this was a leaf or internal page.
		 */
		int			rnext_index = isleaf ? 1 : 2;

		if (record->xl_info & XLR_BKP_BLOCK(rnext_index))
			(void) RestoreBackupBlock(lsn, record, rnext_index, false, false);
		else
		{
			Buffer		buffer;

			buffer = XLogReadBuffer(xlrec->node, xlrec->rnext, false);

			if (BufferIsValid(buffer))
			{
				Page		page = (Page) BufferGetPage(buffer);

				if (lsn > PageGetLSN(page))
				{
					BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);

					pageop->btpo_prev = xlrec->rightsib;

					PageSetLSN(page, lsn);
					MarkBufferDirty(buffer);
				}
				UnlockReleaseBuffer(buffer);
			}
		}
	}
}
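/*
 * Note that btree_xlog_split() intentionally leaves the left page with
 * BTP_INCOMPLETE_SPLIT set: the flag is cleared only when replay of the
 * parent-level downlink insertion (or of a new root) reaches
 * _bt_clear_incomplete_split(), mirroring how the flag is managed during
 * normal operation.
 */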
static void
btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record)
{
	xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;
	BTPageOpaque opaque;

	/*
	 * If queries might be active then we need to ensure every leaf page is
	 * unpinned between the lastBlockVacuumed and the current block, if
	 * there are any.  This prevents replay of the VACUUM from reaching the
	 * stage of removing heap tuples while there could still be indexscans
	 * "in flight" to those particular tuples (see nbtree/README).
	 *
	 * It might be worth checking if there are actually any backends
	 * running; if not, we could just skip this.
	 *
	 * Since VACUUM can visit leaf pages out-of-order, it might issue
	 * records with lastBlockVacuumed >= block; that's not an error, it
	 * just means nothing to do now.
	 *
	 * Note: since we touch all pages in the range, we will lock non-leaf
	 * pages, and also any empty (all-zero) pages that may be in the index.
	 * It doesn't seem worth the complexity to avoid that.  But it's
	 * important that HotStandbyActiveInReplay() will not return true if
	 * the database isn't yet consistent; so we need not fear reading
	 * still-corrupt blocks here during crash recovery.
	 */
	if (HotStandbyActiveInReplay())
	{
		BlockNumber blkno;

		for (blkno = xlrec->lastBlockVacuumed + 1; blkno < xlrec->block; blkno++)
		{
			/*
			 * We use RBM_NORMAL_NO_LOG mode because it's not an error
			 * condition to see all-zero pages.  The original btvacuumpage
			 * scan would have skipped over all-zero pages, noting them in
			 * FSM but not bothering to initialize them just yet; so we
			 * mustn't throw an error here.  (We could skip acquiring the
			 * cleanup lock if PageIsNew, but it's probably not worth the
			 * cycles to test.)
			 *
			 * XXX we don't actually need to read the block, we just need
			 * to confirm it is unpinned.  If we had a special call into
			 * the buffer manager we could optimise this so that if the
			 * block is not in shared_buffers we confirm it as unpinned.
			 */
			buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, blkno,
											RBM_NORMAL_NO_LOG);
			if (BufferIsValid(buffer))
			{
				LockBufferForCleanup(buffer);
				UnlockReleaseBuffer(buffer);
			}
		}
	}

	/*
	 * If we have a full-page image, restore it (using a cleanup lock) and
	 * we're done.
	 */
	if (record->xl_info & XLR_BKP_BLOCK(0))
	{
		(void) RestoreBackupBlock(lsn, record, 0, true, false);
		return;
	}

	/*
	 * Like in btvacuumpage(), we need to take a cleanup lock on every leaf
	 * page.  See nbtree/README for details.
	 */
	buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, xlrec->block, RBM_NORMAL);
	if (!BufferIsValid(buffer))
		return;
	LockBufferForCleanup(buffer);
	page = (Page) BufferGetPage(buffer);

	if (lsn <= PageGetLSN(page))
	{
		UnlockReleaseBuffer(buffer);
		return;
	}

	if (record->xl_len > SizeOfBtreeVacuum)
	{
		OffsetNumber *unused;
		OffsetNumber *unend;

		unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeVacuum);
		unend = (OffsetNumber *) ((char *) xlrec + record->xl_len);

		if ((unend - unused) > 0)
			PageIndexMultiDelete(page, unused, unend - unused);
	}
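	/*
	 * The tail of the record is just a dense array of OffsetNumbers to
	 * remove; its length is implied by xl_len rather than stored
	 * explicitly, which is why unend is computed from the record length
	 * above.
	 */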
	/*
	 * Mark the page as not containing any LP_DEAD items --- see comments
	 * in _bt_delitems_vacuum().
	 */
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

	PageSetLSN(page, lsn);
	MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);
}
/*
 * Get the latestRemovedXid from the heap pages pointed at by the index
 * tuples being deleted.  This puts the work for calculating
 * latestRemovedXid into the recovery path rather than the primary path.
 *
 * It's possible that this generates a fair amount of I/O, since an index
 * block may have hundreds of tuples being deleted.  Repeat accesses to the
 * same heap blocks are common, though are not yet optimised.
 *
 * XXX optimise later with something like XLogPrefetchBuffer()
 */
static TransactionId
btree_xlog_delete_get_latestRemovedXid(xl_btree_delete *xlrec)
{
	OffsetNumber *unused;
	Buffer		ibuffer,
				hbuffer;
	Page		ipage,
				hpage;
	ItemId		iitemid,
				hitemid;
	IndexTuple	itup;
	HeapTupleHeader htuphdr;
	BlockNumber hblkno;
	OffsetNumber hoffnum;
	TransactionId latestRemovedXid = InvalidTransactionId;
	int			i;

	/*
	 * If there's nothing running on the standby we don't need to derive a
	 * full latestRemovedXid value, so use a fast path out of here.  This
	 * returns InvalidTransactionId, and so will conflict with all HS
	 * transactions; but since we just worked out that that's zero people,
	 * it's OK.
	 *
	 * XXX There is a race condition here, which is that a new backend
	 * might start just after we look.  If so, it cannot need to conflict,
	 * but this coding will result in throwing a conflict anyway.
	 */
	if (CountDBBackends(InvalidOid) == 0)
		return latestRemovedXid;

	/*
	 * In what follows, we have to examine the previous state of the index
	 * page, as well as the heap page(s) it points to.  This is only valid
	 * if WAL replay has reached a consistent database state; which means
	 * that the preceding check is not just an optimization, but is
	 * *necessary*.  We won't have let in any user sessions before we reach
	 * consistency.
	 */
	if (!reachedConsistency)
		elog(PANIC, "btree_xlog_delete_get_latestRemovedXid: cannot operate with inconsistent data");

	/*
	 * Get index page.  If the DB is consistent, this should not fail, nor
	 * should any of the heap page fetches below.  If one does, we return
	 * InvalidTransactionId to cancel all HS transactions.  That's probably
	 * overkill, but it's safe, and certainly better than panicking here.
	 */
	ibuffer = XLogReadBuffer(xlrec->node, xlrec->block, false);
	if (!BufferIsValid(ibuffer))
		return InvalidTransactionId;
	ipage = (Page) BufferGetPage(ibuffer);

	/*
	 * Loop through the deleted index items to obtain the TransactionId
	 * from the heap items they point to.
	 */
	unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete);

	for (i = 0; i < xlrec->nitems; i++)
	{
		/*
		 * Identify the index tuple about to be deleted.
		 */
		iitemid = PageGetItemId(ipage, unused[i]);
		itup = (IndexTuple) PageGetItem(ipage, iitemid);

		/*
		 * Locate the heap page that the index tuple points at.
		 */
		hblkno = ItemPointerGetBlockNumber(&(itup->t_tid));
		hbuffer = XLogReadBuffer(xlrec->hnode, hblkno, false);
		if (!BufferIsValid(hbuffer))
		{
			UnlockReleaseBuffer(ibuffer);
			return InvalidTransactionId;
		}
		hpage = (Page) BufferGetPage(hbuffer);

		/*
		 * Look up the heap tuple header that the index tuple points at by
		 * using the heap node supplied with the xlrec.  We can't use
		 * heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer.
		 * Note that we are not looking at tuple data here, just headers.
		 */
		hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid));
		hitemid = PageGetItemId(hpage, hoffnum);

		/*
		 * Follow any redirections until we find something useful.
		 */
		while (ItemIdIsRedirected(hitemid))
		{
			hoffnum = ItemIdGetRedirect(hitemid);
			hitemid = PageGetItemId(hpage, hoffnum);
			CHECK_FOR_INTERRUPTS();
		}

		/*
		 * If the heap item has storage, then read the header and use that
		 * to set latestRemovedXid.
		 *
		 * Some LP_DEAD items may not be accessible, so we ignore them.
		 */
		if (ItemIdHasStorage(hitemid))
		{
			htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid);

			HeapTupleHeaderAdvanceLatestRemovedXid(htuphdr, &latestRemovedXid);
		}
		else if (ItemIdIsDead(hitemid))
		{
			/*
			 * Conjecture: if hitemid is dead then it had xids before the
			 * xids marked on LP_NORMAL items.  So we just ignore this item
			 * and move onto the next, for the purposes of calculating
			 * latestRemovedXids.
			 */
		}
		else
			Assert(!ItemIdIsUsed(hitemid));

		UnlockReleaseBuffer(hbuffer);
	}

	UnlockReleaseBuffer(ibuffer);

	/*
	 * If all heap tuples were LP_DEAD then we will be returning
	 * InvalidTransactionId here, which avoids conflicts.  This matches
	 * existing logic which assumes that LP_DEAD tuples must already be
	 * older than the latestRemovedXid on the cleanup record that set them
	 * as LP_DEAD, hence must already have generated a conflict.
	 */
	return latestRemovedXid;
}
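/*
 * The value computed above feeds ResolveRecoveryConflictWithSnapshot() in
 * btree_xlog_delete() below: standby transactions whose snapshots could
 * still see any of the removed heap tuples are cancelled before the index
 * items disappear.
 */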
static void
btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
{
	xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;
	BTPageOpaque opaque;

	/*
	 * If we have any conflict processing to do, it must happen before we
	 * update the page.
	 *
	 * Btree delete records can conflict with standby queries.  You might
	 * think that vacuum records would conflict as well, but we've handled
	 * that already.  XLOG_HEAP2_CLEANUP_INFO records provide the highest
	 * xid cleaned by the vacuum of the heap and so we can resolve any
	 * conflicts just once when that arrives.  After that we know that no
	 * conflicts exist from individual btree vacuum records on that index.
	 */
	if (InHotStandby)
	{
		TransactionId latestRemovedXid = btree_xlog_delete_get_latestRemovedXid(xlrec);

		ResolveRecoveryConflictWithSnapshot(latestRemovedXid, xlrec->node);
	}

	/* If we have a full-page image, restore it and we're done */
	if (record->xl_info & XLR_BKP_BLOCK(0))
	{
		(void) RestoreBackupBlock(lsn, record, 0, false, false);
		return;
	}

	/*
	 * We don't need to take a cleanup lock to apply these changes.  See
	 * nbtree/README for details.
	 */
	buffer = XLogReadBuffer(xlrec->node, xlrec->block, false);
	if (!BufferIsValid(buffer))
		return;
	page = (Page) BufferGetPage(buffer);

	if (lsn <= PageGetLSN(page))
	{
		UnlockReleaseBuffer(buffer);
		return;
	}

	if (record->xl_len > SizeOfBtreeDelete)
	{
		OffsetNumber *unused;

		unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete);

		PageIndexMultiDelete(page, unused, xlrec->nitems);
	}

	/*
	 * Mark the page as not containing any LP_DEAD items --- see comments
	 * in _bt_delitems_delete().
	 */
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

	PageSetLSN(page, lsn);
	MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);
}
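/*
 * Unlike vacuum records, delete records carry an explicit item count in
 * xl_btree_delete, so PageIndexMultiDelete() in btree_xlog_delete() is
 * driven by xlrec->nitems rather than by a length computed from xl_len.
 */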
static void
btree_xlog_mark_page_halfdead(uint8 info, XLogRecPtr lsn, XLogRecord *record)
{
	xl_btree_mark_page_halfdead *xlrec = (xl_btree_mark_page_halfdead *) XLogRecGetData(record);
	BlockNumber parent;
	Buffer		buffer;
	Page		page;
	BTPageOpaque pageop;
	IndexTupleData trunctuple;

	parent = ItemPointerGetBlockNumber(&(xlrec->target.tid));

	/*
	 * In normal operation, we would lock all the pages this WAL record
	 * touches before changing any of them.  In WAL replay, it should be
	 * okay to lock just one page at a time, since no concurrent index
	 * updates can be happening, and readers should not care whether they
	 * arrive at the target page or not (since it's surely empty).
	 */

	/* parent page */
	if (record->xl_info & XLR_BKP_BLOCK(0))
		(void) RestoreBackupBlock(lsn, record, 0, false, false);
	else
	{
		buffer = XLogReadBuffer(xlrec->target.node, parent, false);
		if (BufferIsValid(buffer))
		{
			page = (Page) BufferGetPage(buffer);
			pageop = (BTPageOpaque) PageGetSpecialPointer(page);
			if (lsn > PageGetLSN(page))
			{
				OffsetNumber poffset;
				ItemId		itemid;
				IndexTuple	itup;
				OffsetNumber nextoffset;
				BlockNumber rightsib;

				poffset = ItemPointerGetOffsetNumber(&(xlrec->target.tid));

				nextoffset = OffsetNumberNext(poffset);
				itemid = PageGetItemId(page, nextoffset);
				itup = (IndexTuple) PageGetItem(page, itemid);
				rightsib = ItemPointerGetBlockNumber(&itup->t_tid);

				itemid = PageGetItemId(page, poffset);
				itup = (IndexTuple) PageGetItem(page, itemid);
				ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY);
				nextoffset = OffsetNumberNext(poffset);
				PageIndexTupleDelete(page, nextoffset);

				PageSetLSN(page, lsn);
				MarkBufferDirty(buffer);
			}
			UnlockReleaseBuffer(buffer);
		}
	}

	/* Rewrite the leaf page as a halfdead page */
	buffer = XLogReadBuffer(xlrec->target.node, xlrec->leafblk, true);
	Assert(BufferIsValid(buffer));
	page = (Page) BufferGetPage(buffer);

	_bt_pageinit(page, BufferGetPageSize(buffer));
	pageop = (BTPageOpaque) PageGetSpecialPointer(page);

	pageop->btpo_prev = xlrec->leftblk;
	pageop->btpo_next = xlrec->rightblk;
	pageop->btpo.level = 0;
	pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF;
	pageop->btpo_cycleid = 0;

	/*
	 * Construct a dummy hikey item that points to the next parent to be
	 * deleted (if any).
	 */
	MemSet(&trunctuple, 0, sizeof(IndexTupleData));
	trunctuple.t_info = sizeof(IndexTupleData);
	if (xlrec->topparent != InvalidBlockNumber)
		ItemPointerSet(&trunctuple.t_tid, xlrec->topparent, P_HIKEY);
	else
		ItemPointerSetInvalid(&trunctuple.t_tid);
	if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
					false, false) == InvalidOffsetNumber)
		elog(ERROR, "could not add dummy high key to half-dead page");

	PageSetLSN(page, lsn);
	MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);
}
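/*
 * The block number stored in the dummy high key records the topmost
 * parent in the branch being deleted.  btree_xlog_unlink_page() below
 * re-creates the leaf with the same convention when the page being
 * unlinked is an ancestor of the leaf rather than the leaf itself.
 */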
static void
btree_xlog_unlink_page(uint8 info, XLogRecPtr lsn, XLogRecord *record)
{
	xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) XLogRecGetData(record);
	BlockNumber target;
	BlockNumber leftsib;
	BlockNumber rightsib;
	Buffer		buffer;
	Page		page;
	BTPageOpaque pageop;

	target = xlrec->deadblk;
	leftsib = xlrec->leftsib;
	rightsib = xlrec->rightsib;

	/*
	 * In normal operation, we would lock all the pages this WAL record
	 * touches before changing any of them.  In WAL replay, it should be
	 * okay to lock just one page at a time, since no concurrent index
	 * updates can be happening, and readers should not care whether they
	 * arrive at the target page or not (since it's surely empty).
	 */

	/* Fix left-link of right sibling */
	if (record->xl_info & XLR_BKP_BLOCK(0))
		(void) RestoreBackupBlock(lsn, record, 0, false, false);
	else
	{
		buffer = XLogReadBuffer(xlrec->node, rightsib, false);
		if (BufferIsValid(buffer))
		{
			page = (Page) BufferGetPage(buffer);
			if (lsn <= PageGetLSN(page))
				UnlockReleaseBuffer(buffer);
			else
			{
				pageop = (BTPageOpaque) PageGetSpecialPointer(page);
				pageop->btpo_prev = leftsib;

				PageSetLSN(page, lsn);
				MarkBufferDirty(buffer);
				UnlockReleaseBuffer(buffer);
			}
		}
	}

	/* Fix right-link of left sibling, if any */
	if (record->xl_info & XLR_BKP_BLOCK(1))
		(void) RestoreBackupBlock(lsn, record, 1, false, false);
	else
	{
		if (leftsib != P_NONE)
		{
			buffer = XLogReadBuffer(xlrec->node, leftsib, false);
			if (BufferIsValid(buffer))
			{
				page = (Page) BufferGetPage(buffer);
				if (lsn <= PageGetLSN(page))
					UnlockReleaseBuffer(buffer);
				else
				{
					pageop = (BTPageOpaque) PageGetSpecialPointer(page);
					pageop->btpo_next = rightsib;

					PageSetLSN(page, lsn);
					MarkBufferDirty(buffer);
					UnlockReleaseBuffer(buffer);
				}
			}
		}
	}

	/* Rewrite target page as empty deleted page */
	buffer = XLogReadBuffer(xlrec->node, target, true);
	Assert(BufferIsValid(buffer));
	page = (Page) BufferGetPage(buffer);

	_bt_pageinit(page, BufferGetPageSize(buffer));
	pageop = (BTPageOpaque) PageGetSpecialPointer(page);

	pageop->btpo_prev = leftsib;
	pageop->btpo_next = rightsib;
	pageop->btpo.xact = xlrec->btpo_xact;
	pageop->btpo_flags = BTP_DELETED;
	pageop->btpo_cycleid = 0;

	PageSetLSN(page, lsn);
	MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);

	/*
	 * If we deleted a parent of the targeted leaf page, instead of the
	 * leaf itself, update the leaf to point to the next remaining child
	 * in the branch.
	 */
	if (target != xlrec->leafblk)
	{
		/*
		 * There is no real data on the page, so we just re-create it from
		 * scratch using the information from the WAL record.
		 */
		IndexTupleData trunctuple;

		buffer = XLogReadBuffer(xlrec->node, xlrec->leafblk, true);
		Assert(BufferIsValid(buffer));
		page = (Page) BufferGetPage(buffer);
		pageop = (BTPageOpaque) PageGetSpecialPointer(page);

		_bt_pageinit(page, BufferGetPageSize(buffer));
		pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF;
		pageop->btpo_prev = xlrec->leafleftsib;
		pageop->btpo_next = xlrec->leafrightsib;
		pageop->btpo.level = 0;
		pageop->btpo_cycleid = 0;

		/* Add a dummy hikey item */
		MemSet(&trunctuple, 0, sizeof(IndexTupleData));
		trunctuple.t_info = sizeof(IndexTupleData);
		if (xlrec->topparent != InvalidBlockNumber)
			ItemPointerSet(&trunctuple.t_tid, xlrec->topparent, P_HIKEY);
		else
			ItemPointerSetInvalid(&trunctuple.t_tid);
		if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
						false, false) == InvalidOffsetNumber)
			elog(ERROR, "could not add dummy high key to half-dead page");

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
		UnlockReleaseBuffer(buffer);
	}

	/* Update metapage if needed */
	if (info == XLOG_BTREE_UNLINK_PAGE_META)
	{
		xl_btree_metadata md;

		memcpy(&md, (char *) xlrec + SizeOfBtreeUnlinkPage,
			   sizeof(xl_btree_metadata));
		_bt_restore_meta(xlrec->node, lsn,
						 md.root, md.level,
						 md.fastroot, md.fastlevel);
	}
}
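/*
 * The deleted page's btpo.xact (restored from xlrec->btpo_xact above)
 * records the XID that must fall behind RecentGlobalXmin before the page
 * can be recycled (see _bt_page_recyclable()); btree_xlog_reuse_page()
 * below handles the standby side of that protocol when the page is
 * eventually reused.
 */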
static void
btree_xlog_newroot(XLogRecPtr lsn, XLogRecord *record)
{
	xl_btree_newroot *xlrec = (xl_btree_newroot *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;
	BTPageOpaque pageop;

	buffer = XLogReadBuffer(xlrec->node, xlrec->rootblk, true);
	Assert(BufferIsValid(buffer));
	page = (Page) BufferGetPage(buffer);

	_bt_pageinit(page, BufferGetPageSize(buffer));
	pageop = (BTPageOpaque) PageGetSpecialPointer(page);

	pageop->btpo_flags = BTP_ROOT;
	pageop->btpo_prev = pageop->btpo_next = P_NONE;
	pageop->btpo.level = xlrec->level;
	if (xlrec->level == 0)
		pageop->btpo_flags |= BTP_LEAF;
	pageop->btpo_cycleid = 0;

	if (record->xl_len > SizeOfBtreeNewroot)
	{
		IndexTuple	itup;
		BlockNumber cblkno;

		_bt_restore_page(page,
						 (char *) xlrec + SizeOfBtreeNewroot,
						 record->xl_len - SizeOfBtreeNewroot);
		/* extract block number of the left-hand split page */
		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, P_HIKEY));
		cblkno = ItemPointerGetBlockNumber(&(itup->t_tid));
		Assert(ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY);

		/* Clear the incomplete-split flag in left child */
		if (record->xl_info & XLR_BKP_BLOCK(0))
			(void) RestoreBackupBlock(lsn, record, 0, false, false);
		else
			_bt_clear_incomplete_split(lsn, record, xlrec->node, cblkno);
	}

	PageSetLSN(page, lsn);
	MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);

	_bt_restore_meta(xlrec->node, lsn,
					 xlrec->rootblk, xlrec->level,
					 xlrec->rootblk, xlrec->level);
}
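/*
 * A just-created root is also the tree's fast root, which is why rootblk
 * and level are passed to _bt_restore_meta() for both the root and the
 * fastroot fields.
 */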
static void
btree_xlog_reuse_page(XLogRecPtr lsn, XLogRecord *record)
{
	xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) XLogRecGetData(record);

	/*
	 * Btree reuse_page records exist to provide a conflict point when we
	 * reuse pages in the index via the FSM.  That's all they do though.
	 *
	 * latestRemovedXid was the page's btpo.xact.  The btpo.xact <
	 * RecentGlobalXmin test in _bt_page_recyclable() conceptually mirrors
	 * the pgxact->xmin > limitXmin test in GetConflictingVirtualXIDs().
	 * Consequently, one XID value achieves the same exclusion effect on
	 * master and standby.
	 */
	if (InHotStandby)
	{
		ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid,
											xlrec->node);
	}

	/* Backup blocks are not used in reuse_page records */
	Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
}
void
btree_redo(XLogRecPtr lsn, XLogRecord *record)
{
	uint8		info = record->xl_info & ~XLR_INFO_MASK;

	switch (info)
	{
		case XLOG_BTREE_INSERT_LEAF:
			btree_xlog_insert(true, false, lsn, record);
			break;
		case XLOG_BTREE_INSERT_UPPER:
			btree_xlog_insert(false, false, lsn, record);
			break;
		case XLOG_BTREE_INSERT_META:
			btree_xlog_insert(false, true, lsn, record);
			break;
		case XLOG_BTREE_SPLIT_L:
			btree_xlog_split(true, false, lsn, record);
			break;
		case XLOG_BTREE_SPLIT_R:
			btree_xlog_split(false, false, lsn, record);
			break;
		case XLOG_BTREE_SPLIT_L_ROOT:
			btree_xlog_split(true, true, lsn, record);
			break;
		case XLOG_BTREE_SPLIT_R_ROOT:
			btree_xlog_split(false, true, lsn, record);
			break;
		case XLOG_BTREE_VACUUM:
			btree_xlog_vacuum(lsn, record);
			break;
		case XLOG_BTREE_DELETE:
			btree_xlog_delete(lsn, record);
			break;
		case XLOG_BTREE_MARK_PAGE_HALFDEAD:
			btree_xlog_mark_page_halfdead(info, lsn, record);
			break;
		case XLOG_BTREE_UNLINK_PAGE:
		case XLOG_BTREE_UNLINK_PAGE_META:
			btree_xlog_unlink_page(info, lsn, record);
			break;
		case XLOG_BTREE_NEWROOT:
			btree_xlog_newroot(lsn, record);
			break;
		case XLOG_BTREE_REUSE_PAGE:
			btree_xlog_reuse_page(lsn, record);
			break;
		default:
			elog(PANIC, "btree_redo: unknown op code %u", info);
	}
}