From 9c02cf56614366769682bb8b3f4e9eecf8f237c4 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Wed, 14 Aug 2019 11:32:35 -0700 Subject: [PATCH] Remove block number field from nbtree stack. The initial value of the nbtree stack downlink block number field recorded during an initial descent of the tree wasn't actually used. Both _bt_getstackbuf() callers overwrote the value with their own value. Remove the block number field from the stack struct, and add a child block number argument to _bt_getstackbuf() in its place. This makes the overall design of _bt_getstackbuf() clearer. Author: Peter Geoghegan Reviewed-By: Anastasia Lubennikova Discussion: https://postgr.es/m/CAH2-Wzmx+UbXt2YNOUCZ-a04VdXU=S=OHuAuD7Z8uQq-PXTYUg@mail.gmail.com --- src/backend/access/nbtree/README | 8 ++++- src/backend/access/nbtree/nbtinsert.c | 46 +++++++++++++++++---------- src/backend/access/nbtree/nbtpage.c | 3 +- src/backend/access/nbtree/nbtsearch.c | 19 ++++------- src/include/access/nbtree.h | 16 ++++------ 5 files changed, 50 insertions(+), 42 deletions(-) diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index dce8bc98e9..c8bdb0c935 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -224,7 +224,13 @@ it, but it's still linked to its siblings. (Note: Lanin and Shasha prefer to make the key space move left, but their argument for doing so hinges on not having left-links, which we have -anyway. So we simplify the algorithm by moving key space right.) +anyway. So we simplify the algorithm by moving the key space right. Note +also that Lanin and Shasha optimistically avoid holding multiple locks as +the tree is ascended. They're willing to release all locks and retry in +"rare" cases where the correct location for a new downlink cannot be found +immediately. We prefer to stick with Lehman and Yao's approach of +pessimistically coupling buffer locks when ascending the tree, since it's +far simpler.) To preserve consistency on the parent level, we cannot merge the key space of a page into its right sibling unless the right sibling is a child of diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 5890f393f6..48d19be3aa 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -1797,7 +1797,6 @@ _bt_insert_parent(Relation rel, stack = &fakestack; stack->bts_blkno = BufferGetBlockNumber(pbuf); stack->bts_offset = InvalidOffsetNumber; - stack->bts_btentry = InvalidBlockNumber; stack->bts_parent = NULL; _bt_relbuf(rel, pbuf); } @@ -1819,8 +1818,7 @@ _bt_insert_parent(Relation rel, * new downlink will be inserted at the correct offset. Even buf's * parent may have changed. */ - stack->bts_btentry = bknum; - pbuf = _bt_getstackbuf(rel, stack); + pbuf = _bt_getstackbuf(rel, stack, bknum); /* * Now we can unlock the right child. The left child will be unlocked @@ -1834,7 +1832,7 @@ _bt_insert_parent(Relation rel, errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u", RelationGetRelationName(rel), bknum, rbknum))); - /* Recursively update the parent */ + /* Recursively insert into the parent */ _bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent, new_item, stack->bts_offset + 1, is_only); @@ -1901,21 +1899,37 @@ _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack) } /* - * _bt_getstackbuf() -- Walk back up the tree one step, and find the item - * we last looked at in the parent. + * _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot + * tuple whose downlink points to child page. * - * This is possible because we save the downlink from the parent item, - * which is enough to uniquely identify it. Insertions into the parent - * level could cause the item to move right; deletions could cause it - * to move left, but not left of the page we previously found it in. + * Caller passes child's block number, which is used to identify + * associated pivot tuple in parent page using a linear search that + * matches on pivot's downlink/block number. The expected location of + * the pivot tuple is taken from the stack one level above the child + * page. This is used as a starting point. Insertions into the + * parent level could cause the pivot tuple to move right; deletions + * could cause it to move left, but not left of the page we previously + * found it on. * - * Adjusts bts_blkno & bts_offset if changed. + * Caller can use its stack to relocate the pivot tuple/downlink for + * any same-level page to the right of the page found by its initial + * descent. This is necessary because of the possibility that caller + * moved right to recover from a concurrent page split. It's also + * convenient for certain callers to be able to step right when there + * wasn't a concurrent page split, while still using their original + * stack. For example, the checkingunique _bt_doinsert() case may + * have to step right when there are many physical duplicates, and its + * scantid forces an insertion to the right of the "first page the + * value could be on". * - * Returns write-locked buffer, or InvalidBuffer if item not found - * (should not happen). + * Returns write-locked parent page buffer, or InvalidBuffer if pivot + * tuple not found (should not happen). Adjusts bts_blkno & + * bts_offset if changed. Page split caller should insert its new + * pivot tuple for its new right sibling page on parent page, at the + * offset number bts_offset + 1. */ Buffer -_bt_getstackbuf(Relation rel, BTStack stack) +_bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child) { BlockNumber blkno; OffsetNumber start; @@ -1977,7 +1991,7 @@ _bt_getstackbuf(Relation rel, BTStack stack) itemid = PageGetItemId(page, offnum); item = (IndexTuple) PageGetItem(page, itemid); - if (BTreeInnerTupleGetDownLink(item) == stack->bts_btentry) + if (BTreeInnerTupleGetDownLink(item) == child) { /* Return accurate pointer to where link is now */ stack->bts_blkno = blkno; @@ -1993,7 +2007,7 @@ _bt_getstackbuf(Relation rel, BTStack stack) itemid = PageGetItemId(page, offnum); item = (IndexTuple) PageGetItem(page, itemid); - if (BTreeInnerTupleGetDownLink(item) == stack->bts_btentry) + if (BTreeInnerTupleGetDownLink(item) == child) { /* Return accurate pointer to where link is now */ stack->bts_blkno = blkno; diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 7c0e6d13a9..18c6de21c1 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -1189,8 +1189,7 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack, * non-unique high keys in leaf level pages. Even heapkeyspace indexes * can have a stale stack due to insertions into the parent. */ - stack->bts_btentry = child; - pbuf = _bt_getstackbuf(rel, stack); + pbuf = _bt_getstackbuf(rel, stack, child); if (pbuf == InvalidBuffer) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 19735bf733..7f77ed24c5 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -146,23 +146,16 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, par_blkno = BufferGetBlockNumber(*bufP); /* - * We need to save the location of the index entry we chose in the - * parent page on a stack. In case we split the tree, we'll use the - * stack to work back up to the parent page. We also save the actual - * downlink (block) to uniquely identify the index entry, in case it - * moves right while we're working lower in the tree. See the paper - * by Lehman and Yao for how this is detected and handled. (We use the - * child link during the second half of a page split -- if caller ends - * up splitting the child it usually ends up inserting a new pivot - * tuple for child's new right sibling immediately after the original - * bts_offset offset recorded here. The downlink block will be needed - * to check if bts_offset remains the position of this same pivot - * tuple.) + * We need to save the location of the pivot tuple we chose in the + * parent page on a stack. If we need to split a page, we'll use + * the stack to work back up to its parent page. If caller ends up + * splitting a page one level down, it usually ends up inserting a + * new pivot tuple/downlink immediately after the location recorded + * here. */ new_stack = (BTStack) palloc(sizeof(BTStackData)); new_stack->bts_blkno = par_blkno; new_stack->bts_offset = offnum; - new_stack->bts_btentry = blkno; new_stack->bts_parent = stack_in; /* diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 83e0e6c28e..7e54c456f7 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -403,20 +403,16 @@ typedef struct BTMetaPageData #define BT_WRITE BUFFER_LOCK_EXCLUSIVE /* - * BTStackData -- As we descend a tree, we push the (location, downlink) - * pairs from internal pages onto a private stack. If we split a - * leaf, we use this stack to walk back up the tree and insert data - * into parent pages (and possibly to split them, too). Lehman and - * Yao's update algorithm guarantees that under no circumstances can - * our private stack give us an irredeemably bad picture up the tree. - * Again, see the paper for details. + * BTStackData -- As we descend a tree, we push the location of pivot + * tuples whose downlink we are about to follow onto a private stack. If + * we split a leaf, we use this stack to walk back up the tree and insert + * data into its parent page at the correct location. We may also have to + * recursively split a grandparent of the leaf page (and so on). */ - typedef struct BTStackData { BlockNumber bts_blkno; OffsetNumber bts_offset; - BlockNumber bts_btentry; struct BTStackData *bts_parent; } BTStackData; @@ -731,7 +727,7 @@ extern void _bt_parallel_advance_array_keys(IndexScanDesc scan); */ extern bool _bt_doinsert(Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, Relation heapRel); -extern Buffer _bt_getstackbuf(Relation rel, BTStack stack); +extern Buffer _bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child); extern void _bt_finish_split(Relation rel, Buffer bbuf, BTStack stack); /* -- 2.40.0