From d2086b08b023c0749a53d617ff3fe0f052646254 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Sat, 28 Jul 2018 00:31:40 +0300 Subject: [PATCH] Reduce path length for locking leaf B-tree pages during insertion In our B-tree implementation appropriate leaf page for new tuple insertion is acquired using _bt_search() function. This function always returns leaf page locked in shared mode. In order to obtain exclusive lock, caller have to relock the page. This commit makes _bt_search() function lock leaf page immediately in exclusive mode when needed. That removes unnecessary relock and, in turn reduces lock contention for B-tree leaf pages. Our experiments on multi-core systems showed acceleration up to 4.5 times in corner case. Discussion: https://postgr.es/m/CAPpHfduAMDFMNYTCN7VMBsFg_hsf0GqiqXnt%2BbSeaJworwFoig%40mail.gmail.com Author: Alexander Korotkov Reviewed-by: Yoshikazu Imai, Simon Riggs, Peter Geoghegan --- src/backend/access/nbtree/nbtinsert.c | 19 +++--------- src/backend/access/nbtree/nbtsearch.c | 44 ++++++++++++++++++++++----- 2 files changed, 41 insertions(+), 22 deletions(-) diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 4b2b4746f7..582e5b0652 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -216,23 +216,12 @@ top: if (!fastpath) { - /* find the first page containing this key */ - stack = _bt_search(rel, indnkeyatts, itup_scankey, false, &buf, BT_WRITE, - NULL); - - /* trade in our read lock for a write lock */ - LockBuffer(buf, BUFFER_LOCK_UNLOCK); - LockBuffer(buf, BT_WRITE); - /* - * If the page was split between the time that we surrendered our read - * lock and acquired our write lock, then this page may no longer be - * the right place for the key we want to insert. In this case, we - * need to move right in the tree. See Lehman and Yao for an - * excruciatingly precise description. + * Find the first page containing this key. Buffer returned by + * _bt_search() is locked in exclusive mode. */ - buf = _bt_moveright(rel, buf, indnkeyatts, itup_scankey, false, - true, stack, BT_WRITE, NULL); + stack = _bt_search(rel, indnkeyatts, itup_scankey, false, &buf, BT_WRITE, + NULL); } /* diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 6831bc8c03..d3700bd082 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -87,17 +87,18 @@ _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp) * place during the descent through the tree. This is not needed when * positioning for an insert or delete, so NULL is used for those cases. * - * NOTE that the returned buffer is read-locked regardless of the access - * parameter. However, access = BT_WRITE will allow an empty root page - * to be created and returned. When access = BT_READ, an empty index - * will result in *bufP being set to InvalidBuffer. Also, in BT_WRITE mode, - * any incomplete splits encountered during the search will be finished. + * The returned buffer is locked according to access parameter. Additionally, + * access = BT_WRITE will allow an empty root page to be created and returned. + * When access = BT_READ, an empty index will result in *bufP being set to + * InvalidBuffer. Also, in BT_WRITE mode, any incomplete splits encountered + * during the search will be finished. */ BTStack _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, Buffer *bufP, int access, Snapshot snapshot) { BTStack stack_in = NULL; + int page_access = BT_READ; /* Get the root page to start with */ *bufP = _bt_getroot(rel, access); @@ -132,7 +133,7 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, */ *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, (access == BT_WRITE), stack_in, - BT_READ, snapshot); + page_access, snapshot); /* if this is a leaf page, we're done */ page = BufferGetPage(*bufP); @@ -166,13 +167,42 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, new_stack->bts_btentry = blkno; new_stack->bts_parent = stack_in; + /* + * Page level 1 is lowest non-leaf page level prior to leaves. So, + * if we're on the level 1 and asked to lock leaf page in write mode, + * then lock next page in write mode, because it must be a leaf. + */ + if (opaque->btpo.level == 1 && access == BT_WRITE) + page_access = BT_WRITE; + /* drop the read lock on the parent page, acquire one on the child */ - *bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ); + *bufP = _bt_relandgetbuf(rel, *bufP, blkno, page_access); /* okay, all set to move down a level */ stack_in = new_stack; } + /* + * If we're asked to lock leaf in write mode, but didn't manage to, then + * relock. That may happend when root page appears to be leaf. + */ + if (access == BT_WRITE && page_access == BT_READ) + { + /* trade in our read lock for a write lock */ + LockBuffer(*bufP, BUFFER_LOCK_UNLOCK); + LockBuffer(*bufP, BT_WRITE); + + /* + * If the page was split between the time that we surrendered our read + * lock and acquired our write lock, then this page may no longer be + * the right place for the key we want to insert. In this case, we + * need to move right in the tree. See Lehman and Yao for an + * excruciatingly precise description. + */ + *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, + true, stack_in, BT_WRITE, snapshot); + } + return stack_in; } -- 2.40.0