From: Tom Lane Date: Fri, 21 Jul 2000 22:14:09 +0000 (+0000) Subject: Further cleanup of btbuild (CREATE INDEX). Avoid storing unneeded X-Git-Tag: REL7_1_BETA~872 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=421f0baaffde574ba7d866169a0d617d1b529823;p=postgresql Further cleanup of btbuild (CREATE INDEX). Avoid storing unneeded left keys during bottom-up index build, and leave some free space instead of packing the pages to the brim (so as to avoid vast numbers of page splits during the first interactive insertions). --- diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 9ae596ab23..e771798dc5 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -1,4 +1,4 @@ -$Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.2 2000/07/21 06:42:32 tgl Exp $ +$Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.3 2000/07/21 22:14:09 tgl Exp $ This directory contains a correct implementation of Lehman and Yao's high-concurrency B-tree management algorithm (P. Lehman and S. Yao, @@ -168,8 +168,7 @@ Notes about data representation: Notes to operator class implementors: -+ With this implementation, we require the user to supply us with - a procedure for pg_amproc. This procedure should take two keys - A and B and return < 0, 0, or > 0 if A < B, A = B, or A > B, - respectively. See the contents of that relation for the btree - access method for some samples. ++ With this implementation, we require each supported datatype to supply + us with a comparison procedure via pg_amproc. This procedure must take + two nonnull values A and B and return an int32 < 0, 0, or > 0 if A < B, + A = B, or A > B, respectively. See nbtcompare.c for examples. diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 1981f55469..40408e2b3b 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -13,8 +13,8 @@ * its parent level. When we have only one page on a level, it must be * the root -- it can be attached to the btree metapage and we are done. * - * this code is moderately slow (~10% slower) compared to the regular - * btree (insertion) build code on sorted or well-clustered data. on + * This code is moderately slow (~10% slower) compared to the regular + * btree (insertion) build code on sorted or well-clustered data. On * random data, however, the insertion build code is unusable -- the * difference on a 60MB heap is a factor of 15 because the random * probes into the btree thrash the buffer pool. (NOTE: the above @@ -22,25 +22,20 @@ * not very good external sort implementation that used to exist in * this module. tuplesort.c is almost certainly faster.) * - * this code currently packs the pages to 100% of capacity. this is - * not wise, since *any* insertion will cause splitting. filling to - * something like the standard 70% steady-state load factor for btrees - * would probably be better. - * - * Another limitation is that we currently load full copies of all keys - * into upper tree levels. The leftmost data key in each non-leaf node - * could be omitted as far as normal btree operations are concerned - * (see README for more info). However, because we build the tree from - * the bottom up, we need that data key to insert into the node's parent. - * This could be fixed by keeping a spare copy of the minimum key in the - * state stack, but I haven't time for that right now. + * It is not wise to pack the pages entirely full, since then *any* + * insertion would cause a split (and not only of the leaf page; the need + * for a split would cascade right up the tree). The steady-state load + * factor for btrees is usually estimated at 70%. We choose to pack leaf + * pages to 90% and upper pages to 70%. This gives us reasonable density + * (there aren't many upper pages if the keys are reasonable-size) without + * incurring a lot of cascading splits during early insertions. * * * Portions Copyright (c) 1996-2000, PostgreSQL, Inc * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.55 2000/07/21 06:42:33 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.56 2000/07/21 22:14:09 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -51,14 +46,6 @@ #include "utils/tuplesort.h" -/* - * turn on debugging output. - * - * XXX this code just does a numeric printf of the index key, so it's - * only really useful for integer keys. - */ -/*#define FASTBUILD_DEBUG*/ - /* * Status record for spooling. */ @@ -72,13 +59,24 @@ struct BTSpool /* * Status record for a btree page being built. We have one of these * for each active tree level. + * + * The reason we need to store a copy of the minimum key is that we'll + * need to propagate it to the parent node when this page is linked + * into its parent. However, if the page is not a leaf page, the first + * entry on the page doesn't need to contain a key, so we will not have + * stored the key itself on the page. (You might think we could skip + * copying the minimum key on leaf pages, but actually we must have a + * writable copy anyway because we'll poke the page's address into it + * before passing it up to the parent...) */ typedef struct BTPageState { Buffer btps_buf; /* current buffer & page */ Page btps_page; + BTItem btps_minkey; /* copy of minimum key (first item) on page */ OffsetNumber btps_lastoff; /* last item offset loaded */ - int btps_level; + int btps_level; /* tree level (0 = leaf) */ + Size btps_full; /* "full" if less than this much free space */ struct BTPageState *btps_next; /* link to parent level, if any */ } BTPageState; @@ -90,12 +88,14 @@ typedef struct BTPageState 0) -static void _bt_load(Relation index, BTSpool *btspool); -static void _bt_buildadd(Relation index, BTPageState *state, - BTItem bti, int flags); -static BTItem _bt_minitem(Page opage, BlockNumber oblkno, int atend); +static void _bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags); static BTPageState *_bt_pagestate(Relation index, int flags, int level); +static void _bt_slideleft(Relation index, Buffer buf, Page page); +static void _bt_sortaddtup(Page page, Size itemsize, + BTItem btitem, OffsetNumber itup_off); +static void _bt_buildadd(Relation index, BTPageState *state, BTItem bti); static void _bt_uppershutdown(Relation index, BTPageState *state); +static void _bt_load(Relation index, BTSpool *btspool); /* @@ -190,6 +190,35 @@ _bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags) opaque->btpo_flags = flags; } +/* + * allocate and initialize a new BTPageState. the returned structure + * is suitable for immediate use by _bt_buildadd. + */ +static BTPageState * +_bt_pagestate(Relation index, int flags, int level) +{ + BTPageState *state = (BTPageState *) palloc(sizeof(BTPageState)); + + MemSet((char *) state, 0, sizeof(BTPageState)); + + /* create initial page */ + _bt_blnewpage(index, &(state->btps_buf), &(state->btps_page), flags); + + state->btps_minkey = (BTItem) NULL; + /* initialize lastoff so first item goes into P_FIRSTKEY */ + state->btps_lastoff = P_HIKEY; + state->btps_level = level; + /* set "full" threshold based on level. See notes at head of file. */ + if (level > 0) + state->btps_full = (PageGetPageSize(state->btps_page) * 3) / 10; + else + state->btps_full = PageGetPageSize(state->btps_page) / 10; + /* no parent level, yet */ + state->btps_next = (BTPageState *) NULL; + + return state; +} + /* * slide an array of ItemIds back one slot (from P_FIRSTKEY to * P_HIKEY, overwriting P_HIKEY). we need to do this when we discover @@ -219,53 +248,49 @@ _bt_slideleft(Relation index, Buffer buf, Page page) } /* - * allocate and initialize a new BTPageState. the returned structure - * is suitable for immediate use by _bt_buildadd. - */ -static BTPageState * -_bt_pagestate(Relation index, int flags, int level) -{ - BTPageState *state = (BTPageState *) palloc(sizeof(BTPageState)); - - MemSet((char *) state, 0, sizeof(BTPageState)); - _bt_blnewpage(index, &(state->btps_buf), &(state->btps_page), flags); - state->btps_lastoff = P_HIKEY; - state->btps_next = (BTPageState *) NULL; - state->btps_level = level; - - return state; -} - -/* - * return a copy of the minimum (P_HIKEY or P_FIRSTKEY) item on - * 'opage'. the copy is modified to point to 'opage' (as opposed to - * the page to which the item used to point, e.g., a heap page if - * 'opage' is a leaf page). + * Add an item to a page being built. + * + * The main difference between this routine and a bare PageAddItem call + * is that this code knows that the leftmost data item on a non-leaf + * btree page doesn't need to have a key. Therefore, it strips such + * items down to just the item header. + * + * This is almost like nbtinsert.c's _bt_pgaddtup(), but we can't use + * that because it assumes that P_RIGHTMOST() will return the correct + * answer for the page. Here, we don't know yet if the page will be + * rightmost. Offset P_FIRSTKEY is always the first data key. */ -static BTItem -_bt_minitem(Page opage, BlockNumber oblkno, int atend) +static void +_bt_sortaddtup(Page page, + Size itemsize, + BTItem btitem, + OffsetNumber itup_off) { - OffsetNumber off; - BTItem obti; - BTItem nbti; + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + BTItemData truncitem; - off = atend ? P_HIKEY : P_FIRSTKEY; - obti = (BTItem) PageGetItem(opage, PageGetItemId(opage, off)); - nbti = _bt_formitem(&(obti->bti_itup)); - ItemPointerSet(&(nbti->bti_itup.t_tid), oblkno, P_HIKEY); + if (! P_ISLEAF(opaque) && itup_off == P_FIRSTKEY) + { + memcpy(&truncitem, btitem, sizeof(BTItemData)); + truncitem.bti_itup.t_info = sizeof(BTItemData); + btitem = &truncitem; + itemsize = sizeof(BTItemData); + } - return nbti; + if (PageAddItem(page, (Item) btitem, itemsize, itup_off, + LP_USED) == InvalidOffsetNumber) + elog(FATAL, "btree: failed to add item to the page in _bt_sort"); } -/* - * add an item to a disk page from the sort output. +/*---------- + * Add an item to a disk page from the sort output. * - * we must be careful to observe the following restrictions, placed - * upon us by the conventions in nbtsearch.c: - * - rightmost pages start data items at P_HIKEY instead of at - * P_FIRSTKEY. + * We must be careful to observe the page layout conventions of nbtsearch.c: + * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY. + * - on non-leaf pages, the key portion of the first item need not be + * stored, we should store only the link. * - * a leaf page being built looks like: + * A leaf page being built looks like: * * +----------------+---------------------------------+ * | PageHeaderData | linp0 linp1 linp2 ... | @@ -280,16 +305,18 @@ _bt_minitem(Page opage, BlockNumber oblkno, int atend) * | ... item3 item2 item1 | "special space" | * +--------------------------------+-----------------+ * - * contrast this with the diagram in bufpage.h; note the mismatch - * between linps and items. this is because we reserve linp0 as a + * Contrast this with the diagram in bufpage.h; note the mismatch + * between linps and items. This is because we reserve linp0 as a * placeholder for the pointer to the "high key" item; when we have * filled up the page, we will set linp0 to point to itemN and clear - * linpN. + * linpN. On the other hand, if we find this is the last (rightmost) + * page, we leave the items alone and slide the linp array over. * * 'last' pointer indicates the last offset added to the page. + *---------- */ static void -_bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags) +_bt_buildadd(Relation index, BTPageState *state, BTItem bti) { Buffer nbuf; Page npage; @@ -321,44 +348,34 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags) btisz, (PageGetPageSize(npage) - sizeof(PageHeaderData) - MAXALIGN(sizeof(BTPageOpaqueData))) /3 - sizeof(ItemIdData)); - if (pgspc < btisz) + if (pgspc < btisz || pgspc < state->btps_full) { /* - * Item won't fit on this page, so finish off the page and - * write it out. + * Item won't fit on this page, or we feel the page is full enough + * already. Finish off the page and write it out. */ Buffer obuf = nbuf; Page opage = npage; ItemId ii; ItemId hii; - BTItem nbti; + BTItem obti; - _bt_blnewpage(index, &nbuf, &npage, flags); + /* Create new page */ + _bt_blnewpage(index, &nbuf, &npage, + (state->btps_level > 0) ? 0 : BTP_LEAF); /* * We copy the last item on the page into the new page, and then * rearrange the old page so that the 'last item' becomes its high - * key rather than a true data item. - * - * note that since we always copy an item to the new page, - * 'bti' will never be the first data item on the new page. + * key rather than a true data item. There had better be at least + * two items on the page already, else the page would be empty of + * useful data. (Hence, we must allow pages to be packed at least + * 2/3rds full; the 70% figure used above is close to minimum.) */ + Assert(last_off > P_FIRSTKEY); ii = PageGetItemId(opage, last_off); - if (PageAddItem(npage, PageGetItem(opage, ii), ii->lp_len, - P_FIRSTKEY, LP_USED) == InvalidOffsetNumber) - elog(FATAL, "btree: failed to add item to the page in _bt_sort (1)"); -#ifdef FASTBUILD_DEBUG - { - bool isnull; - BTItem tmpbti = - (BTItem) PageGetItem(npage, PageGetItemId(npage, P_FIRSTKEY)); - Datum d = index_getattr(&(tmpbti->bti_itup), 1, - index->rd_att, &isnull); - - printf("_bt_buildadd: moved <%x> to offset %d at level %d\n", - d, P_FIRSTKEY, state->btps_level); - } -#endif + obti = (BTItem) PageGetItem(opage, ii); + _bt_sortaddtup(npage, ItemIdGetLength(ii), obti, P_FIRSTKEY); /* * Move 'last' into the high key position on opage @@ -369,12 +386,39 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags) ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData); /* - * Reset last_off to point to new page + * Link the old buffer into its parent, using its minimum key. + * If we don't have a parent, we have to create one; + * this adds a new btree level. */ - last_off = PageGetMaxOffsetNumber(npage); + if (state->btps_next == (BTPageState *) NULL) + { + state->btps_next = + _bt_pagestate(index, 0, state->btps_level + 1); + } + Assert(state->btps_minkey != NULL); + ItemPointerSet(&(state->btps_minkey->bti_itup.t_tid), + BufferGetBlockNumber(obuf), P_HIKEY); + _bt_buildadd(index, state->btps_next, state->btps_minkey); + pfree((void *) state->btps_minkey); /* - * set the page (side link) pointers. + * Save a copy of the minimum key for the new page. We have to + * copy it off the old page, not the new one, in case we are + * not at leaf level. + */ + state->btps_minkey = _bt_formitem(&(obti->bti_itup)); + + /* + * Set the sibling links for both pages, and parent links too. + * + * It's not necessary to set the parent link at all, because it's + * only used for handling concurrent root splits, but we may as well + * do it as a debugging aid. Note we set new page's link as well + * as old's, because if the new page turns out to be the last of + * the level, _bt_uppershutdown won't change it. The links may be + * out of date by the time the build finishes, but that's OK; they + * need only point to a left-sibling of the true parent. See the + * README file for more info. */ { BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage); @@ -383,46 +427,40 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags) oopaque->btpo_next = BufferGetBlockNumber(nbuf); nopaque->btpo_prev = BufferGetBlockNumber(obuf); nopaque->btpo_next = P_NONE; + oopaque->btpo_parent = nopaque->btpo_parent = + BufferGetBlockNumber(state->btps_next->btps_buf); } /* - * Link the old buffer into its parent, using its minimum key. - * If we don't have a parent, we have to create one; - * this adds a new btree level. + * Write out the old page. We never want to see it again, so we + * can give up our lock (if we had one; most likely BuildingBtree + * is set, so we aren't locking). */ - if (state->btps_next == (BTPageState *) NULL) - { - state->btps_next = - _bt_pagestate(index, 0, state->btps_level + 1); - } - nbti = _bt_minitem(opage, BufferGetBlockNumber(obuf), 0); - _bt_buildadd(index, state->btps_next, nbti, 0); - pfree((void *) nbti); + _bt_wrtbuf(index, obuf); /* - * write out the old stuff. we never want to see it again, so we - * can give up our lock (if we had one; BuildingBtree is set, so - * we aren't locking). + * Reset last_off to point to new page */ - _bt_wrtbuf(index, obuf); + last_off = P_FIRSTKEY; } /* - * Add the new item into the current page. + * If the new item is the first for its page, stash a copy for later. + * Note this will only happen for the first item on a level; on later + * pages, the first item for a page is copied from the prior page + * in the code above. */ - last_off = OffsetNumberNext(last_off); - if (PageAddItem(npage, (Item) bti, btisz, - last_off, LP_USED) == InvalidOffsetNumber) - elog(FATAL, "btree: failed to add item to the page in _bt_sort (2)"); -#ifdef FASTBUILD_DEBUG + if (last_off == P_HIKEY) { - bool isnull; - Datum d = index_getattr(&(bti->bti_itup), 1, index->rd_att, &isnull); - - printf("_bt_buildadd: inserted <%x> at offset %d at level %d\n", - d, last_off, state->btps_level); + Assert(state->btps_minkey == NULL); + state->btps_minkey = _bt_formitem(&(bti->bti_itup)); } -#endif + + /* + * Add the new item into the current page. + */ + last_off = OffsetNumberNext(last_off); + _bt_sortaddtup(npage, btisz, bti, last_off); state->btps_buf = nbuf; state->btps_page = npage; @@ -436,15 +474,15 @@ static void _bt_uppershutdown(Relation index, BTPageState *state) { BTPageState *s; - BlockNumber blkno; - BTPageOpaque opaque; - BTItem bti; /* * Each iteration of this loop completes one more level of the tree. */ for (s = state; s != (BTPageState *) NULL; s = s->btps_next) { + BlockNumber blkno; + BTPageOpaque opaque; + blkno = BufferGetBlockNumber(s->btps_buf); opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page); @@ -463,9 +501,12 @@ _bt_uppershutdown(Relation index, BTPageState *state) } else { - bti = _bt_minitem(s->btps_page, blkno, 0); - _bt_buildadd(index, s->btps_next, bti, 0); - pfree((void *) bti); + Assert(s->btps_minkey != NULL); + ItemPointerSet(&(s->btps_minkey->bti_itup.t_tid), + blkno, P_HIKEY); + _bt_buildadd(index, s->btps_next, s->btps_minkey); + pfree((void *) s->btps_minkey); + s->btps_minkey = NULL; } /* @@ -500,11 +541,13 @@ _bt_load(Relation index, BTSpool *btspool) if (state == NULL) state = _bt_pagestate(index, BTP_LEAF, 0); - _bt_buildadd(index, state, bti, BTP_LEAF); + _bt_buildadd(index, state, bti); + if (should_free) pfree((void *) bti); } + /* Close down final pages, if we had any data at all */ if (state != NULL) _bt_uppershutdown(index, state); }