*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.155 2007/03/25 19:45:14 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.156 2007/04/11 20:47:37 tgl Exp $
*
*-------------------------------------------------------------------------
*/
Relation heapRel, Buffer buf, OffsetNumber ioffset,
ScanKey itup_scankey);
static void _bt_findinsertloc(Relation rel,
- Buffer *bufptr,
+ Buffer *bufptr,
OffsetNumber *offsetptr,
int keysz,
ScanKey scankey,
OffsetNumber newitemoff,
Size newitemsz,
bool *newitemonleft);
-static void _bt_checksplitloc(FindSplitData *state,
+static void _bt_checksplitloc(FindSplitData *state,
OffsetNumber firstoldonright, bool newitemonleft,
int dataitemstoleft, Size firstoldonrightsz);
static void _bt_pgaddtup(Relation rel, Page page,
* the hint supplied by the caller invalid */
vacuumed = true;
- if (PageGetFreeSpace(page) >= itemsz)
+ if (PageGetFreeSpace(page) >= itemsz)
break; /* OK, now we have enough space */
}
* moved right at all, we know we should insert at the start of the
* page. If we didn't move right, we can use the firstlegaloff hint
* if the caller supplied one, unless we vacuumed the page which
- * might have moved tuples around making the hint invalid. If we
+ * might have moved tuples around making the hint invalid. If we
* didn't move right or can't use the hint, find the position
* by searching.
*/
Buffer sbuf = InvalidBuffer;
Page spage = NULL;
BTPageOpaque sopaque = NULL;
- OffsetNumber itup_off = 0;
- BlockNumber itup_blkno = 0;
Size itemsz;
ItemId itemid;
IndexTuple item;
_bt_pageinit(leftpage, BufferGetPageSize(buf));
/* rightpage was already initialized by _bt_getbuf */
+ /*
+ * Copy the original page's LSN and TLI into leftpage, which will become
+ * the updated version of the page. We need this because XLogInsert will
+ * examine these fields and possibly dump them in a page image.
+ */
+ PageSetLSN(leftpage, PageGetLSN(origpage));
+ PageSetTLI(leftpage, PageGetTLI(origpage));
+
/* init btree private data */
oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
leftoff = OffsetNumberNext(leftoff);
/*
- * Now transfer all the data items to the appropriate page
+ * Now transfer all the data items to the appropriate page.
+ *
+ * Note: we *must* insert at least the right page's items in item-number
+ * order, for the benefit of _bt_restore_page().
*/
maxoff = PageGetMaxOffsetNumber(origpage);
{
_bt_pgaddtup(rel, leftpage, newitemsz, newitem, leftoff,
"left sibling");
- itup_off = leftoff;
- itup_blkno = BufferGetBlockNumber(buf);
leftoff = OffsetNumberNext(leftoff);
}
else
{
_bt_pgaddtup(rel, rightpage, newitemsz, newitem, rightoff,
"right sibling");
- itup_off = rightoff;
- itup_blkno = BufferGetBlockNumber(rbuf);
rightoff = OffsetNumberNext(rightoff);
}
}
Assert(!newitemonleft);
_bt_pgaddtup(rel, rightpage, newitemsz, newitem, rightoff,
"right sibling");
- itup_off = rightoff;
- itup_blkno = BufferGetBlockNumber(rbuf);
rightoff = OffsetNumberNext(rightoff);
}
/*
* Right sibling is locked, new siblings are prepared, but original page
- * is not updated yet. Log changes before continuing.
+ * is not updated yet.
*
* NO EREPORT(ERROR) till right sibling is updated. We can get away with
* not starting the critical section till here because we haven't been
*/
START_CRIT_SECTION();
- MarkBufferDirty(buf);
- MarkBufferDirty(rbuf);
-
- if (!P_RIGHTMOST(ropaque))
- {
- sopaque->btpo_prev = BufferGetBlockNumber(rbuf);
- MarkBufferDirty(sbuf);
- }
-
/*
* By here, the original data page has been split into two new halves, and
* these are correct. The algorithm requires that the left page never
*/
PageRestoreTempPage(leftpage, origpage);
+ MarkBufferDirty(buf);
+ MarkBufferDirty(rbuf);
+
+ if (!P_RIGHTMOST(ropaque))
+ {
+ sopaque->btpo_prev = BufferGetBlockNumber(rbuf);
+ MarkBufferDirty(sbuf);
+ }
+
/* XLOG stuff */
if (!rel->rd_istemp)
{
xlrec.node = rel->rd_node;
xlrec.leftsib = BufferGetBlockNumber(buf);
xlrec.rightsib = BufferGetBlockNumber(rbuf);
- xlrec.firstright = firstright;
xlrec.rnext = ropaque->btpo_next;
xlrec.level = ropaque->btpo.level;
+ xlrec.firstright = firstright;
rdata[0].data = (char *) &xlrec;
rdata[0].len = SizeOfBtreeSplit;
lastrdata->buffer = InvalidBuffer;
}
- /* Log the new item, if it was inserted on the left page. If it was
- * put on the right page, we don't need to explicitly WAL log it
- * because it's included with all the other items on the right page.
+ /*
+ * Log the new item and its offset, if it was inserted on the left
+ * page. (If it was put on the right page, we don't need to explicitly
+ * WAL log it because it's included with all the other items on the
+ * right page.) Show these as belonging to the left page buffer,
+ * so that they are not stored if XLogInsert decides it needs a
+ * full-page image of the left page.
*/
- lastrdata->next = lastrdata + 1;
- lastrdata++;
if (newitemonleft)
{
+ lastrdata->next = lastrdata + 1;
+ lastrdata++;
lastrdata->data = (char *) &newitemoff;
lastrdata->len = sizeof(OffsetNumber);
lastrdata->buffer = buf; /* backup block 1 */
lastrdata->next = lastrdata + 1;
lastrdata++;
- lastrdata->data = (char *)newitem;
- lastrdata->len = newitemsz;
+ lastrdata->data = (char *) newitem;
+ lastrdata->len = MAXALIGN(newitemsz);
lastrdata->buffer = buf; /* backup block 1 */
lastrdata->buffer_std = true;
}
else
{
+ /*
+ * Although we don't need to WAL-log the new item, we still
+ * need XLogInsert to consider storing a full-page image of the
+ * left page, so make an empty entry referencing that buffer.
+ * This also ensures that the left page is always backup block 1.
+ */
+ lastrdata->next = lastrdata + 1;
+ lastrdata++;
lastrdata->data = NULL;
lastrdata->len = 0;
lastrdata->buffer = buf; /* backup block 1 */
lastrdata->buffer_std = true;
}
- /* Log the contents of the right page in the format understood by
+ /*
+ * Log the contents of the right page in the format understood by
* _bt_restore_page(). We set lastrdata->buffer to InvalidBuffer,
- * because we're going to recreate the whole page anyway.
+ * because we're going to recreate the whole page anyway, so it
+ * should never be stored by XLogInsert.
*
* Direct access to page is not good but faster - we should implement
* some new func in page API. Note we only store the tuples
- * themselves, knowing that the item pointers are in the same order
- * and can be reconstructed by scanning the tuples. See comments for
+ * themselves, knowing that they were inserted in item-number order
+ * and so the item pointers can be reconstructed. See comments for
* _bt_restore_page().
*/
lastrdata->next = lastrdata + 1;
lastrdata++;
- lastrdata->data = (char *) rightpage +
+ lastrdata->data = (char *) rightpage +
((PageHeader) rightpage)->pd_upper;
lastrdata->len = ((PageHeader) rightpage)->pd_special -
((PageHeader) rightpage)->pd_upper;
lastrdata->buffer = InvalidBuffer;
- /* Log the right sibling, because we've changed it's prev-pointer. */
+ /* Log the right sibling, because we've changed its prev-pointer. */
if (!P_RIGHTMOST(ropaque))
{
lastrdata->next = lastrdata + 1;
olddataitemstoleft = 0;
goodenoughfound = false;
maxoff = PageGetMaxOffsetNumber(page);
-
+
for (offnum = P_FIRSTDATAKEY(opaque);
offnum <= maxoff;
offnum = OffsetNumberNext(offnum))
olddataitemstoleft, itemsz);
else if (offnum < newitemoff)
- _bt_checksplitloc(&state, offnum, false,
+ _bt_checksplitloc(&state, offnum, false,
olddataitemstoleft, itemsz);
else
{
* items go to the left page and only the new item goes to the right page.
* In that case, firstoldonrightsz is not used.
*
- * olddataitemstoleft is the total size of all old items to the left of
- * firstoldonright.
+ * olddataitemstoleft is the total size of all old items to the left of
+ * firstoldonright.
*/
static void
-_bt_checksplitloc(FindSplitData *state,
+_bt_checksplitloc(FindSplitData *state,
OffsetNumber firstoldonright,
bool newitemonleft,
int olddataitemstoleft,
/* Account for all the old tuples */
leftfree = state->leftspace - olddataitemstoleft;
- rightfree = state->rightspace -
+ rightfree = state->rightspace -
(state->olddataitemstotal - olddataitemstoleft);
/*
BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
/*
- * Scan over all items to see which ones need to be deleted
+ * Scan over all items to see which ones need to be deleted
* according to LP_DELETE flags.
*/
minoff = P_FIRSTDATAKEY(opaque);
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.42 2007/02/08 05:05:53 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.43 2007/04/11 20:47:38 tgl Exp $
*
*-------------------------------------------------------------------------
*/
* in correct itemno sequence, but physically the opposite order from the
* original, because we insert them in the opposite of itemno order. This
* does not matter in any current btree code, but it's something to keep an
- * eye on. Is it worth changing just on general principles?
+ * eye on. Is it worth changing just on general principles? See also the
+ * notes in btree_xlog_split().
*/
static void
_bt_restore_page(Page page, char *from, int len)
{
xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record);
Relation reln;
- Buffer lbuf, rbuf;
- Page lpage, rpage;
- BTPageOpaque ropaque, lopaque;
+ Buffer rbuf;
+ Page rpage;
+ BTPageOpaque ropaque;
char *datapos;
int datalen;
- bool bkp_left = record->xl_info & XLR_BKP_BLOCK_1;
- bool bkp_nextsib = record->xl_info & XLR_BKP_BLOCK_2;
- OffsetNumber newitemoff;
+ OffsetNumber newitemoff = 0;
Item newitem = NULL;
Size newitemsz = 0;
/* Forget any split this insertion completes */
if (xlrec->level > 0)
{
+ /* we assume SizeOfBtreeSplit is at least 16-bit aligned */
BlockNumber downlink = BlockIdGetBlockNumber((BlockId) datapos);
datapos += sizeof(BlockIdData);
forget_matching_split(xlrec->node, downlink, false);
}
-
- /* Extract newitem and newitemoff */
- if (!bkp_left && onleft)
+ /* Extract newitem and newitemoff, if present */
+ if (onleft && !(record->xl_info & XLR_BKP_BLOCK_1))
{
IndexTupleData itupdata;
- /* Extract the offset of the new tuple and it's contents */
+ /* Extract the offset (still assuming 16-bit alignment) */
memcpy(&newitemoff, datapos, sizeof(OffsetNumber));
datapos += sizeof(OffsetNumber);
datalen -= sizeof(OffsetNumber);
+ /*
+ * We need to copy the tuple header to apply IndexTupleDSize, because
+ * of alignment considerations. However, we assume that PageAddItem
+ * doesn't care about the alignment of the newitem pointer it's given.
+ */
newitem = datapos;
- /* Need to copy tuple header due to alignment considerations */
memcpy(&itupdata, datapos, sizeof(IndexTupleData));
newitemsz = IndexTupleDSize(itupdata);
newitemsz = MAXALIGN(newitemsz);
datalen -= newitemsz;
}
- /* Reconstruct right (new) sibling */
+ /* Reconstruct right (new) sibling from scratch */
rbuf = XLogReadBuffer(reln, xlrec->rightsib, true);
Assert(BufferIsValid(rbuf));
rpage = (Page) BufferGetPage(rbuf);
PageSetTLI(rpage, ThisTimeLineID);
MarkBufferDirty(rbuf);
- /* don't release the buffer yet, because reconstructing the left sibling
- * needs to access the data on the right page
- */
-
-
- /* Reconstruct left (original) sibling */
+ /* don't release the buffer yet; we touch right page's first item below */
- if(!bkp_left)
+ /*
+ * Reconstruct left (original) sibling if needed. Note that this code
+ * ensures that the items remaining on the left page are in the correct
+ * item number order, but it does not reproduce the physical order they
+ * would have had. Is this worth changing? See also _bt_restore_page().
+ */
+ if (!(record->xl_info & XLR_BKP_BLOCK_1))
{
- lbuf = XLogReadBuffer(reln, xlrec->leftsib, false);
+ Buffer lbuf = XLogReadBuffer(reln, xlrec->leftsib, false);
if (BufferIsValid(lbuf))
{
- lpage = (Page) BufferGetPage(lbuf);
- lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
+ Page lpage = (Page) BufferGetPage(lbuf);
+ BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
if (!XLByteLE(lsn, PageGetLSN(lpage)))
{
- /* Remove the items from the left page that were copied to
- * right page, and add the new item if it was inserted to
- * left page.
- */
OffsetNumber off;
OffsetNumber maxoff = PageGetMaxOffsetNumber(lpage);
+ OffsetNumber deletable[MaxOffsetNumber];
+ int ndeletable = 0;
ItemId hiItemId;
Item hiItem;
- for(off = maxoff ; off >= xlrec->firstright; off--)
- PageIndexTupleDelete(lpage, off);
+ /*
+ * Remove the items from the left page that were copied to
+ * the right page. Also remove the old high key, if any.
+ * (We must remove everything before trying to insert any
+ * items, else we risk not having enough space.)
+ */
+ if (!P_RIGHTMOST(lopaque))
+ {
+ deletable[ndeletable++] = P_HIKEY;
+ /*
+ * newitemoff is given to us relative to the original
+ * page's item numbering, so adjust it for this deletion.
+ */
+ newitemoff--;
+ }
+ for (off = xlrec->firstright; off <= maxoff; off++)
+ deletable[ndeletable++] = off;
+ if (ndeletable > 0)
+ PageIndexMultiDelete(lpage, deletable, ndeletable);
+ /*
+ * Add the new item if it was inserted on left page.
+ */
if (onleft)
{
- if (PageAddItem(lpage, newitem, newitemsz, newitemoff,
+ if (PageAddItem(lpage, newitem, newitemsz, newitemoff,
LP_USED) == InvalidOffsetNumber)
- elog(PANIC, "can't add new item to left sibling after split");
+ elog(PANIC, "failed to add new item to left page after split");
}
+
/* Set high key equal to the first key on the right page */
hiItemId = PageGetItemId(rpage, P_FIRSTDATAKEY(ropaque));
hiItem = PageGetItem(rpage, hiItemId);
- if(!P_RIGHTMOST(lopaque))
- {
- /* but remove the old high key first */
- PageIndexTupleDelete(lpage, P_HIKEY);
- }
-
- if(PageAddItem(lpage, hiItem, ItemIdGetLength(hiItemId),
- P_HIKEY, LP_USED) == InvalidOffsetNumber)
- elog(PANIC, "can't add high key after split to left page");
+ if (PageAddItem(lpage, hiItem, ItemIdGetLength(hiItemId),
+ P_HIKEY, LP_USED) == InvalidOffsetNumber)
+ elog(PANIC, "failed to add high key to left page after split");
- /* Fix opaque fields */
+ /* Fix opaque fields */
lopaque->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0;
lopaque->btpo_next = xlrec->rightsib;
lopaque->btpo_cycleid = 0;
UnlockReleaseBuffer(lbuf);
}
-
}
- /* we no longer need the right buffer. */
+ /* We no longer need the right buffer */
UnlockReleaseBuffer(rbuf);
/* Fix left-link of the page to the right of the new right sibling */
- if (!bkp_nextsib && xlrec->rnext != P_NONE)
+ if (xlrec->rnext != P_NONE && !(record->xl_info & XLR_BKP_BLOCK_2))
{
Buffer buffer = XLogReadBuffer(reln, xlrec->rnext, false);
+
if (BufferIsValid(buffer))
{
Page page = (Page) BufferGetPage(buffer);
if (!XLByteLE(lsn, PageGetLSN(page)))
{
BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
pageop->btpo_prev = xlrec->rightsib;
PageSetLSN(page, lsn);
{
xl_btree_split *xlrec = (xl_btree_split *) rec;
- appendStringInfo(buf, "split_l: rel %u/%u/%u ",
+ appendStringInfo(buf, "split_l: rel %u/%u/%u ",
xlrec->node.spcNode, xlrec->node.dbNode,
xlrec->node.relNode);
- appendStringInfo(buf, "left %u, right %u off %u level %u",
- xlrec->leftsib, xlrec->rightsib,
- xlrec->firstright, xlrec->level);
+ appendStringInfo(buf, "left %u, right %u, next %u, level %u, firstright %d",
+ xlrec->leftsib, xlrec->rightsib, xlrec->rnext,
+ xlrec->level, xlrec->firstright);
break;
}
case XLOG_BTREE_SPLIT_R:
{
xl_btree_split *xlrec = (xl_btree_split *) rec;
- appendStringInfo(buf, "split_r: rel %u/%u/%u ",
+ appendStringInfo(buf, "split_r: rel %u/%u/%u ",
xlrec->node.spcNode, xlrec->node.dbNode,
xlrec->node.relNode);
- appendStringInfo(buf, "left %u, right %u off %u level %u",
- xlrec->leftsib, xlrec->rightsib,
- xlrec->firstright, xlrec->level);
+ appendStringInfo(buf, "left %u, right %u, next %u, level %u, firstright %d",
+ xlrec->leftsib, xlrec->rightsib, xlrec->rnext,
+ xlrec->level, xlrec->firstright);
break;
}
case XLOG_BTREE_SPLIT_L_ROOT:
{
xl_btree_split *xlrec = (xl_btree_split *) rec;
- appendStringInfo(buf, "split_l_root: rel %u/%u/%u ",
+ appendStringInfo(buf, "split_l_root: rel %u/%u/%u ",
xlrec->node.spcNode, xlrec->node.dbNode,
xlrec->node.relNode);
- appendStringInfo(buf, "left %u, right %u off %u level %u",
- xlrec->leftsib, xlrec->rightsib,
- xlrec->firstright, xlrec->level);
+ appendStringInfo(buf, "left %u, right %u, next %u, level %u, firstright %d",
+ xlrec->leftsib, xlrec->rightsib, xlrec->rnext,
+ xlrec->level, xlrec->firstright);
break;
}
case XLOG_BTREE_SPLIT_R_ROOT:
{
xl_btree_split *xlrec = (xl_btree_split *) rec;
- appendStringInfo(buf, "split_r_root: rel %u/%u/%u ",
+ appendStringInfo(buf, "split_r_root: rel %u/%u/%u ",
xlrec->node.spcNode, xlrec->node.dbNode,
xlrec->node.relNode);
- appendStringInfo(buf, "left %u, right %u off %u level %u",
- xlrec->leftsib, xlrec->rightsib,
- xlrec->firstright, xlrec->level);
+ appendStringInfo(buf, "left %u, right %u, next %u, level %u, firstright %d",
+ xlrec->leftsib, xlrec->rightsib, xlrec->rnext,
+ xlrec->level, xlrec->firstright);
break;
}
case XLOG_BTREE_DELETE:
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.112 2007/04/09 22:04:08 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.113 2007/04/11 20:47:38 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#define SizeOfBtreeInsert (offsetof(xl_btreetid, tid) + SizeOfIptrData)
/*
- * On insert with split we save items of both left and right siblings
- * and restore content of both pages from log record. This way takes less
- * xlog space than the normal approach, because if we did it standardly,
+ * On insert with split, we save all the items going into the right sibling
+ * so that we can restore it completely from the log record. This way takes
+ * less xlog space than the normal approach, because if we did it the standard way,
* XLogInsert would almost always think the right page is new and store its
- * whole page image.
+ * whole page image. The left page, however, is handled in the normal
+ * incremental-update fashion.
*
* Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record.
* The _L and _R variants indicate whether the inserted tuple went into the
* left or right split page (and thus, whether newitemoff and the new item
- * are stored or not.
- * page of the split pair). The _ROOT variants indicate that we are splitting
+ * are stored or not). The _ROOT variants indicate that we are splitting
* the root page, and thus that a newroot record rather than an insert or
* split record should follow. Note that a split record never carries a
* metapage update --- we'll do that in the parent-level update.
typedef struct xl_btree_split
{
RelFileNode node;
- BlockNumber leftsib; /* orig page / new left page */
- BlockNumber rightsib; /* new right page */
- OffsetNumber firstright; /* first item stored on right page */
- BlockNumber rnext; /* next/right block pointer */
- uint32 level; /* tree level of page being split */
-
- /* BlockIdData downlink follows if level > 0 */
-
- /* OffsetNumber newitemoff follows in the _L variants. */
- /* New item follows in the _L variants */
- /* RIGHT PAGES TUPLES FOLLOW AT THE END */
+ BlockNumber leftsib; /* orig page / new left page */
+ BlockNumber rightsib; /* new right page */
+ BlockNumber rnext; /* next block (orig page's rightlink) */
+ uint32 level; /* tree level of page being split */
+ OffsetNumber firstright; /* first item moved to right page */
+
+ /*
+ * If level > 0, BlockIdData downlink follows. (We use BlockIdData
+ * rather than BlockNumber for alignment reasons: SizeOfBtreeSplit
+ * is only 16-bit aligned.)
+ *
+ * In the _L variants, OffsetNumber newitemoff and the new item come next.
+ * (In the _R variants, the new item is one of the right page's tuples.)
+ *
+ * Last are the right page's tuples in the form used by _bt_restore_page.
+ */
} xl_btree_split;
-#define SizeOfBtreeSplit (offsetof(xl_btree_split, level) + sizeof(uint32))
+#define SizeOfBtreeSplit (offsetof(xl_btree_split, firstright) + sizeof(OffsetNumber))
/*
* This is what we need to know about delete of individual leaf index tuples.
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.402 2007/04/09 22:04:08 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.403 2007/04/11 20:47:38 tgl Exp $
*
*-------------------------------------------------------------------------
*/
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 200704091
+#define CATALOG_VERSION_NO 200704111
#endif