]> granicus.if.org Git - postgresql/commitdiff
Code review for btree page split WAL reduction patch. Make it actually work
author Tom Lane <tgl@sss.pgh.pa.us>
Wed, 11 Apr 2007 20:47:38 +0000 (20:47 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Wed, 11 Apr 2007 20:47:38 +0000 (20:47 +0000)
(original code *always* created a full-page image for the left page, thus
leaving the intended savings unrealized), avoid risk of not having enough room
on the page during xlog restore, squeeze out another couple bytes in the xlog
record, clean up neglected comments.

src/backend/access/nbtree/nbtinsert.c
src/backend/access/nbtree/nbtxlog.c
src/include/access/nbtree.h
src/include/catalog/catversion.h

index c1671ce333c766b49c91b732f61d29f9f83eecff..775eaca2427e48591feebf311ce43b46e4814790 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.155 2007/03/25 19:45:14 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.156 2007/04/11 20:47:37 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -49,7 +49,7 @@ static TransactionId _bt_check_unique(Relation rel, IndexTuple itup,
                                 Relation heapRel, Buffer buf, OffsetNumber ioffset,
                                 ScanKey itup_scankey);
 static void _bt_findinsertloc(Relation rel,
-                                 Buffer *bufptr, 
+                                 Buffer *bufptr,
                                  OffsetNumber *offsetptr,
                                  int keysz,
                                  ScanKey scankey,
@@ -66,7 +66,7 @@ static OffsetNumber _bt_findsplitloc(Relation rel, Page page,
                                 OffsetNumber newitemoff,
                                 Size newitemsz,
                                 bool *newitemonleft);
-static void _bt_checksplitloc(FindSplitData *state, 
+static void _bt_checksplitloc(FindSplitData *state,
                                  OffsetNumber firstoldonright, bool newitemonleft,
                                  int dataitemstoleft, Size firstoldonrightsz);
 static void _bt_pgaddtup(Relation rel, Page page,
@@ -459,7 +459,7 @@ _bt_findinsertloc(Relation rel,
                         * the hint supplied by the caller invalid */
                        vacuumed = true;
 
-                       if (PageGetFreeSpace(page) >= itemsz) 
+                       if (PageGetFreeSpace(page) >= itemsz)
                                break;          /* OK, now we have enough space */
                }
 
@@ -506,7 +506,7 @@ _bt_findinsertloc(Relation rel,
         * moved right at all, we know we should insert at the start of the
         * page. If we didn't move right, we can use the firstlegaloff hint
         * if the caller supplied one, unless we vacuumed the page which
-        * might have moved tuples around making the hint invalid. If we 
+        * might have moved tuples around making the hint invalid. If we
         * didn't move right or can't use the hint, find the position
         * by searching.
         */
@@ -779,8 +779,6 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
        Buffer          sbuf = InvalidBuffer;
        Page            spage = NULL;
        BTPageOpaque sopaque = NULL;
-       OffsetNumber itup_off = 0;
-       BlockNumber itup_blkno = 0;
        Size            itemsz;
        ItemId          itemid;
        IndexTuple      item;
@@ -798,6 +796,14 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
        _bt_pageinit(leftpage, BufferGetPageSize(buf));
        /* rightpage was already initialized by _bt_getbuf */
 
+       /*
+        * Copy the original page's LSN and TLI into leftpage, which will become
+        * the updated version of the page.  We need this because XLogInsert will
+        * examine these fields and possibly dump them in a page image.
+        */
+       PageSetLSN(leftpage, PageGetLSN(origpage));
+       PageSetTLI(leftpage, PageGetTLI(origpage));
+
        /* init btree private data */
        oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
        lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
@@ -864,7 +870,10 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
        leftoff = OffsetNumberNext(leftoff);
 
        /*
-        * Now transfer all the data items to the appropriate page
+        * Now transfer all the data items to the appropriate page.
+        *
+        * Note: we *must* insert at least the right page's items in item-number
+        * order, for the benefit of _bt_restore_page().
         */
        maxoff = PageGetMaxOffsetNumber(origpage);
 
@@ -881,16 +890,12 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
                        {
                                _bt_pgaddtup(rel, leftpage, newitemsz, newitem, leftoff,
                                                         "left sibling");
-                               itup_off = leftoff;
-                               itup_blkno = BufferGetBlockNumber(buf);
                                leftoff = OffsetNumberNext(leftoff);
                        }
                        else
                        {
                                _bt_pgaddtup(rel, rightpage, newitemsz, newitem, rightoff,
                                                         "right sibling");
-                               itup_off = rightoff;
-                               itup_blkno = BufferGetBlockNumber(rbuf);
                                rightoff = OffsetNumberNext(rightoff);
                        }
                }
@@ -921,8 +926,6 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
                Assert(!newitemonleft);
                _bt_pgaddtup(rel, rightpage, newitemsz, newitem, rightoff,
                                         "right sibling");
-               itup_off = rightoff;
-               itup_blkno = BufferGetBlockNumber(rbuf);
                rightoff = OffsetNumberNext(rightoff);
        }
 
@@ -961,7 +964,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 
        /*
         * Right sibling is locked, new siblings are prepared, but original page
-        * is not updated yet. Log changes before continuing.
+        * is not updated yet.
         *
         * NO EREPORT(ERROR) till right sibling is updated.  We can get away with
         * not starting the critical section till here because we haven't been
@@ -970,15 +973,6 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
         */
        START_CRIT_SECTION();
 
-       MarkBufferDirty(buf);
-       MarkBufferDirty(rbuf);
-
-       if (!P_RIGHTMOST(ropaque))
-       {
-               sopaque->btpo_prev = BufferGetBlockNumber(rbuf);
-               MarkBufferDirty(sbuf);
-       }
-
        /*
         * By here, the original data page has been split into two new halves, and
         * these are correct.  The algorithm requires that the left page never
@@ -994,6 +988,15 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
         */
        PageRestoreTempPage(leftpage, origpage);
 
+       MarkBufferDirty(buf);
+       MarkBufferDirty(rbuf);
+
+       if (!P_RIGHTMOST(ropaque))
+       {
+               sopaque->btpo_prev = BufferGetBlockNumber(rbuf);
+               MarkBufferDirty(sbuf);
+       }
+
        /* XLOG stuff */
        if (!rel->rd_istemp)
        {
@@ -1006,9 +1009,9 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
                xlrec.node = rel->rd_node;
                xlrec.leftsib = BufferGetBlockNumber(buf);
                xlrec.rightsib = BufferGetBlockNumber(rbuf);
-               xlrec.firstright = firstright;
                xlrec.rnext = ropaque->btpo_next;
                xlrec.level = ropaque->btpo.level;
+               xlrec.firstright = firstright;
 
                rdata[0].data = (char *) &xlrec;
                rdata[0].len = SizeOfBtreeSplit;
@@ -1027,14 +1030,18 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
                        lastrdata->buffer = InvalidBuffer;
                }
 
-               /* Log the new item, if it was inserted on the left page. If it was 
-                * put on the right page, we don't need to explicitly WAL log it 
-                * because it's included with all the other items on the right page.
+               /*
+                * Log the new item and its offset, if it was inserted on the left
+                * page. (If it was put on the right page, we don't need to explicitly
+                * WAL log it because it's included with all the other items on the
+                * right page.) Show these as belonging to the left page buffer,
+                * so that they are not stored if XLogInsert decides it needs a
+                * full-page image of the left page.
                 */
-               lastrdata->next = lastrdata + 1;
-               lastrdata++;
                if (newitemonleft)
                {
+                       lastrdata->next = lastrdata + 1;
+                       lastrdata++;
                        lastrdata->data = (char *) &newitemoff;
                        lastrdata->len = sizeof(OffsetNumber);
                        lastrdata->buffer = buf;                /* backup block 1 */
@@ -1042,39 +1049,49 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 
                        lastrdata->next = lastrdata + 1;
                        lastrdata++;
-                       lastrdata->data = (char *)newitem;
-                       lastrdata->len = newitemsz;
+                       lastrdata->data = (char *) newitem;
+                       lastrdata->len = MAXALIGN(newitemsz);
                        lastrdata->buffer = buf;                /* backup block 1 */
                        lastrdata->buffer_std = true;
                }
                else
                {
+                       /*
+                        * Although we don't need to WAL-log the new item, we still
+                        * need XLogInsert to consider storing a full-page image of the
+                        * left page, so make an empty entry referencing that buffer.
+                        * This also ensures that the left page is always backup block 1.
+                        */
+                       lastrdata->next = lastrdata + 1;
+                       lastrdata++;
                        lastrdata->data = NULL;
                        lastrdata->len = 0;
                        lastrdata->buffer = buf;                /* backup block 1 */
                        lastrdata->buffer_std = true;
                }
 
-               /* Log the contents of the right page in the format understood by
+               /*
+                * Log the contents of the right page in the format understood by
                 * _bt_restore_page(). We set lastrdata->buffer to InvalidBuffer,
-                * because we're going to recreate the whole page anyway.
+                * because we're going to recreate the whole page anyway, so it
+                * should never be stored by XLogInsert.
                 *
                 * Direct access to page is not good but faster - we should implement
                 * some new func in page API.  Note we only store the tuples
-                * themselves, knowing that the item pointers are in the same order
-                * and can be reconstructed by scanning the tuples.  See comments for
+                * themselves, knowing that they were inserted in item-number order
+                * and so the item pointers can be reconstructed.  See comments for
                 * _bt_restore_page().
                 */
                lastrdata->next = lastrdata + 1;
                lastrdata++;
 
-               lastrdata->data = (char *) rightpage + 
+               lastrdata->data = (char *) rightpage +
                        ((PageHeader) rightpage)->pd_upper;
                lastrdata->len = ((PageHeader) rightpage)->pd_special -
                        ((PageHeader) rightpage)->pd_upper;
                lastrdata->buffer = InvalidBuffer;
 
-               /* Log the right sibling, because we've changed it's prev-pointer. */
+               /* Log the right sibling, because we've changed its prev-pointer. */
                if (!P_RIGHTMOST(ropaque))
                {
                        lastrdata->next = lastrdata + 1;
@@ -1216,7 +1233,7 @@ _bt_findsplitloc(Relation rel,
        olddataitemstoleft = 0;
        goodenoughfound = false;
        maxoff = PageGetMaxOffsetNumber(page);
-       
+
        for (offnum = P_FIRSTDATAKEY(opaque);
                 offnum <= maxoff;
                 offnum = OffsetNumberNext(offnum))
@@ -1234,7 +1251,7 @@ _bt_findsplitloc(Relation rel,
                                                          olddataitemstoleft, itemsz);
 
                else if (offnum < newitemoff)
-                       _bt_checksplitloc(&state, offnum, false, 
+                       _bt_checksplitloc(&state, offnum, false,
                                                          olddataitemstoleft, itemsz);
                else
                {
@@ -1285,11 +1302,11 @@ _bt_findsplitloc(Relation rel,
  * items go to the left page and only the new item goes to the right page.
  * In that case, firstoldonrightsz is not used.
  *
- * olddataitemstoleft is the total size of all old items to the left of 
- * firstoldonright. 
+ * olddataitemstoleft is the total size of all old items to the left of
+ * firstoldonright.
  */
 static void
-_bt_checksplitloc(FindSplitData *state, 
+_bt_checksplitloc(FindSplitData *state,
                                  OffsetNumber firstoldonright,
                                  bool newitemonleft,
                                  int olddataitemstoleft,
@@ -1311,7 +1328,7 @@ _bt_checksplitloc(FindSplitData *state,
 
        /* Account for all the old tuples */
        leftfree = state->leftspace - olddataitemstoleft;
-       rightfree = state->rightspace - 
+       rightfree = state->rightspace -
                (state->olddataitemstotal - olddataitemstoleft);
 
        /*
@@ -1854,7 +1871,7 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer)
        BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 
        /*
-        * Scan over all items to see which ones need to be deleted 
+        * Scan over all items to see which ones need to be deleted
         * according to LP_DELETE flags.
         */
        minoff = P_FIRSTDATAKEY(opaque);
index dd6fd8571accb5dbdd4c65c92dd713c6e0f63664..ff41be37679523a343aeeca392e8e89e0a047055 100644 (file)
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.42 2007/02/08 05:05:53 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.43 2007/04/11 20:47:38 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -125,7 +125,8 @@ forget_matching_deletion(RelFileNode node, BlockNumber delblk)
  * in correct itemno sequence, but physically the opposite order from the
  * original, because we insert them in the opposite of itemno order.  This
  * does not matter in any current btree code, but it's something to keep an
- * eye on.     Is it worth changing just on general principles?
+ * eye on.     Is it worth changing just on general principles?  See also the
+ * notes in btree_xlog_split().
  */
 static void
 _bt_restore_page(Page page, char *from, int len)
@@ -264,14 +265,12 @@ btree_xlog_split(bool onleft, bool isroot,
 {
        xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record);
        Relation        reln;
-       Buffer          lbuf, rbuf;
-       Page            lpage, rpage;
-       BTPageOpaque ropaque, lopaque;
+       Buffer          rbuf;
+       Page            rpage;
+       BTPageOpaque ropaque;
        char       *datapos;
        int                     datalen;
-       bool            bkp_left = record->xl_info & XLR_BKP_BLOCK_1;
-       bool            bkp_nextsib = record->xl_info & XLR_BKP_BLOCK_2;
-       OffsetNumber newitemoff;
+       OffsetNumber newitemoff = 0;
        Item newitem = NULL;
        Size newitemsz = 0;
 
@@ -283,6 +282,7 @@ btree_xlog_split(bool onleft, bool isroot,
        /* Forget any split this insertion completes */
        if (xlrec->level > 0)
        {
+               /* we assume SizeOfBtreeSplit is at least 16-bit aligned */
                BlockNumber downlink = BlockIdGetBlockNumber((BlockId) datapos);
 
                datapos += sizeof(BlockIdData);
@@ -291,19 +291,22 @@ btree_xlog_split(bool onleft, bool isroot,
                forget_matching_split(xlrec->node, downlink, false);
        }
 
-
-       /* Extract newitem and newitemoff */
-       if (!bkp_left && onleft)
+       /* Extract newitem and newitemoff, if present */
+       if (onleft && !(record->xl_info & XLR_BKP_BLOCK_1))
        {
                IndexTupleData itupdata;
 
-               /* Extract the offset of the new tuple and it's contents */
+               /* Extract the offset (still assuming 16-bit alignment) */
                memcpy(&newitemoff, datapos, sizeof(OffsetNumber));
                datapos += sizeof(OffsetNumber);
                datalen -= sizeof(OffsetNumber);
 
+               /*
+                * We need to copy the tuple header to apply IndexTupleDSize, because
+                * of alignment considerations.  However, we assume that PageAddItem
+                * doesn't care about the alignment of the newitem pointer it's given.
+                */
                newitem = datapos;
-               /* Need to copy tuple header due to alignment considerations */
                memcpy(&itupdata, datapos, sizeof(IndexTupleData));
                newitemsz = IndexTupleDSize(itupdata);
                newitemsz = MAXALIGN(newitemsz);
@@ -311,7 +314,7 @@ btree_xlog_split(bool onleft, bool isroot,
                datalen -= newitemsz;
        }
 
-       /* Reconstruct right (new) sibling */
+       /* Reconstruct right (new) sibling from scratch */
        rbuf = XLogReadBuffer(reln, xlrec->rightsib, true);
        Assert(BufferIsValid(rbuf));
        rpage = (Page) BufferGetPage(rbuf);
@@ -331,57 +334,71 @@ btree_xlog_split(bool onleft, bool isroot,
        PageSetTLI(rpage, ThisTimeLineID);
        MarkBufferDirty(rbuf);
 
-       /* don't release the buffer yet, because reconstructing the left sibling
-        * needs to access the data on the right page 
-        */
-
-
-       /* Reconstruct left (original) sibling */
+       /* don't release the buffer yet; we touch right page's first item below */
 
-       if(!bkp_left)
+       /*
+        * Reconstruct left (original) sibling if needed.  Note that this code
+        * ensures that the items remaining on the left page are in the correct
+        * item number order, but it does not reproduce the physical order they
+        * would have had.  Is this worth changing?  See also _bt_restore_page().
+        */
+       if (!(record->xl_info & XLR_BKP_BLOCK_1))
        {
-               lbuf = XLogReadBuffer(reln, xlrec->leftsib, false);
+               Buffer lbuf = XLogReadBuffer(reln, xlrec->leftsib, false);
 
                if (BufferIsValid(lbuf))
                {
-                       lpage = (Page) BufferGetPage(lbuf);
-                       lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
+                       Page lpage = (Page) BufferGetPage(lbuf);
+                       BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
 
                        if (!XLByteLE(lsn, PageGetLSN(lpage)))
                        {
-                               /* Remove the items from the left page that were copied to
-                                * right page, and add the new item if it was inserted to
-                                * left page.
-                                */
                                OffsetNumber off;
                                OffsetNumber maxoff = PageGetMaxOffsetNumber(lpage);
+                               OffsetNumber deletable[MaxOffsetNumber];
+                               int ndeletable = 0;
                                ItemId hiItemId;
                                Item hiItem;
 
-                               for(off = maxoff ; off >= xlrec->firstright; off--)
-                                       PageIndexTupleDelete(lpage, off);
+                               /*
+                                * Remove the items from the left page that were copied to
+                                * the right page.  Also remove the old high key, if any.
+                                * (We must remove everything before trying to insert any
+                                * items, else we risk not having enough space.)
+                                */
+                               if (!P_RIGHTMOST(lopaque))
+                               {
+                                       deletable[ndeletable++] = P_HIKEY;
+                                       /*
+                                        * newitemoff is given to us relative to the original
+                                        * page's item numbering, so adjust it for this deletion.
+                                        */
+                                       newitemoff--;
+                               }
+                               for (off = xlrec->firstright; off <= maxoff; off++)
+                                       deletable[ndeletable++] = off;
+                               if (ndeletable > 0)
+                                       PageIndexMultiDelete(lpage, deletable, ndeletable);
 
+                               /*
+                                * Add the new item if it was inserted on left page.
+                                */
                                if (onleft)
                                {
-                                       if (PageAddItem(lpage, newitem, newitemsz, newitemoff, 
+                                       if (PageAddItem(lpage, newitem, newitemsz, newitemoff,
                                                                        LP_USED) == InvalidOffsetNumber)
-                                               elog(PANIC, "can't add new item to left sibling after split");
+                                               elog(PANIC, "failed to add new item to left page after split");
                                }
+
                                /* Set high key equal to the first key on the right page */
                                hiItemId = PageGetItemId(rpage, P_FIRSTDATAKEY(ropaque));
                                hiItem = PageGetItem(rpage, hiItemId);
 
-                               if(!P_RIGHTMOST(lopaque))
-                               {
-                                       /* but remove the old high key first */
-                                       PageIndexTupleDelete(lpage, P_HIKEY);
-                               }
-
-                               if(PageAddItem(lpage, hiItem, ItemIdGetLength(hiItemId),
-                                                          P_HIKEY, LP_USED) == InvalidOffsetNumber)
-                                       elog(PANIC, "can't add high key after split to left page");
+                               if (PageAddItem(lpage, hiItem, ItemIdGetLength(hiItemId),
+                                                               P_HIKEY, LP_USED) == InvalidOffsetNumber)
+                                       elog(PANIC, "failed to add high key to left page after split");
 
-                               /* Fix opaque fields */ 
+                               /* Fix opaque fields */
                                lopaque->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0;
                                lopaque->btpo_next = xlrec->rightsib;
                                lopaque->btpo_cycleid = 0;
@@ -393,16 +410,16 @@ btree_xlog_split(bool onleft, bool isroot,
 
                        UnlockReleaseBuffer(lbuf);
                }
-
        }
 
-       /* we no longer need the right buffer. */
+       /* We no longer need the right buffer */
        UnlockReleaseBuffer(rbuf);
 
        /* Fix left-link of the page to the right of the new right sibling */
-       if (!bkp_nextsib && xlrec->rnext != P_NONE)
+       if (xlrec->rnext != P_NONE && !(record->xl_info & XLR_BKP_BLOCK_2))
        {
                Buffer buffer = XLogReadBuffer(reln, xlrec->rnext, false);
+
                if (BufferIsValid(buffer))
                {
                        Page page = (Page) BufferGetPage(buffer);
@@ -410,6 +427,7 @@ btree_xlog_split(bool onleft, bool isroot,
                        if (!XLByteLE(lsn, PageGetLSN(page)))
                        {
                                BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
                                pageop->btpo_prev = xlrec->rightsib;
 
                                PageSetLSN(page, lsn);
@@ -770,48 +788,48 @@ btree_desc(StringInfo buf, uint8 xl_info, char *rec)
                        {
                                xl_btree_split *xlrec = (xl_btree_split *) rec;
 
-                               appendStringInfo(buf, "split_l: rel %u/%u/%u ",  
+                               appendStringInfo(buf, "split_l: rel %u/%u/%u ",
                                                                 xlrec->node.spcNode, xlrec->node.dbNode,
                                                                 xlrec->node.relNode);
-                               appendStringInfo(buf, "left %u, right %u off %u level %u",
-                                                                xlrec->leftsib, xlrec->rightsib, 
-                                                                xlrec->firstright, xlrec->level);
+                               appendStringInfo(buf, "left %u, right %u, next %u, level %u, firstright %d",
+                                                                xlrec->leftsib, xlrec->rightsib, xlrec->rnext,
+                                                                xlrec->level, xlrec->firstright);
                                break;
                        }
                case XLOG_BTREE_SPLIT_R:
                        {
                                xl_btree_split *xlrec = (xl_btree_split *) rec;
 
-                               appendStringInfo(buf, "split_r: rel %u/%u/%u ",  
+                               appendStringInfo(buf, "split_r: rel %u/%u/%u ",
                                                                 xlrec->node.spcNode, xlrec->node.dbNode,
                                                                 xlrec->node.relNode);
-                               appendStringInfo(buf, "left %u, right %u off %u level %u",
-                                                                xlrec->leftsib, xlrec->rightsib, 
-                                                                xlrec->firstright, xlrec->level);
+                               appendStringInfo(buf, "left %u, right %u, next %u, level %u, firstright %d",
+                                                                xlrec->leftsib, xlrec->rightsib, xlrec->rnext,
+                                                                xlrec->level, xlrec->firstright);
                                break;
                        }
                case XLOG_BTREE_SPLIT_L_ROOT:
                        {
                                xl_btree_split *xlrec = (xl_btree_split *) rec;
 
-                               appendStringInfo(buf, "split_l_root: rel %u/%u/%u ",  
+                               appendStringInfo(buf, "split_l_root: rel %u/%u/%u ",
                                                                 xlrec->node.spcNode, xlrec->node.dbNode,
                                                                 xlrec->node.relNode);
-                               appendStringInfo(buf, "left %u, right %u off %u level %u",
-                                                                xlrec->leftsib, xlrec->rightsib, 
-                                                                xlrec->firstright, xlrec->level);
+                               appendStringInfo(buf, "left %u, right %u, next %u, level %u, firstright %d",
+                                                                xlrec->leftsib, xlrec->rightsib, xlrec->rnext,
+                                                                xlrec->level, xlrec->firstright);
                                break;
                        }
                case XLOG_BTREE_SPLIT_R_ROOT:
                        {
                                xl_btree_split *xlrec = (xl_btree_split *) rec;
 
-                               appendStringInfo(buf, "split_r_root: rel %u/%u/%u ",  
+                               appendStringInfo(buf, "split_r_root: rel %u/%u/%u ",
                                                                 xlrec->node.spcNode, xlrec->node.dbNode,
                                                                 xlrec->node.relNode);
-                               appendStringInfo(buf, "left %u, right %u off %u level %u",
-                                                                xlrec->leftsib, xlrec->rightsib, 
-                                                                xlrec->firstright, xlrec->level);
+                               appendStringInfo(buf, "left %u, right %u, next %u, level %u, firstright %d",
+                                                                xlrec->leftsib, xlrec->rightsib, xlrec->rnext,
+                                                                xlrec->level, xlrec->firstright);
                                break;
                        }
                case XLOG_BTREE_DELETE:
index 53e10c80e8dc6885755b2be3484207584c571fef..c1a7d062400f85e0024264645b2128e8d3637fa5 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.112 2007/04/09 22:04:08 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.113 2007/04/11 20:47:38 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -260,17 +260,17 @@ typedef struct xl_btree_insert
 #define SizeOfBtreeInsert      (offsetof(xl_btreetid, tid) + SizeOfIptrData)
 
 /*
- * On insert with split we save items of both left and right siblings
- * and restore content of both pages from log record.  This way takes less
- * xlog space than the normal approach, because if we did it standardly,
+ * On insert with split, we save all the items going into the right sibling
+ * so that we can restore it completely from the log record.  This way takes
+ * less xlog space than the normal approach, because if we did it standardly,
  * XLogInsert would almost always think the right page is new and store its
- * whole page image.
+ * whole page image.  The left page, however, is handled in the normal
+ * incremental-update fashion.
  *
  * Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record.
  * The _L and _R variants indicate whether the inserted tuple went into the
  * left or right split page (and thus, whether newitemoff and the new item
- * are stored or not.
- * page of the split pair).  The _ROOT variants indicate that we are splitting
+ * are stored or not).  The _ROOT variants indicate that we are splitting
  * the root page, and thus that a newroot record rather than an insert or
  * split record should follow. Note that a split record never carries a
  * metapage update --- we'll do that in the parent-level update.
@@ -278,20 +278,25 @@ typedef struct xl_btree_insert
 typedef struct xl_btree_split
 {
        RelFileNode node;
-       BlockNumber leftsib;     /* orig page / new left page */
-       BlockNumber rightsib;    /* new right page */
-       OffsetNumber firstright; /* first item stored on right page */
-       BlockNumber rnext;               /* next/right block pointer */
-       uint32          level;           /* tree level of page being split */
-
-       /* BlockIdData downlink follows if level > 0 */
-       
-       /* OffsetNumber newitemoff follows in the  _L variants. */
-       /* New item follows in the _L variants */
-       /* RIGHT PAGES TUPLES FOLLOW AT THE END */
+       BlockNumber leftsib;            /* orig page / new left page */
+       BlockNumber rightsib;           /* new right page */
+       BlockNumber rnext;                      /* next block (orig page's rightlink) */
+       uint32          level;                  /* tree level of page being split */
+       OffsetNumber firstright;        /* first item moved to right page */
+
+       /*
+        * If level > 0, BlockIdData downlink follows.  (We use BlockIdData
+        * rather than BlockNumber for alignment reasons: SizeOfBtreeSplit
+        * is only 16-bit aligned.)
+        *
+        * In the _L variants, next are OffsetNumber newitemoff and the new item.
+        * (In the _R variants, the new item is one of the right page's tuples.)
+        *
+        * Last are the right page's tuples in the form used by _bt_restore_page.
+        */
 } xl_btree_split;
 
-#define SizeOfBtreeSplit       (offsetof(xl_btree_split, level) + sizeof(uint32))
+#define SizeOfBtreeSplit       (offsetof(xl_btree_split, firstright) + sizeof(OffsetNumber))
 
 /*
  * This is what we need to know about delete of individual leaf index tuples.
index 7b2c9661010b3ef041d169730416d2fe6122617e..e2511988f35475efc707958536482c1a8933700d 100644 (file)
@@ -37,7 +37,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.402 2007/04/09 22:04:08 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.403 2007/04/11 20:47:38 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -53,6 +53,6 @@
  */
 
 /*                                                     yyyymmddN */
-#define CATALOG_VERSION_NO     200704091
+#define CATALOG_VERSION_NO     200704111
 
 #endif