# Makefile for access/nbtree
#
# IDENTIFICATION
-# $Header: /cvsroot/pgsql/src/backend/access/nbtree/Makefile,v 1.11 2001/07/15 22:48:16 tgl Exp $
+# $Header: /cvsroot/pgsql/src/backend/access/nbtree/Makefile,v 1.12 2003/02/21 00:06:21 tgl Exp $
#
#-------------------------------------------------------------------------
include $(top_builddir)/src/Makefile.global
OBJS = nbtcompare.o nbtinsert.o nbtpage.o nbtree.o nbtsearch.o \
- nbtstrat.o nbtutils.o nbtsort.o
+ nbtstrat.o nbtutils.o nbtsort.o nbtxlog.o
all: SUBSYS.o
-$Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.6 2002/10/20 20:47:31 tgl Exp $
+$Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.7 2003/02/21 00:06:21 tgl Exp $
This directory contains a correct implementation of Lehman and Yao's
high-concurrency B-tree management algorithm (P. Lehman and S. Yao,
Efficient Locking for Concurrent Operations on B-Trees, ACM Transactions
-on Database Systems, Vol 6, No. 4, December 1981, pp 650-670).
+on Database Systems, Vol 6, No. 4, December 1981, pp 650-670). We also
+use a simplified version of the deletion logic described in Lanin and
+Shasha (V. Lanin and D. Shasha, A Symmetric Concurrent B-Tree Algorithm,
+Proceedings of 1986 Fall Joint Computer Conference, pp 380-389).
-We have made the following changes in order to incorporate their algorithm
+The Lehman and Yao algorithm and insertions
+-------------------------------------------
+
+We have made the following changes in order to incorporate the L&Y algorithm
into Postgres:
-+ The requirement that all btree keys be unique is too onerous,
- but the algorithm won't work correctly without it. Fortunately, it is
- only necessary that keys be unique on a single tree level, because L&Y
- only use the assumption of key uniqueness when re-finding a key in a
- parent node (to determine where to insert the key for a split page).
- Therefore, we can use the link field to disambiguate multiple
- occurrences of the same user key: only one entry in the parent level
- will be pointing at the page we had split. (Indeed we need not look at
- the real "key" at all, just at the link field.) We can distinguish
- items at the leaf level in the same way, by examining their links to
- heap tuples; we'd never have two items for the same heap tuple.
-
-+ Lehman and Yao assume that the key range for a subtree S is described
- by Ki < v <= Ki+1 where Ki and Ki+1 are the adjacent keys in the parent
- node. This does not work for nonunique keys (for example, if we have
- enough equal keys to spread across several leaf pages, there *must* be
- some equal bounding keys in the first level up). Therefore we assume
- Ki <= v <= Ki+1 instead. A search that finds exact equality to a
- bounding key in an upper tree level must descend to the left of that
- key to ensure it finds any equal keys in the preceding page. An
- insertion that sees the high key of its target page is equal to the key
- to be inserted has a choice whether or not to move right, since the new
- key could go on either page. (Currently, we try to find a page where
- there is room for the new key without a split.)
-
-+ Lehman and Yao don't require read locks, but assume that in-memory
- copies of tree nodes are unshared. Postgres shares in-memory buffers
- among backends. As a result, we do page-level read locking on btree
- nodes in order to guarantee that no record is modified while we are
- examining it. This reduces concurrency but guaranteees correct
- behavior. An advantage is that when trading in a read lock for a
- write lock, we need not re-read the page after getting the write lock.
- Since we're also holding a pin on the shared buffer containing the
- page, we know that buffer still contains the page and is up-to-date.
-
-+ We support the notion of an ordered "scan" of an index as well as
- insertions, deletions, and simple lookups. A scan in the forward
- direction is no problem, we just use the right-sibling pointers that
- L&Y require anyway. (Thus, once we have descended the tree to the
- correct start point for the scan, the scan looks only at leaf pages
- and never at higher tree levels.) To support scans in the backward
- direction, we also store a "left sibling" link much like the "right
- sibling". (This adds an extra step to the L&Y split algorithm: while
- holding the write lock on the page being split, we also lock its former
- right sibling to update that page's left-link. This is safe since no
- writer of that page can be interested in acquiring a write lock on our
- page.) A backwards scan has one additional bit of complexity: after
- following the left-link we must account for the possibility that the
- left sibling page got split before we could read it. So, we have to
- move right until we find a page whose right-link matches the page we
- came from.
-
-+ Read locks on a page are held for as long as a scan is examining a page.
- But nbtree.c arranges to drop the read lock, but not the buffer pin,
- on the current page of a scan before control leaves nbtree. When we
- come back to resume the scan, we have to re-grab the read lock and
- then move right if the current item moved (see _bt_restscan()). Keeping
- the pin ensures that the current item cannot move left or be deleted
- (see btbulkdelete).
-
-+ In most cases we release our lock and pin on a page before attempting
- to acquire pin and lock on the page we are moving to. In a few places
- it is necessary to lock the next page before releasing the current one.
- This is safe when moving right or up, but not when moving left or down
- (else we'd create the possibility of deadlocks).
-
-+ Lehman and Yao fail to discuss what must happen when the root page
- becomes full and must be split. Our implementation is to split the
- root in the same way that any other page would be split, then construct
- a new root page holding pointers to both of the resulting pages (which
- now become siblings on level 2 of the tree). The new root page is then
- installed by altering the root pointer in the meta-data page (see
- below). This works because the root is not treated specially in any
- other way --- in particular, searches will move right using its link
- pointer if the link is set. Therefore, searches will find the data
- that's been moved into the right sibling even if they read the metadata
- page before it got updated. This is the same reasoning that makes a
- split of a non-root page safe. The locking considerations are similar too.
-
-+ Lehman and Yao assume fixed-size keys, but we must deal with
- variable-size keys. Therefore there is not a fixed maximum number of
- keys per page; we just stuff in as many as will fit. When we split a
- page, we try to equalize the number of bytes, not items, assigned to
- each of the resulting pages. Note we must include the incoming item in
- this calculation, otherwise it is possible to find that the incoming
- item doesn't fit on the split page where it needs to go!
-
-In addition, the following things are handy to know:
-
-+ Page zero of every btree is a meta-data page. This page stores
- the location of the root page, a pointer to a list of free
- pages, and other stuff that's handy to know. (Currently, we
- never shrink btree indexes so there are never any free pages.)
-
-+ The algorithm assumes we can fit at least three items per page
- (a "high key" and two real data items). Therefore it's unsafe
- to accept items larger than 1/3rd page size. Larger items would
- work sometimes, but could cause failures later on depending on
- what else gets put on their page.
-
-+ This algorithm doesn't guarantee btree consistency after a kernel crash
- or hardware failure. To do that, we'd need ordered writes, and UNIX
- doesn't support ordered writes (short of fsync'ing every update, which
- is too high a price). Rebuilding corrupted indexes during restart
- seems more attractive.
-
-+ Deletions are handled by getting a super-exclusive lock on the target
- page, so that no other backend has a pin on the page when the deletion
- starts. This means no scan is pointing at the page. This is OK for
- deleting leaf items, probably not OK for deleting internal nodes;
- will need to think harder when it's time to support index compaction.
-
-+ "ScanKey" data structures are used in two fundamentally different ways
- in this code. Searches for the initial position for a scan, as well as
- insertions, use scankeys in which the comparison function is a 3-way
- comparator (<0, =0, >0 result). These scankeys are built within the
- btree code (eg, by _bt_mkscankey()) and used by _bt_compare(). Once we
- are positioned, sequential examination of tuples in a scan is done by
- _bt_checkkeys() using scankeys in which the comparison functions return
- booleans --- for example, int4lt might be used. These scankeys are the
- ones originally passed in from outside the btree code. Same
- representation, but different comparison functions!
-
-Notes about data representation:
-
-+ The right-sibling link required by L&Y is kept in the page "opaque
- data" area, as is the left-sibling link and some flags.
-
-+ We also keep a parent link in the opaque data, but this link is not
- very trustworthy because it is not updated when the parent page splits.
- Thus, it points to some page on the parent level, but possibly a page
- well to the left of the page's actual current parent. In most cases
- we do not need this link at all. Normally we return to a parent page
- using a stack of entries that are made as we descend the tree, as in L&Y.
- There is exactly one case where the stack will not help: concurrent
- root splits. If an inserter process needs to split what had been the
- root when it started its descent, but finds that that page is no longer
- the root (because someone else split it meanwhile), then it uses the
- parent link to move up to the next level. This is OK because we do fix
- the parent link in a former root page when splitting it. This logic
- will work even if the root is split multiple times (even up to creation
- of multiple new levels) before an inserter returns to it. The same
- could not be said of finding the new root via the metapage, since that
- would work only for a single level of added root.
-
-+ The Postgres disk block data format (an array of items) doesn't fit
- Lehman and Yao's alternating-keys-and-pointers notion of a disk page,
- so we have to play some games.
-
-+ On a page that is not rightmost in its tree level, the "high key" is
- kept in the page's first item, and real data items start at item 2.
- The link portion of the "high key" item goes unused. A page that is
- rightmost has no "high key", so data items start with the first item.
- Putting the high key at the left, rather than the right, may seem odd,
- but it avoids moving the high key as we add data items.
-
-+ On a leaf page, the data items are simply links to (TIDs of) tuples
- in the relation being indexed, with the associated key values.
-
-+ On a non-leaf page, the data items are down-links to child pages with
- bounding keys. The key in each data item is the *lower* bound for
- keys on that child page, so logically the key is to the left of that
- downlink. The high key (if present) is the upper bound for the last
- downlink. The first data item on each such page has no lower bound
- --- or lower bound of minus infinity, if you prefer. The comparison
- routines must treat it accordingly. The actual key stored in the
- item is irrelevant, and need not be stored at all. This arrangement
- corresponds to the fact that an L&Y non-leaf page has one more pointer
- than key.
-
-Notes to operator class implementors:
-
-+ With this implementation, we require each supported datatype to supply
- us with a comparison procedure via pg_amproc. This procedure must take
- two nonnull values A and B and return an int32 < 0, 0, or > 0 if A < B,
- A = B, or A > B, respectively. See nbtcompare.c for examples.
+The requirement that all btree keys be unique is too onerous,
+but the algorithm won't work correctly without it. Fortunately, it is
+only necessary that keys be unique on a single tree level, because L&Y
+only use the assumption of key uniqueness when re-finding a key in a
+parent page (to determine where to insert the key for a split page).
+Therefore, we can use the link field to disambiguate multiple
+occurrences of the same user key: only one entry in the parent level
+will be pointing at the page we had split. (Indeed we need not look at
+the real "key" at all, just at the link field.) We can distinguish
+items at the leaf level in the same way, by examining their links to
+heap tuples; we'd never have two items for the same heap tuple.
+
+Lehman and Yao assume that the key range for a subtree S is described
+by Ki < v <= Ki+1 where Ki and Ki+1 are the adjacent keys in the parent
+page. This does not work for nonunique keys (for example, if we have
+enough equal keys to spread across several leaf pages, there *must* be
+some equal bounding keys in the first level up). Therefore we assume
+Ki <= v <= Ki+1 instead. A search that finds exact equality to a
+bounding key in an upper tree level must descend to the left of that
+key to ensure it finds any equal keys in the preceding page. An
+insertion that sees the high key of its target page is equal to the key
+to be inserted has a choice whether or not to move right, since the new
+key could go on either page. (Currently, we try to find a page where
+there is room for the new key without a split.)
+
+Lehman and Yao don't require read locks, but assume that in-memory
+copies of tree pages are unshared. Postgres shares in-memory buffers
+among backends. As a result, we do page-level read locking on btree
+pages in order to guarantee that no record is modified while we are
+examining it.  This reduces concurrency but guarantees correct
+behavior. An advantage is that when trading in a read lock for a
+write lock, we need not re-read the page after getting the write lock.
+Since we're also holding a pin on the shared buffer containing the
+page, we know that buffer still contains the page and is up-to-date.
+
+We support the notion of an ordered "scan" of an index as well as
+insertions, deletions, and simple lookups. A scan in the forward
+direction is no problem, we just use the right-sibling pointers that
+L&Y require anyway. (Thus, once we have descended the tree to the
+correct start point for the scan, the scan looks only at leaf pages
+and never at higher tree levels.) To support scans in the backward
+direction, we also store a "left sibling" link much like the "right
+sibling". (This adds an extra step to the L&Y split algorithm: while
+holding the write lock on the page being split, we also lock its former
+right sibling to update that page's left-link. This is safe since no
+writer of that page can be interested in acquiring a write lock on our
+page.) A backwards scan has one additional bit of complexity: after
+following the left-link we must account for the possibility that the
+left sibling page got split before we could read it. So, we have to
+move right until we find a page whose right-link matches the page we
+came from. (Actually, it's even harder than that; see deletion discussion
+below.)
+
+Read locks on a page are held for as long as a scan is examining a page.
+But nbtree.c arranges to drop the read lock, but not the buffer pin,
+on the current page of a scan before control leaves nbtree. When we
+come back to resume the scan, we have to re-grab the read lock and
+then move right if the current item moved (see _bt_restscan()). Keeping
+the pin ensures that the current item cannot move left or be deleted
+(see btbulkdelete).
+
+In most cases we release our lock and pin on a page before attempting
+to acquire pin and lock on the page we are moving to. In a few places
+it is necessary to lock the next page before releasing the current one.
+This is safe when moving right or up, but not when moving left or down
+(else we'd create the possibility of deadlocks).
+
+Lehman and Yao fail to discuss what must happen when the root page
+becomes full and must be split. Our implementation is to split the
+root in the same way that any other page would be split, then construct
+a new root page holding pointers to both of the resulting pages (which
+now become siblings on the next level of the tree). The new root page
+is then installed by altering the root pointer in the meta-data page (see
+below). This works because the root is not treated specially in any
+other way --- in particular, searches will move right using its link
+pointer if the link is set. Therefore, searches will find the data
+that's been moved into the right sibling even if they read the meta-data
+page before it got updated. This is the same reasoning that makes a
+split of a non-root page safe. The locking considerations are similar too.
+
+When an inserter recurses up the tree, splitting internal pages to insert
+links to pages inserted on the level below, it is possible that it will
+need to access a page above the level that was the root when it began its
+descent (or more accurately, the level that was the root when it read the
+meta-data page). In this case the stack it made while descending does not
+help for finding the correct page. When this happens, we find the correct
+place by re-descending the tree until we reach the level one above the
+level we need to insert a link to, and then moving right as necessary.
+(Typically this will take only two fetches, the meta-data page and the new
+root, but in principle there could have been more than one root split
+since we saw the root. We can identify the correct tree level by means of
+the level numbers stored in each page. The situation is rare enough that
+we do not need a more efficient solution.)
+
+Lehman and Yao assume fixed-size keys, but we must deal with
+variable-size keys. Therefore there is not a fixed maximum number of
+keys per page; we just stuff in as many as will fit. When we split a
+page, we try to equalize the number of bytes, not items, assigned to
+each of the resulting pages. Note we must include the incoming item in
+this calculation, otherwise it is possible to find that the incoming
+item doesn't fit on the split page where it needs to go!
+
+The deletion algorithm
+----------------------
+
+Deletions of leaf items are handled by getting a super-exclusive lock on
+the target page, so that no other backend has a pin on the page when the
+deletion starts. This means no scan is pointing at the page, so no other
+backend can lose its place due to the item deletion.
+
+The above does not work for deletion of items in internal pages, since
+other backends keep no lock nor pin on a page they have descended past.
+Instead, when a backend is ascending the tree using its stack, it must
+be prepared for the possibility that the item it wants is to the left of
+the recorded position (but it can't have moved left out of the recorded
+page). Since we hold a lock on the lower page (per L&Y) until we have
+re-found the parent item that links to it, we can be assured that the
+parent item does still exist and can't have been deleted. Also, because
+we are matching downlink page numbers and not data keys, we don't have any
+problem with possibly misidentifying the parent item.
+
+We consider deleting an entire page from the btree only when it's become
+completely empty of items. (Merging partly-full pages would allow better
+space reuse, but it seems impractical to move existing data items left or
+right to make this happen --- a scan moving in the opposite direction
+might miss the items if so. We could do it during VACUUM FULL, though.)
+Also, we *never* delete the rightmost page on a tree level (this
+restriction simplifies the traversal algorithms, as explained below).
+
+To delete an empty page, we acquire write lock on its left sibling (if
+any), the target page itself, the right sibling (there must be one), and
+the parent page, in that order. The parent page must be found using the
+same type of search as used to find the parent during an insertion split.
+Then we update the side-links in the siblings, mark the target page
+deleted, and remove the downlink from the parent, as well as the parent's
+upper bounding key for the target (the one separating it from its right
+sibling). This causes the target page's key space to effectively belong
+to its right sibling. (Neither the left nor right sibling pages need to
+change their "high key" if any; so there is no problem with possibly not
+having enough space to replace a high key.) The side-links in the target
+page are not changed.
+
+(Note: Lanin and Shasha prefer to make the key space move left, but their
+argument for doing so hinges on not having left-links, which we have
+anyway. So we simplify the algorithm by moving key space right.)
+
+To preserve consistency on the parent level, we cannot merge the key space
+of a page into its right sibling unless the right sibling is a child of
+the same parent --- otherwise, the parent's key space assignment changes
+too, meaning we'd have to make bounding-key updates in its parent, and
+perhaps all the way up the tree. Since we can't possibly do that
+atomically, we forbid this case. That means that the rightmost child of a
+parent node can't be deleted unless it's the only remaining child.
+
+When we delete the last remaining child of a parent page, we mark the
+parent page "half-dead" as part of the atomic update that deletes the
+child page. This implicitly transfers the parent's key space to its right
+sibling (which it must have, since we never delete the overall-rightmost
+page of a level). No future insertions into the parent level are allowed
+to insert keys into the half-dead page --- they must move right to its
+sibling, instead. The parent remains empty and can be deleted in a
+separate atomic action. (However, if it's the rightmost child of its own
+parent, it might have to stay half-dead for awhile, until it's also the
+only child.)
+
+Note that an empty leaf page is a valid tree state, but an empty interior
+page is not legal (an interior page must have children to delegate its
+key space to). So an interior page *must* be marked half-dead as soon
+as its last child is deleted.
+
+The notion of a half-dead page means that the key space relationship between
+the half-dead page's level and its parent's level may be a little out of
+whack: key space that appears to belong to the half-dead page's parent on the
+parent level may really belong to its right sibling. We can tolerate this,
+however, because insertions and deletions on upper tree levels are always
+done by reference to child page numbers, not keys. The only cost is that
+searches may sometimes descend to the half-dead page and then have to move
+right, rather than going directly to the sibling page.
+
+A deleted page cannot be reclaimed immediately, since there may be other
+processes waiting to reference it (ie, search processes that just left the
+parent, or scans moving right or left from one of the siblings). These
+processes must observe that the page is marked dead and recover
+accordingly. Searches and forward scans simply follow the right-link
+until they find a non-dead page --- this will be where the deleted page's
+key-space moved to.
+
+Stepping left in a backward scan is complicated because we must consider
+the possibility that the left sibling was just split (meaning we must find
+the rightmost page derived from the left sibling), plus the possibility
+that the page we were just on has now been deleted and hence isn't in the
+sibling chain at all anymore. So the move-left algorithm becomes:
+0. Remember the page we are on as the "original page".
+1. Follow the original page's left-link (we're done if this is zero).
+2. If the current page is live and its right-link matches the "original
+ page", we are done.
+3. Otherwise, move right one or more times looking for a live page whose
+ right-link matches the "original page". If found, we are done. (In
+ principle we could scan all the way to the right end of the index, but
+ in practice it seems better to give up after a small number of tries.
+ It's unlikely the original page's sibling split more than a few times
+ while we were in flight to it; if we do not find a matching link in a
+ few tries, then most likely the original page is deleted.)
+4. Return to the "original page". If it is still live, return to step 1
+ (we guessed wrong about it being deleted, and should restart with its
+ current left-link). If it is dead, move right until a non-dead page
+ is found (there must be one, since rightmost pages are never deleted),
+ mark that as the new "original page", and return to step 1.
+This algorithm is correct because the live page found by step 4 will have
+the same left keyspace boundary as the page we started from. Therefore,
+when we ultimately exit, it must be on a page whose right keyspace
+boundary matches the left boundary of where we started --- which is what
+we need to be sure we don't miss or re-scan any items.
+
+A deleted page can only be reclaimed once there is no scan or search that
+has a reference to it; until then, it must stay in place with its
+right-link undisturbed. We implement this by waiting until all
+transactions that were running at the time of deletion are dead; which is
+overly strong, but is simple to implement within Postgres. When marked
+dead, a deleted page is labeled with the next-transaction counter value.
+VACUUM can reclaim the page for re-use when this transaction number is
+older than the oldest open transaction. (NOTE: VACUUM FULL can reclaim
+such pages immediately.)
+
+Reclaiming a page doesn't actually change its state on disk --- we simply
+record it in the shared-memory free space map, from which it will be
+handed out the next time a new page is needed for a page split. The
+deleted page's contents will be overwritten by the split operation.
+(Note: if we find a deleted page with an extremely old transaction
+number, it'd be worthwhile to re-mark it with FrozenTransactionId so that
+a later xid wraparound can't cause us to think the page is unreclaimable.
+But in more normal situations this would be a waste of a disk write.)
+
+Because we never delete the rightmost page of any level (and in particular
+never delete the root), it's impossible for the height of the tree to
+decrease. After massive deletions we might have a scenario in which the
+tree is "skinny", with several single-page levels below the root.
+Operations will still be correct in this case, but we'd waste cycles
+descending through the single-page levels. To handle this we use an idea
+from Lanin and Shasha: we keep track of the "fast root" level, which is
+the lowest single-page level. The meta-data page keeps a pointer to this
+level as well as the true root. All ordinary operations initiate their
+searches at the fast root not the true root. When we split a page that is
+alone on its level or delete the next-to-last page on a level (both cases
+are easily detected), we have to make sure that the fast root pointer is
+adjusted appropriately. In the split case, we do this work as part of the
+atomic update for the insertion into the parent level; in the delete case
+as part of the atomic update for the delete (either way, the metapage has
+to be the last page locked in the update to avoid deadlock risks). This
+avoids race conditions if two such operations are executing concurrently.
+
+VACUUM needs to do a linear scan of an index to search for empty leaf
+pages and half-dead parent pages that can be deleted, as well as deleted
+pages that can be reclaimed because they are older than all open
+transactions.
+
+WAL considerations
+------------------
+
+The insertion and deletion algorithms in themselves don't guarantee btree
+consistency after a crash. To provide robustness, we depend on WAL
+replay. A single WAL entry is effectively an atomic action --- we can
+redo it from the log if it fails to complete.
+
+Ordinary item insertions (that don't force a page split) are of course
+single WAL entries, since they only affect one page. The same for
+leaf-item deletions (if the deletion brings the leaf page to zero items,
+it is now a candidate to be deleted, but that is a separate action).
+
+An insertion that causes a page split is logged as a single WAL entry for
+the changes occurring on the insertion's level --- including update of the
+right sibling's left-link --- followed by a second WAL entry for the
+insertion on the parent level (which might itself be a page split, requiring
+an additional insertion above that, etc).
+
+For a root split, the follow-on WAL entry is a "new root" entry rather than
+an "insertion" entry, but details are otherwise much the same.
+
+Because insertion involves multiple atomic actions, the WAL replay logic
+has to detect the case where a page split isn't followed by a matching
+insertion on the parent level, and then do that insertion on its own (and
+recursively for any subsequent parent insertion, of course). This is
+feasible because the WAL entry for the split contains enough info to know
+what must be inserted in the parent level.
+
+When splitting a non-root page that is alone on its level, the required
+metapage update (of the "fast root" link) is performed and logged as part
+of the insertion into the parent level. When splitting the root page, the
+metapage update is handled as part of the "new root" action.
+
+A page deletion is logged as a single WAL entry covering all four
+required page updates (target page, left and right siblings, and parent)
+as an atomic event. (Any required fast-root link update is also part
+of the WAL entry.) If the parent page becomes half-dead but is not
+immediately deleted due to a subsequent crash, there is no loss of
+consistency, and the empty page will be picked up by the next VACUUM.
+
+Other things that are handy to know
+-----------------------------------
+
+Page zero of every btree is a meta-data page. This page stores the
+location of the root page --- both the true root and the current effective
+root ("fast" root).
+
+The algorithm assumes we can fit at least three items per page
+(a "high key" and two real data items). Therefore it's unsafe
+to accept items larger than 1/3rd page size. Larger items would
+work sometimes, but could cause failures later on depending on
+what else gets put on their page.
+
+"ScanKey" data structures are used in two fundamentally different ways
+in this code. Searches for the initial position for a scan, as well as
+insertions, use scankeys in which the comparison function is a 3-way
+comparator (<0, =0, >0 result). These scankeys are built within the
+btree code (eg, by _bt_mkscankey()) and used by _bt_compare(). Once we
+are positioned, sequential examination of tuples in a scan is done by
+_bt_checkkeys() using scankeys in which the comparison functions return
+booleans --- for example, int4lt might be used. These scankeys are the
+ones originally passed in from outside the btree code. Same
+representation, but different comparison functions!
+
+Notes about data representation
+-------------------------------
+
+The right-sibling link required by L&Y is kept in the page "opaque
+data" area, as is the left-sibling link, the page level, and some flags.
+The page level counts upwards from zero at the leaf level, to the tree
+depth minus 1 at the root. (Counting up from the leaves ensures that we
+don't need to renumber any existing pages when splitting the root.)
+
+The Postgres disk block data format (an array of items) doesn't fit
+Lehman and Yao's alternating-keys-and-pointers notion of a disk page,
+so we have to play some games.
+
+On a page that is not rightmost in its tree level, the "high key" is
+kept in the page's first item, and real data items start at item 2.
+The link portion of the "high key" item goes unused. A page that is
+rightmost has no "high key", so data items start with the first item.
+Putting the high key at the left, rather than the right, may seem odd,
+but it avoids moving the high key as we add data items.
+
+On a leaf page, the data items are simply links to (TIDs of) tuples
+in the relation being indexed, with the associated key values.
+
+On a non-leaf page, the data items are down-links to child pages with
+bounding keys. The key in each data item is the *lower* bound for
+keys on that child page, so logically the key is to the left of that
+downlink. The high key (if present) is the upper bound for the last
+downlink. The first data item on each such page has no lower bound
+--- or lower bound of minus infinity, if you prefer. The comparison
+routines must treat it accordingly. The actual key stored in the
+item is irrelevant, and need not be stored at all. This arrangement
+corresponds to the fact that an L&Y non-leaf page has one more pointer
+than key.
+
+Notes to operator class implementors
+------------------------------------
+
+With this implementation, we require each supported datatype to supply
+us with a comparison procedure via pg_amproc. This procedure must take
+two nonnull values A and B and return an int32 < 0, 0, or > 0 if A < B,
+A = B, or A > B, respectively. See nbtcompare.c for examples.
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.96 2002/09/04 20:31:09 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.97 2003/02/21 00:06:21 tgl Exp $
*
*-------------------------------------------------------------------------
*/
int best_delta; /* best size delta so far */
} FindSplitData;
-extern bool FixBTree;
-
-Buffer _bt_fixroot(Relation rel, Buffer oldrootbuf, bool release);
-static void _bt_fixtree(Relation rel, BlockNumber blkno);
-static void _bt_fixbranch(Relation rel, BlockNumber lblkno,
- BlockNumber rblkno, BTStack true_stack);
-static void _bt_fixlevel(Relation rel, Buffer buf, BlockNumber limit);
-static void _bt_fixup(Relation rel, Buffer buf);
-static OffsetNumber _bt_getoff(Page page, BlockNumber blkno);
static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
BTStack stack,
int keysz, ScanKey scankey,
BTItem btitem,
- OffsetNumber afteritem);
-static void _bt_insertuple(Relation rel, Buffer buf,
- Size itemsz, BTItem btitem, OffsetNumber newitemoff);
+ OffsetNumber afteritem,
+ bool split_only_page);
static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
OffsetNumber newitemoff, Size newitemsz,
BTItem newitem, bool newitemonleft,
}
/* do the insertion */
- res = _bt_insertonpg(rel, buf, stack, natts, itup_scankey, btitem, 0);
+ res = _bt_insertonpg(rel, buf, stack, natts, itup_scankey, btitem,
+ 0, false);
/* be tidy */
_bt_freestack(stack);
* right using information stored in the parent stack).
* + invokes itself with the appropriate tuple for the right
* child page on the parent.
+ * + updates the metapage if a true root or fast root is split.
*
* On entry, we must have the right buffer on which to do the
* insertion, and the buffer must be pinned and locked. On return,
int keysz,
ScanKey scankey,
BTItem btitem,
- OffsetNumber afteritem)
+ OffsetNumber afteritem,
+ bool split_only_page)
{
InsertIndexResult res;
Page page;
*/
if (PageGetFreeSpace(page) < itemsz)
{
- Buffer rbuf;
- BlockNumber bknum = BufferGetBlockNumber(buf);
- BlockNumber rbknum;
bool is_root = P_ISROOT(lpageop);
+ bool is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(lpageop);
bool newitemonleft;
+ Buffer rbuf;
/* Choose the split point */
firstright = _bt_findsplitloc(rel, page,
* locks for the child pages until we locate the parent, but we can
* release them before doing the actual insertion (see Lehman and Yao
* for the reasoning).
- *
- * Here we have to do something Lehman and Yao don't talk about:
- * deal with a root split and construction of a new root. If our
- * stack is empty then we have just split a node on what had been
- * the root level when we descended the tree. If it is still the
- * root then we perform a new-root construction. If it *wasn't*
- * the root anymore, use the parent pointer to get up to the root
- * level that someone constructed meanwhile, and find the right
- * place to insert as for the normal case.
*----------
*/
+ _bt_insert_parent(rel, buf, rbuf, stack, is_root, is_only);
+ }
+ else
+ {
+ Buffer metabuf = InvalidBuffer;
+ Page metapg = NULL;
+ BTMetaPageData *metad = NULL;
- if (is_root)
- {
- Buffer rootbuf;
-
- Assert(stack == (BTStack) NULL);
- /* create a new root node and release the split buffers */
- rootbuf = _bt_newroot(rel, buf, rbuf);
- _bt_wrtbuf(rel, rootbuf);
- _bt_wrtbuf(rel, rbuf);
- _bt_wrtbuf(rel, buf);
- }
- else
+ itup_off = newitemoff;
+ itup_blkno = BufferGetBlockNumber(buf);
+
+ /*
+ * If we are doing this insert because we split a page that was
+ * the only one on its tree level, but was not the root, it may
+ * have been the "fast root". We need to ensure that the fast root
+ * link points at or above the current page. We can safely acquire
+ * a lock on the metapage here --- see comments for _bt_newroot().
+ */
+ if (split_only_page)
{
- InsertIndexResult newres;
- BTItem new_item;
- BTStackData fakestack;
- BTItem ritem;
- Buffer pbuf;
-
- /* If root page was splitted */
- if (stack == (BTStack) NULL)
- {
- elog(LOG, "btree: concurrent ROOT page split");
-
- /*
- * If root page splitter failed to create new root page
- * then old root' btpo_parent still points to metapage. We
- * have to fix root page in this case.
- */
- if (BTreeInvalidParent(lpageop))
- {
- if (!FixBTree)
- elog(ERROR, "bt_insertonpg[%s]: no root page found", RelationGetRelationName(rel));
- _bt_wrtbuf(rel, rbuf);
- _bt_wrtnorelbuf(rel, buf);
- elog(WARNING, "bt_insertonpg[%s]: root page unfound - fixing upper levels", RelationGetRelationName(rel));
- _bt_fixup(rel, buf);
- goto formres;
- }
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
+ metapg = BufferGetPage(metabuf);
+ metad = BTPageGetMeta(metapg);
- /*
- * Set up a phony stack entry if we haven't got a real one
- */
- stack = &fakestack;
- stack->bts_blkno = lpageop->btpo_parent;
- stack->bts_offset = InvalidOffsetNumber;
- /* bts_btitem will be initialized below */
- stack->bts_parent = NULL;
+ if (metad->btm_fastlevel >= lpageop->btpo.level)
+ {
+ /* no update wanted */
+ _bt_relbuf(rel, metabuf);
+ metabuf = InvalidBuffer;
}
+ }
- /* get high key from left page == lowest key on new right page */
- ritem = (BTItem) PageGetItem(page,
- PageGetItemId(page, P_HIKEY));
+ /* Do the actual update. No elog(ERROR) until changes are logged */
+ START_CRIT_SECTION();
- /* form an index tuple that points at the new right page */
- new_item = _bt_formitem(&(ritem->bti_itup));
- rbknum = BufferGetBlockNumber(rbuf);
- ItemPointerSet(&(new_item->bti_itup.t_tid), rbknum, P_HIKEY);
+ _bt_pgaddtup(rel, page, itemsz, btitem, newitemoff, "page");
- /*
- * Find the parent buffer and get the parent page.
- *
- * Oops - if we were moved right then we need to change stack
- * item! We want to find parent pointing to where we are,
- * right ? - vadim 05/27/97
- *
- * Interestingly, this means we didn't *really* need to stack the
- * parent key at all; all we really care about is the saved
- * block and offset as a starting point for our search...
- */
- ItemPointerSet(&(stack->bts_btitem.bti_itup.t_tid),
- bknum, P_HIKEY);
+ if (BufferIsValid(metabuf))
+ {
+ metad->btm_fastroot = itup_blkno;
+ metad->btm_fastlevel = lpageop->btpo.level;
+ }
- pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
+ /* XLOG stuff */
+ if (!rel->rd_istemp)
+ {
+ xl_btree_insert xlrec;
+ xl_btree_metadata xlmeta;
+ uint8 xlinfo;
+ XLogRecPtr recptr;
+ XLogRecData rdata[3];
+ XLogRecData *nextrdata;
+ BTItemData truncitem;
+
+ xlrec.target.node = rel->rd_node;
+ ItemPointerSet(&(xlrec.target.tid), itup_blkno, itup_off);
+
+ rdata[0].buffer = InvalidBuffer;
+ rdata[0].data = (char *) &xlrec;
+ rdata[0].len = SizeOfBtreeInsert;
+ rdata[0].next = nextrdata = &(rdata[1]);
+
+ if (BufferIsValid(metabuf))
+ {
+ xlmeta.root = metad->btm_root;
+ xlmeta.level = metad->btm_level;
+ xlmeta.fastroot = metad->btm_fastroot;
+ xlmeta.fastlevel = metad->btm_fastlevel;
+
+ nextrdata->buffer = InvalidBuffer;
+ nextrdata->data = (char *) &xlmeta;
+ nextrdata->len = sizeof(xl_btree_metadata);
+ nextrdata->next = nextrdata + 1;
+ nextrdata++;
+ xlinfo = XLOG_BTREE_INSERT_META;
+ }
+ else if (P_ISLEAF(lpageop))
+ xlinfo = XLOG_BTREE_INSERT_LEAF;
+ else
+ xlinfo = XLOG_BTREE_INSERT_UPPER;
- /* Now we can write and unlock the children */
- _bt_wrtbuf(rel, rbuf);
- _bt_wrtbuf(rel, buf);
+ /* Read comments in _bt_pgaddtup */
+ if (!P_ISLEAF(lpageop) && newitemoff == P_FIRSTDATAKEY(lpageop))
+ {
+ truncitem = *btitem;
+ truncitem.bti_itup.t_info = sizeof(BTItemData);
+ nextrdata->data = (char *) &truncitem;
+ nextrdata->len = sizeof(BTItemData);
+ }
+ else
+ {
+ nextrdata->data = (char *) btitem;
+ nextrdata->len = IndexTupleDSize(btitem->bti_itup) +
+ (sizeof(BTItemData) - sizeof(IndexTupleData));
+ }
+ nextrdata->buffer = buf;
+ nextrdata->next = NULL;
+
+ recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata);
- if (pbuf == InvalidBuffer)
+ if (BufferIsValid(metabuf))
{
- if (!FixBTree)
- elog(ERROR, "_bt_getstackbuf: my bits moved right off the end of the world!"
- "\n\tRecreate index %s.", RelationGetRelationName(rel));
- pfree(new_item);
- elog(WARNING, "bt_insertonpg[%s]: parent page unfound - fixing branch", RelationGetRelationName(rel));
- _bt_fixbranch(rel, bknum, rbknum, stack);
- goto formres;
+ PageSetLSN(metapg, recptr);
+ PageSetSUI(metapg, ThisStartUpID);
}
- /* Recursively update the parent */
- newres = _bt_insertonpg(rel, pbuf, stack->bts_parent,
- 0, NULL, new_item, stack->bts_offset);
- /* be tidy */
- pfree(newres);
- pfree(new_item);
+ PageSetLSN(page, recptr);
+ PageSetSUI(page, ThisStartUpID);
}
- }
- else
- {
- itup_off = newitemoff;
- itup_blkno = BufferGetBlockNumber(buf);
- _bt_insertuple(rel, buf, itemsz, btitem, newitemoff);
+ END_CRIT_SECTION();
/* Write out the updated page and release pin/lock */
+ if (BufferIsValid(metabuf))
+ _bt_wrtbuf(rel, metabuf);
+
_bt_wrtbuf(rel, buf);
}
-formres:;
/* by here, the new tuple is inserted at itup_blkno/itup_off */
res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));
ItemPointerSet(&(res->pointerData), itup_blkno, itup_off);
return res;
}
-static void
-_bt_insertuple(Relation rel, Buffer buf,
- Size itemsz, BTItem btitem, OffsetNumber newitemoff)
-{
- Page page = BufferGetPage(buf);
- BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);
-
- START_CRIT_SECTION();
-
- _bt_pgaddtup(rel, page, itemsz, btitem, newitemoff, "page");
-
- /* XLOG stuff */
- if (!rel->rd_istemp)
- {
- xl_btree_insert xlrec;
- uint8 flag = XLOG_BTREE_INSERT;
- XLogRecPtr recptr;
- XLogRecData rdata[2];
- BTItemData truncitem;
-
- xlrec.target.node = rel->rd_node;
- ItemPointerSet(&(xlrec.target.tid), BufferGetBlockNumber(buf), newitemoff);
- rdata[0].buffer = InvalidBuffer;
- rdata[0].data = (char *) &xlrec;
- rdata[0].len = SizeOfBtreeInsert;
- rdata[0].next = &(rdata[1]);
-
- /* Read comments in _bt_pgaddtup */
- if (!(P_ISLEAF(pageop)) && newitemoff == P_FIRSTDATAKEY(pageop))
- {
- truncitem = *btitem;
- truncitem.bti_itup.t_info = sizeof(BTItemData);
- rdata[1].data = (char *) &truncitem;
- rdata[1].len = sizeof(BTItemData);
- }
- else
- {
- rdata[1].data = (char *) btitem;
- rdata[1].len = IndexTupleDSize(btitem->bti_itup) +
- (sizeof(BTItemData) - sizeof(IndexTupleData));
- }
- rdata[1].buffer = buf;
- rdata[1].next = NULL;
- if (P_ISLEAF(pageop))
- flag |= XLOG_BTREE_LEAF;
-
- recptr = XLogInsert(RM_BTREE_ID, flag, rdata);
-
- PageSetLSN(page, recptr);
- PageSetSUI(page, ThisStartUpID);
- }
-
- END_CRIT_SECTION();
-}
-
/*
* _bt_split() -- split a page in the btree.
*
lopaque->btpo_next = BufferGetBlockNumber(rbuf);
ropaque->btpo_prev = BufferGetBlockNumber(buf);
ropaque->btpo_next = oopaque->btpo_next;
-
- /*
- * Must copy the original parent link into both new pages, even though
- * it might be quite obsolete by now. We might need it if this level
- * is or recently was the root (see README).
- */
- lopaque->btpo_parent = ropaque->btpo_parent = oopaque->btpo_parent;
+ lopaque->btpo.level = ropaque->btpo.level = oopaque->btpo.level;
/*
* If the page we're splitting is not the rightmost page at its level
if (!rel->rd_istemp)
{
xl_btree_split xlrec;
- int flag = (newitemonleft) ?
- XLOG_BTREE_SPLEFT : XLOG_BTREE_SPLIT;
- BlockNumber blkno;
+ uint8 xlinfo;
XLogRecPtr recptr;
XLogRecData rdata[4];
xlrec.target.node = rel->rd_node;
ItemPointerSet(&(xlrec.target.tid), *itup_blkno, *itup_off);
if (newitemonleft)
- {
- blkno = BufferGetBlockNumber(rbuf);
- BlockIdSet(&(xlrec.otherblk), blkno);
- }
+ xlrec.otherblk = BufferGetBlockNumber(rbuf);
else
- {
- blkno = BufferGetBlockNumber(buf);
- BlockIdSet(&(xlrec.otherblk), blkno);
- }
- BlockIdSet(&(xlrec.parentblk), lopaque->btpo_parent);
- BlockIdSet(&(xlrec.leftblk), lopaque->btpo_prev);
- BlockIdSet(&(xlrec.rightblk), ropaque->btpo_next);
+ xlrec.otherblk = BufferGetBlockNumber(buf);
+ xlrec.leftblk = lopaque->btpo_prev;
+ xlrec.rightblk = ropaque->btpo_next;
+ xlrec.level = lopaque->btpo.level;
/*
* Direct access to page is not good but faster - we should
- * implement some new func in page API.
+ * implement some new func in page API. Note we only store the
+ * tuples themselves, knowing that the item pointers are in the
+ * same order and can be reconstructed by scanning the tuples.
*/
xlrec.leftlen = ((PageHeader) leftpage)->pd_special -
((PageHeader) leftpage)->pd_upper;
+
rdata[0].buffer = InvalidBuffer;
rdata[0].data = (char *) &xlrec;
rdata[0].len = SizeOfBtreeSplit;
rdata[3].next = NULL;
}
- if (P_ISLEAF(lopaque))
- flag |= XLOG_BTREE_LEAF;
+ if (P_ISROOT(oopaque))
+ xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L_ROOT : XLOG_BTREE_SPLIT_R_ROOT;
+ else
+ xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
- recptr = XLogInsert(RM_BTREE_ID, flag, rdata);
+ recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata);
PageSetLSN(leftpage, recptr);
PageSetSUI(leftpage, ThisStartUpID);
}
}
+/*
+ * _bt_insert_parent() -- Insert downlink into parent after a page split.
+ *
+ * On entry, buf and rbuf are the left and right split pages, which we
+ * still hold write locks on per the L&Y algorithm. We release the
+ * write locks once we have write lock on the parent page. (Any sooner,
+ * and it'd be possible for some other process to try to split or delete
+ * one of these pages, and get confused because it cannot find the downlink.)
+ *
+ * stack - stack showing how we got here. May be NULL in cases that don't
+ * have to be efficient (concurrent ROOT split, WAL recovery)
+ * is_root - we split the true root
+ * is_only - we split a page alone on its level (might have been fast root)
+ *
+ * This is exported so it can be called by nbtxlog.c.
+ */
+void
+_bt_insert_parent(Relation rel,
+ Buffer buf,
+ Buffer rbuf,
+ BTStack stack,
+ bool is_root,
+ bool is_only)
+{
+ /*
+ * Here we have to do something Lehman and Yao don't talk about:
+ * deal with a root split and construction of a new root. If our
+ * stack is empty then we have just split a node on what had been
+ * the root level when we descended the tree. If it was still the
+ * root then we perform a new-root construction. If it *wasn't*
+ * the root anymore, search to find the next higher level that
+ * someone constructed meanwhile, and find the right place to insert
+ * as for the normal case.
+ *
+ * If we have to search for the parent level, we do so by
+ * re-descending from the root. This is not super-efficient,
+ * but it's rare enough not to matter. (This path is also taken
+ * when called from WAL recovery --- we have no stack in that case.)
+ */
+ if (is_root)
+ {
+ Buffer rootbuf;
+
+ Assert(stack == (BTStack) NULL);
+ Assert(is_only);
+ /* create a new root node and update the metapage */
+ rootbuf = _bt_newroot(rel, buf, rbuf);
+ /* release the split buffers */
+ _bt_wrtbuf(rel, rootbuf);
+ _bt_wrtbuf(rel, rbuf);
+ _bt_wrtbuf(rel, buf);
+ }
+ else
+ {
+ BlockNumber bknum = BufferGetBlockNumber(buf);
+ BlockNumber rbknum = BufferGetBlockNumber(rbuf);
+ Page page = BufferGetPage(buf);
+ InsertIndexResult newres;
+ BTItem new_item;
+ BTStackData fakestack;
+ BTItem ritem;
+ Buffer pbuf;
+
+ if (stack == (BTStack) NULL)
+ {
+ BTPageOpaque lpageop;
+
+ if (!InRecovery)
+ elog(DEBUG1, "_bt_insert_parent: concurrent ROOT page split");
+ lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+ /* Find the leftmost page at the next level up */
+ pbuf = _bt_get_endpoint(rel, lpageop->btpo.level + 1, false);
+ /* Set up a phony stack entry pointing there */
+ stack = &fakestack;
+ stack->bts_blkno = BufferGetBlockNumber(pbuf);
+ stack->bts_offset = InvalidOffsetNumber;
+ /* bts_btitem will be initialized below */
+ stack->bts_parent = NULL;
+ _bt_relbuf(rel, pbuf);
+ }
+
+ /* get high key from left page == lowest key on new right page */
+ ritem = (BTItem) PageGetItem(page,
+ PageGetItemId(page, P_HIKEY));
+
+ /* form an index tuple that points at the new right page */
+ new_item = _bt_formitem(&(ritem->bti_itup));
+ ItemPointerSet(&(new_item->bti_itup.t_tid), rbknum, P_HIKEY);
+
+ /*
+ * Find the parent buffer and get the parent page.
+ *
+ * Oops - if we were moved right then we need to change stack
+ * item! We want to find parent pointing to where we are,
+ * right ? - vadim 05/27/97
+ */
+ ItemPointerSet(&(stack->bts_btitem.bti_itup.t_tid),
+ bknum, P_HIKEY);
+
+ pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
+
+ /* Now we can write and unlock the children */
+ _bt_wrtbuf(rel, rbuf);
+ _bt_wrtbuf(rel, buf);
+
+ /* Check for error only after writing children */
+ if (pbuf == InvalidBuffer)
+ elog(ERROR, "_bt_getstackbuf: my bits moved right off the end of the world!"
+ "\n\tRecreate index %s.", RelationGetRelationName(rel));
+
+ /* Recursively update the parent */
+ newres = _bt_insertonpg(rel, pbuf, stack->bts_parent,
+ 0, NULL, new_item, stack->bts_offset,
+ is_only);
+
+ /* be tidy */
+ pfree(newres);
+ pfree(new_item);
+ }
+}
+
/*
* _bt_getstackbuf() -- Walk back up the tree one step, and find the item
* we last looked at in the parent.
*
- * This is possible because we save a bit image of the last item
- * we looked at in the parent, and the update algorithm guarantees
- * that if items above us in the tree move, they only move right.
+ * This is possible because we save the downlink from the parent item,
+ * which is enough to uniquely identify it. Insertions into the parent
+ * level could cause the item to move right; deletions could cause it
+ * to move left, but not left of the page we previously found it in.
*
- * Also, re-set bts_blkno & bts_offset if changed.
+ * Adjusts bts_blkno & bts_offset if changed.
+ *
+ * Returns InvalidBuffer if item not found (should not happen).
*/
static Buffer
_bt_getstackbuf(Relation rel, BTStack stack, int access)
{
BlockNumber blkno;
- Buffer buf;
- OffsetNumber start,
- offnum,
- maxoff;
- Page page;
- ItemId itemid;
- BTItem item;
- BTPageOpaque opaque;
+ OffsetNumber start;
blkno = stack->bts_blkno;
- buf = _bt_getbuf(rel, blkno, access);
- page = BufferGetPage(buf);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- maxoff = PageGetMaxOffsetNumber(page);
-
start = stack->bts_offset;
- /*
- * _bt_insertonpg set bts_offset to InvalidOffsetNumber in the case of
- * concurrent ROOT page split. Also, watch out for possibility that
- * page has a high key now when it didn't before.
- */
- if (start < P_FIRSTDATAKEY(opaque))
- start = P_FIRSTDATAKEY(opaque);
-
for (;;)
{
- /* see if it's on this page */
+ Buffer buf;
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber offnum,
+ minoff,
+ maxoff;
+ ItemId itemid;
+ BTItem item;
+
+ buf = _bt_getbuf(rel, blkno, access);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ /*
+ * start = InvalidOffsetNumber means "search the whole page".
+		 * We need this test anyway due to the possibility that the
+		 * page has a high key now when it didn't before.
+ */
+ if (start < minoff)
+ start = minoff;
+
+ /*
+ * These loops will check every item on the page --- but in an order
+ * that's attuned to the probability of where it actually is. Scan
+ * to the right first, then to the left.
+ */
for (offnum = start;
offnum <= maxoff;
offnum = OffsetNumberNext(offnum))
}
}
+ for (offnum = OffsetNumberPrev(start);
+ offnum >= minoff;
+ offnum = OffsetNumberPrev(offnum))
+ {
+ itemid = PageGetItemId(page, offnum);
+ item = (BTItem) PageGetItem(page, itemid);
+ if (BTItemSame(item, &stack->bts_btitem))
+ {
+ /* Return accurate pointer to where link is now */
+ stack->bts_blkno = blkno;
+ stack->bts_offset = offnum;
+ return buf;
+ }
+ }
+
/*
- * by here, the item we're looking for moved right at least one
- * page
+ * The item we're looking for moved right at least one page.
*/
if (P_RIGHTMOST(opaque))
{
_bt_relbuf(rel, buf);
return (InvalidBuffer);
}
-
blkno = opaque->btpo_next;
+ start = InvalidOffsetNumber;
_bt_relbuf(rel, buf);
- buf = _bt_getbuf(rel, blkno, access);
- page = BufferGetPage(buf);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- maxoff = PageGetMaxOffsetNumber(page);
- start = P_FIRSTDATAKEY(opaque);
}
}
Page metapg;
BTMetaPageData *metad;
+ lbkno = BufferGetBlockNumber(lbuf);
+ rbkno = BufferGetBlockNumber(rbuf);
+ lpage = BufferGetPage(lbuf);
+ rpage = BufferGetPage(rbuf);
+
/* get a new root page */
rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
rootpage = BufferGetPage(rootbuf);
/* set btree special data */
rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
- rootopaque->btpo_flags |= BTP_ROOT;
- rootopaque->btpo_parent = BTREE_METAPAGE;
+ rootopaque->btpo_flags = BTP_ROOT;
+ rootopaque->btpo.level =
+ ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo.level + 1;
- lbkno = BufferGetBlockNumber(lbuf);
- rbkno = BufferGetBlockNumber(rbuf);
- lpage = BufferGetPage(lbuf);
- rpage = BufferGetPage(rbuf);
-
- /*
- * Make sure pages in old root level have valid parent links --- we
- * will need this in _bt_insertonpg() if a concurrent root split
- * happens (see README).
- */
- ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo_parent =
- ((BTPageOpaque) PageGetSpecialPointer(rpage))->btpo_parent =
- rootblknum;
+ /* update metapage data */
+ metad->btm_root = rootblknum;
+ metad->btm_level = rootopaque->btpo.level;
+ metad->btm_fastroot = rootblknum;
+ metad->btm_fastlevel = rootopaque->btpo.level;
/*
* Create downlink item for left page (old root). Since this will be
elog(PANIC, "btree: failed to add rightkey to new root page");
pfree(new_item);
- metad->btm_root = rootblknum;
- (metad->btm_level)++;
-
/* XLOG stuff */
if (!rel->rd_istemp)
{
XLogRecData rdata[2];
xlrec.node = rel->rd_node;
+ xlrec.rootblk = rootblknum;
xlrec.level = metad->btm_level;
- BlockIdSet(&(xlrec.rootblk), rootblknum);
+
rdata[0].buffer = InvalidBuffer;
rdata[0].data = (char *) &xlrec;
rdata[0].len = SizeOfBtreeNewroot;
PageSetSUI(rootpage, ThisStartUpID);
PageSetLSN(metapg, recptr);
PageSetSUI(metapg, ThisStartUpID);
-
- /* we changed their btpo_parent */
PageSetLSN(lpage, recptr);
PageSetSUI(lpage, ThisStartUpID);
PageSetLSN(rpage, recptr);
return (rootbuf);
}
-/*
- * In the event old root page was splitted but no new one was created we
- * build required parent levels keeping write lock on old root page.
- * Note: it's assumed that old root page' btpo_parent points to meta page,
- * ie not to parent page. On exit, new root page buffer is write locked.
- * If "release" is TRUE then oldrootbuf will be released immediately
- * after upper level is builded.
- */
-Buffer
-_bt_fixroot(Relation rel, Buffer oldrootbuf, bool release)
-{
- Buffer rootbuf;
- BlockNumber rootblk;
- Page rootpage;
- XLogRecPtr rootLSN;
- Page oldrootpage = BufferGetPage(oldrootbuf);
- BTPageOpaque oldrootopaque = (BTPageOpaque)
- PageGetSpecialPointer(oldrootpage);
- Buffer buf,
- leftbuf,
- rightbuf;
- Page page,
- leftpage,
- rightpage;
- BTPageOpaque opaque,
- leftopaque,
- rightopaque;
- OffsetNumber newitemoff;
- BTItem btitem,
- ritem;
- Size itemsz;
-
- if (!P_LEFTMOST(oldrootopaque) || P_RIGHTMOST(oldrootopaque))
- elog(ERROR, "bt_fixroot: not valid old root page");
-
- /* Read right neighbor and create new root page */
- leftbuf = _bt_getbuf(rel, oldrootopaque->btpo_next, BT_WRITE);
- leftpage = BufferGetPage(leftbuf);
- leftopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
- rootbuf = _bt_newroot(rel, oldrootbuf, leftbuf);
- rootpage = BufferGetPage(rootbuf);
- rootLSN = PageGetLSN(rootpage);
- rootblk = BufferGetBlockNumber(rootbuf);
-
- /* parent page where to insert pointers */
- buf = rootbuf;
- page = BufferGetPage(buf);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
- /*
- * Now read other pages (if any) on level and add them to new root.
- * Here we break one of our locking rules - never hold lock on parent
- * page when acquiring lock on its child, - but we free from deadlock:
- *
- * If concurrent process will split one of pages on this level then it
- * will see either btpo_parent == metablock or btpo_parent == rootblk.
- * In first case it will give up its locks and walk to the leftmost
- * page (oldrootbuf) in _bt_fixup() - ie it will wait for us and let
- * us continue. In second case it will try to lock rootbuf keeping its
- * locks on buffers we already passed, also waiting for us. If we'll
- * have to unlock rootbuf (split it) and that process will have to
- * split page of new level we created (level of rootbuf) then it will
- * wait while we create upper level. Etc.
- */
- while (!P_RIGHTMOST(leftopaque))
- {
- rightbuf = _bt_getbuf(rel, leftopaque->btpo_next, BT_WRITE);
- rightpage = BufferGetPage(rightbuf);
- rightopaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
-
- /*
- * Update LSN & StartUpID of child page buffer to ensure that it
- * will be written on disk after flushing log record for new root
- * creation. Unfortunately, for the moment (?) we do not log this
- * operation and so possibly break our rule to log entire page
- * content on first after checkpoint modification.
- */
- HOLD_INTERRUPTS();
- rightopaque->btpo_parent = rootblk;
- if (XLByteLT(PageGetLSN(rightpage), rootLSN))
- PageSetLSN(rightpage, rootLSN);
- PageSetSUI(rightpage, ThisStartUpID);
- RESUME_INTERRUPTS();
-
- ritem = (BTItem) PageGetItem(leftpage, PageGetItemId(leftpage, P_HIKEY));
- btitem = _bt_formitem(&(ritem->bti_itup));
- ItemPointerSet(&(btitem->bti_itup.t_tid), leftopaque->btpo_next, P_HIKEY);
- itemsz = IndexTupleDSize(btitem->bti_itup)
- + (sizeof(BTItemData) - sizeof(IndexTupleData));
- itemsz = MAXALIGN(itemsz);
-
- newitemoff = OffsetNumberNext(PageGetMaxOffsetNumber(page));
-
- if (PageGetFreeSpace(page) < itemsz)
- {
- Buffer newbuf;
- OffsetNumber firstright;
- OffsetNumber itup_off;
- BlockNumber itup_blkno;
- bool newitemonleft;
-
- firstright = _bt_findsplitloc(rel, page,
- newitemoff, itemsz, &newitemonleft);
- newbuf = _bt_split(rel, buf, firstright,
- newitemoff, itemsz, btitem, newitemonleft,
- &itup_off, &itup_blkno);
- /* Keep lock on new "root" buffer ! */
- if (buf != rootbuf)
- _bt_relbuf(rel, buf);
- buf = newbuf;
- page = BufferGetPage(buf);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- }
- else
- _bt_insertuple(rel, buf, itemsz, btitem, newitemoff);
-
- /* give up left buffer */
- _bt_wrtbuf(rel, leftbuf);
- pfree(btitem);
- leftbuf = rightbuf;
- leftpage = rightpage;
- leftopaque = rightopaque;
- }
-
- /* give up rightmost page buffer */
- _bt_wrtbuf(rel, leftbuf);
-
- /*
- * Here we hold locks on old root buffer, new root buffer we've
- * created with _bt_newroot() - rootbuf, - and buf we've used for last
- * insert ops - buf. If rootbuf != buf then we have to create at least
- * one more level. And if "release" is TRUE then we give up
- * oldrootbuf.
- */
- if (release)
- _bt_wrtbuf(rel, oldrootbuf);
-
- if (rootbuf != buf)
- {
- _bt_wrtbuf(rel, buf);
- return (_bt_fixroot(rel, rootbuf, true));
- }
-
- return (rootbuf);
-}
-
-/*
- * Using blkno of leftmost page on a level inside tree this func
- * checks/fixes tree from this level up to the root page.
- */
-static void
-_bt_fixtree(Relation rel, BlockNumber blkno)
-{
- Buffer buf;
- Page page;
- BTPageOpaque opaque;
- BlockNumber pblkno;
-
- for (;;)
- {
- buf = _bt_getbuf(rel, blkno, BT_READ);
- page = BufferGetPage(buf);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- if (!P_LEFTMOST(opaque) || P_ISLEAF(opaque))
- elog(ERROR, "bt_fixtree[%s]: invalid start page (need to recreate index)", RelationGetRelationName(rel));
- pblkno = opaque->btpo_parent;
-
- /* check/fix entire level */
- _bt_fixlevel(rel, buf, InvalidBlockNumber);
-
- /*
- * No pins/locks are held here. Re-read start page if its
- * btpo_parent pointed to meta page else go up one level.
- *
- * XXX have to catch InvalidBlockNumber at the moment -:(
- */
- if (pblkno == BTREE_METAPAGE || pblkno == InvalidBlockNumber)
- {
- buf = _bt_getbuf(rel, blkno, BT_WRITE);
- page = BufferGetPage(buf);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- if (P_ISROOT(opaque))
- {
- /* Tree is Ok now */
- _bt_relbuf(rel, buf);
- return;
- }
- /* Call _bt_fixroot() if there is no upper level */
- if (BTreeInvalidParent(opaque))
- {
- elog(WARNING, "bt_fixtree[%s]: fixing root page", RelationGetRelationName(rel));
- buf = _bt_fixroot(rel, buf, true);
- _bt_relbuf(rel, buf);
- return;
- }
- /* Have to go up one level */
- pblkno = opaque->btpo_parent;
- _bt_relbuf(rel, buf);
- }
- blkno = pblkno;
- }
-
-}
-
-/*
- * Check/fix level starting from page in buffer buf up to block
- * limit on *child* level (or till rightmost child page if limit
- * is InvalidBlockNumber). Start buffer must be read locked.
- * No pins/locks are held on exit.
- */
-static void
-_bt_fixlevel(Relation rel, Buffer buf, BlockNumber limit)
-{
- BlockNumber blkno = BufferGetBlockNumber(buf);
- Page page;
- BTPageOpaque opaque;
- BlockNumber cblkno[3];
- OffsetNumber coff[3];
- Buffer cbuf[3];
- Page cpage[3];
- BTPageOpaque copaque[3];
- BTItem btitem;
- int cidx,
- i;
- bool goodbye = false;
- char tbuf[BLCKSZ];
-
- page = BufferGetPage(buf);
- /* copy page to temp storage */
- memmove(tbuf, page, PageGetPageSize(page));
- _bt_relbuf(rel, buf);
-
- page = (Page) tbuf;
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
- /* Initialize first child data */
- coff[0] = P_FIRSTDATAKEY(opaque);
- if (coff[0] > PageGetMaxOffsetNumber(page))
- elog(ERROR, "bt_fixlevel[%s]: invalid maxoff on start page (need to recreate index)", RelationGetRelationName(rel));
- btitem = (BTItem) PageGetItem(page, PageGetItemId(page, coff[0]));
- cblkno[0] = ItemPointerGetBlockNumber(&(btitem->bti_itup.t_tid));
- cbuf[0] = _bt_getbuf(rel, cblkno[0], BT_READ);
- cpage[0] = BufferGetPage(cbuf[0]);
- copaque[0] = (BTPageOpaque) PageGetSpecialPointer(cpage[0]);
- if (P_LEFTMOST(opaque) && !P_LEFTMOST(copaque[0]))
- elog(ERROR, "bt_fixtlevel[%s]: non-leftmost child page of leftmost parent (need to recreate index)", RelationGetRelationName(rel));
- /* caller should take care and avoid this */
- if (P_RIGHTMOST(copaque[0]))
- elog(ERROR, "bt_fixtlevel[%s]: invalid start child (need to recreate index)", RelationGetRelationName(rel));
-
- for (;;)
- {
- /*
- * Read up to 2 more child pages and look for pointers to them in
- * *saved* parent page
- */
- coff[1] = coff[2] = InvalidOffsetNumber;
- for (cidx = 0; cidx < 2;)
- {
- cidx++;
- cblkno[cidx] = (copaque[cidx - 1])->btpo_next;
- cbuf[cidx] = _bt_getbuf(rel, cblkno[cidx], BT_READ);
- cpage[cidx] = BufferGetPage(cbuf[cidx]);
- copaque[cidx] = (BTPageOpaque) PageGetSpecialPointer(cpage[cidx]);
- coff[cidx] = _bt_getoff(page, cblkno[cidx]);
-
- /* sanity check */
- if (coff[cidx] != InvalidOffsetNumber)
- {
- for (i = cidx - 1; i >= 0; i--)
- {
- if (coff[i] == InvalidOffsetNumber)
- continue;
- if (coff[cidx] != coff[i] + 1)
- elog(ERROR, "bt_fixlevel[%s]: invalid item order(1) (need to recreate index)", RelationGetRelationName(rel));
- break;
- }
- }
-
- if (P_RIGHTMOST(copaque[cidx]))
- break;
- }
-
- /*
- * Read parent page and insert missed pointers.
- */
- if (coff[1] == InvalidOffsetNumber ||
- (cidx == 2 && coff[2] == InvalidOffsetNumber))
- {
- Buffer newbuf;
- Page newpage;
- BTPageOpaque newopaque;
- BTItem ritem;
- Size itemsz;
- OffsetNumber newitemoff;
- BlockNumber parblk[3];
- BTStackData stack;
-
- stack.bts_parent = NULL;
- stack.bts_blkno = blkno;
- stack.bts_offset = InvalidOffsetNumber;
- ItemPointerSet(&(stack.bts_btitem.bti_itup.t_tid),
- cblkno[0], P_HIKEY);
-
- buf = _bt_getstackbuf(rel, &stack, BT_WRITE);
- if (buf == InvalidBuffer)
- elog(ERROR, "bt_fixlevel[%s]: pointer disappeared (need to recreate index)", RelationGetRelationName(rel));
-
- page = BufferGetPage(buf);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- coff[0] = stack.bts_offset;
- blkno = BufferGetBlockNumber(buf);
- parblk[0] = blkno;
-
- /* Check/insert missed pointers */
- for (i = 1; i <= cidx; i++)
- {
- coff[i] = _bt_getoff(page, cblkno[i]);
-
- /* sanity check */
- parblk[i] = BufferGetBlockNumber(buf);
- if (coff[i] != InvalidOffsetNumber)
- {
- if (parblk[i] == parblk[i - 1] &&
- coff[i] != coff[i - 1] + 1)
- elog(ERROR, "bt_fixlevel[%s]: invalid item order(2) (need to recreate index)", RelationGetRelationName(rel));
- continue;
- }
- /* Have to check next page ? */
- if ((!P_RIGHTMOST(opaque)) &&
- coff[i - 1] == PageGetMaxOffsetNumber(page)) /* yes */
- {
- newbuf = _bt_getbuf(rel, opaque->btpo_next, BT_WRITE);
- newpage = BufferGetPage(newbuf);
- newopaque = (BTPageOpaque) PageGetSpecialPointer(newpage);
- coff[i] = _bt_getoff(newpage, cblkno[i]);
- if (coff[i] != InvalidOffsetNumber) /* found ! */
- {
- if (coff[i] != P_FIRSTDATAKEY(newopaque))
- elog(ERROR, "bt_fixlevel[%s]: invalid item order(3) (need to recreate index)", RelationGetRelationName(rel));
- _bt_relbuf(rel, buf);
- buf = newbuf;
- page = newpage;
- opaque = newopaque;
- blkno = BufferGetBlockNumber(buf);
- parblk[i] = blkno;
- continue;
- }
- /* unfound - need to insert on current page */
- _bt_relbuf(rel, newbuf);
- }
- /* insert pointer */
- ritem = (BTItem) PageGetItem(cpage[i - 1],
- PageGetItemId(cpage[i - 1], P_HIKEY));
- btitem = _bt_formitem(&(ritem->bti_itup));
- ItemPointerSet(&(btitem->bti_itup.t_tid), cblkno[i], P_HIKEY);
- itemsz = IndexTupleDSize(btitem->bti_itup)
- + (sizeof(BTItemData) - sizeof(IndexTupleData));
- itemsz = MAXALIGN(itemsz);
-
- newitemoff = coff[i - 1] + 1;
-
- if (PageGetFreeSpace(page) < itemsz)
- {
- OffsetNumber firstright;
- OffsetNumber itup_off;
- BlockNumber itup_blkno;
- bool newitemonleft;
-
- firstright = _bt_findsplitloc(rel, page,
- newitemoff, itemsz, &newitemonleft);
- newbuf = _bt_split(rel, buf, firstright,
- newitemoff, itemsz, btitem, newitemonleft,
- &itup_off, &itup_blkno);
- /* what buffer we need in ? */
- if (newitemonleft)
- _bt_relbuf(rel, newbuf);
- else
- {
- _bt_relbuf(rel, buf);
- buf = newbuf;
- page = BufferGetPage(buf);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- }
- blkno = BufferGetBlockNumber(buf);
- coff[i] = itup_off;
- }
- else
- {
- _bt_insertuple(rel, buf, itemsz, btitem, newitemoff);
- coff[i] = newitemoff;
- }
-
- pfree(btitem);
- parblk[i] = blkno;
- }
-
- /* copy page with pointer to cblkno[cidx] to temp storage */
- memmove(tbuf, page, PageGetPageSize(page));
- _bt_relbuf(rel, buf);
- page = (Page) tbuf;
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- }
-
- /* Continue if current check/fix level page is rightmost */
- if (P_RIGHTMOST(opaque))
- goodbye = false;
-
- /* Pointers to child pages are Ok - right end of child level ? */
- _bt_relbuf(rel, cbuf[0]);
- _bt_relbuf(rel, cbuf[1]);
- if (cidx == 1 ||
- (cidx == 2 && (P_RIGHTMOST(copaque[2]) || goodbye)))
- {
- if (cidx == 2)
- _bt_relbuf(rel, cbuf[2]);
- return;
- }
- if (cblkno[0] == limit || cblkno[1] == limit)
- goodbye = true;
- cblkno[0] = cblkno[2];
- cbuf[0] = cbuf[2];
- cpage[0] = cpage[2];
- copaque[0] = copaque[2];
- coff[0] = coff[2];
- }
-}
-
-/*
- * Check/fix part of tree - branch - up from parent of level with blocks
- * lblkno and rblknum. We first ensure that parent level has pointers
- * to both lblkno & rblknum and if those pointers are on different
- * parent pages then do the same for parent level, etc. No locks must
- * be held on target level and upper on entry. No locks will be held
- * on exit. Stack created when traversing tree down should be provided and
- * it must points to parent level. rblkno must be on the right from lblkno.
- * (This function is special edition of more expensive _bt_fixtree(),
- * but it doesn't guarantee full consistency of tree.)
- */
-static void
-_bt_fixbranch(Relation rel, BlockNumber lblkno,
- BlockNumber rblkno, BTStack true_stack)
-{
- BlockNumber blkno = true_stack->bts_blkno;
- BTStackData stack;
- BTPageOpaque opaque;
- Buffer buf,
- rbuf;
- Page page;
- OffsetNumber offnum;
-
- true_stack = true_stack->bts_parent;
- for (;;)
- {
- buf = _bt_getbuf(rel, blkno, BT_READ);
-
- /* Check/fix parent level pointed by blkno */
- _bt_fixlevel(rel, buf, rblkno);
-
- /*
- * Here parent level should have pointers for both lblkno and
- * rblkno and we have to find them.
- */
- stack.bts_parent = NULL;
- stack.bts_blkno = blkno;
- stack.bts_offset = InvalidOffsetNumber;
- ItemPointerSet(&(stack.bts_btitem.bti_itup.t_tid), lblkno, P_HIKEY);
- buf = _bt_getstackbuf(rel, &stack, BT_READ);
- if (buf == InvalidBuffer)
- elog(ERROR, "bt_fixbranch[%s]: left pointer unfound (need to recreate index)", RelationGetRelationName(rel));
- page = BufferGetPage(buf);
- offnum = _bt_getoff(page, rblkno);
-
- if (offnum != InvalidOffsetNumber) /* right pointer found */
- {
- if (offnum <= stack.bts_offset)
- elog(ERROR, "bt_fixbranch[%s]: invalid item order (need to recreate index)", RelationGetRelationName(rel));
- _bt_relbuf(rel, buf);
- return;
- }
-
- /* Pointers are on different parent pages - find right one */
- lblkno = BufferGetBlockNumber(buf);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- if (P_RIGHTMOST(opaque))
- elog(ERROR, "bt_fixbranch[%s]: right pointer unfound(1) (need to recreate index)", RelationGetRelationName(rel));
-
- stack.bts_parent = NULL;
- stack.bts_blkno = opaque->btpo_next;
- stack.bts_offset = InvalidOffsetNumber;
- ItemPointerSet(&(stack.bts_btitem.bti_itup.t_tid), rblkno, P_HIKEY);
- rbuf = _bt_getstackbuf(rel, &stack, BT_READ);
- if (rbuf == InvalidBuffer)
- elog(ERROR, "bt_fixbranch[%s]: right pointer unfound(2) (need to recreate index)", RelationGetRelationName(rel));
- rblkno = BufferGetBlockNumber(rbuf);
- _bt_relbuf(rel, rbuf);
-
- /*
- * If we have parent item in true_stack then go up one level and
- * ensure that it has pointers to new lblkno & rblkno.
- */
- if (true_stack)
- {
- _bt_relbuf(rel, buf);
- blkno = true_stack->bts_blkno;
- true_stack = true_stack->bts_parent;
- continue;
- }
-
- /*
- * Well, we are on the level that was root or unexistent when we
- * started traversing tree down. If btpo_parent is updated then
- * we'll use it to continue, else we'll fix/restore upper levels
- * entirely.
- */
- if (!BTreeInvalidParent(opaque))
- {
- blkno = opaque->btpo_parent;
- _bt_relbuf(rel, buf);
- continue;
- }
-
- /* Have to switch to excl buf lock and re-check btpo_parent */
- _bt_relbuf(rel, buf);
- buf = _bt_getbuf(rel, blkno, BT_WRITE);
- page = BufferGetPage(buf);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- if (!BTreeInvalidParent(opaque))
- {
- blkno = opaque->btpo_parent;
- _bt_relbuf(rel, buf);
- continue;
- }
-
- /*
- * We hold excl lock on some internal page with unupdated
- * btpo_parent - time for _bt_fixup.
- */
- break;
- }
-
- elog(WARNING, "bt_fixbranch[%s]: fixing upper levels", RelationGetRelationName(rel));
- _bt_fixup(rel, buf);
-
- return;
-}
-
-/*
- * Having buf excl locked this routine walks to the left on level and
- * uses either _bt_fixtree() or _bt_fixroot() to create/check&fix upper
- * levels. No buffer pins/locks will be held on exit.
- */
-static void
-_bt_fixup(Relation rel, Buffer buf)
-{
- Page page;
- BTPageOpaque opaque;
- BlockNumber blkno;
-
- for (;;)
- {
- page = BufferGetPage(buf);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
- /*
- * If someone else already created parent pages then it's time for
- * _bt_fixtree() to check upper levels and fix them, if required.
- */
- if (!BTreeInvalidParent(opaque))
- {
- blkno = opaque->btpo_parent;
- _bt_relbuf(rel, buf);
- elog(WARNING, "bt_fixup[%s]: checking/fixing upper levels", RelationGetRelationName(rel));
- _bt_fixtree(rel, blkno);
- return;
- }
- if (P_LEFTMOST(opaque))
- break;
- blkno = opaque->btpo_prev;
- _bt_relbuf(rel, buf);
- buf = _bt_getbuf(rel, blkno, BT_WRITE);
- }
-
- /*
- * Ok, we are on the leftmost page, it's write locked by us and its
- * btpo_parent points to meta page - time for _bt_fixroot().
- */
- elog(WARNING, "bt_fixup[%s]: fixing root page", RelationGetRelationName(rel));
- buf = _bt_fixroot(rel, buf, true);
- _bt_relbuf(rel, buf);
-}
-
-static OffsetNumber
-_bt_getoff(Page page, BlockNumber blkno)
-{
- BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
- OffsetNumber offnum = P_FIRSTDATAKEY(opaque);
- BlockNumber curblkno;
- ItemId itemid;
- BTItem item;
-
- for (; offnum <= maxoff; offnum++)
- {
- itemid = PageGetItemId(page, offnum);
- item = (BTItem) PageGetItem(page, itemid);
- curblkno = ItemPointerGetBlockNumber(&(item->bti_itup.t_tid));
- if (curblkno == blkno)
- return (offnum);
- }
-
- return (InvalidOffsetNumber);
-}
-
/*
* _bt_pgaddtup() -- add a tuple to a particular page in the index.
*
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.58 2002/08/06 02:36:33 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.59 2003/02/21 00:06:21 tgl Exp $
*
* NOTES
* Postgres btree pages look like ordinary relation pages. The opaque
#define USELOCKING (!BuildingBtree && !IsInitProcessingMode())
+
/*
- * _bt_metapinit() -- Initialize the metadata page of a btree.
+ * _bt_metapinit() -- Initialize the metadata page of a new btree.
*/
void
_bt_metapinit(Relation rel)
{
Buffer buf;
Page pg;
- BTMetaPageData metad;
+ BTMetaPageData *metad;
BTPageOpaque op;
/* can't be sharing this with anyone, now... */
RelationGetRelationName(rel));
buf = ReadBuffer(rel, P_NEW);
+ Assert(BufferGetBlockNumber(buf) == BTREE_METAPAGE);
pg = BufferGetPage(buf);
+
+ /* NO ELOG(ERROR) from here till newmeta op is logged */
+ START_CRIT_SECTION();
+
_bt_pageinit(pg, BufferGetPageSize(buf));
- metad.btm_magic = BTREE_MAGIC;
- metad.btm_version = BTREE_VERSION;
- metad.btm_root = P_NONE;
- metad.btm_level = 0;
- memcpy((char *) BTPageGetMeta(pg), (char *) &metad, sizeof(metad));
+ metad = BTPageGetMeta(pg);
+ metad->btm_magic = BTREE_MAGIC;
+ metad->btm_version = BTREE_VERSION;
+ metad->btm_root = P_NONE;
+ metad->btm_level = 0;
+ metad->btm_fastroot = P_NONE;
+ metad->btm_fastlevel = 0;
op = (BTPageOpaque) PageGetSpecialPointer(pg);
op->btpo_flags = BTP_META;
+ /* XLOG stuff */
+ if (!rel->rd_istemp)
+ {
+ xl_btree_newmeta xlrec;
+ XLogRecPtr recptr;
+ XLogRecData rdata[1];
+
+ xlrec.node = rel->rd_node;
+ xlrec.meta.root = metad->btm_root;
+ xlrec.meta.level = metad->btm_level;
+ xlrec.meta.fastroot = metad->btm_fastroot;
+ xlrec.meta.fastlevel = metad->btm_fastlevel;
+
+ rdata[0].buffer = InvalidBuffer;
+ rdata[0].data = (char *) &xlrec;
+ rdata[0].len = SizeOfBtreeNewmeta;
+ rdata[0].next = NULL;
+
+ recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWMETA, rdata);
+
+ PageSetLSN(pg, recptr);
+ PageSetSUI(pg, ThisStartUpID);
+ }
+
+ END_CRIT_SECTION();
+
WriteBuffer(buf);
/* all done */
* NOTE that the returned root page will have only a read lock set
* on it even if access = BT_WRITE!
*
+ * The returned page is not necessarily the true root --- it could be
+ * a "fast root" (a page that is alone in its level due to deletions).
+ * Also, if the root page is split while we are "in flight" to it,
+ * what we will return is the old root, which is now just the leftmost
+ * page on a probably-not-very-wide level. For most purposes this is
+ * as good as or better than the true root, so we do not bother to
+ * insist on finding the true root.
+ *
* On successful return, the root page is pinned and read-locked.
* The metadata page is not locked or pinned on exit.
*/
rootblkno = BufferGetBlockNumber(rootbuf);
rootpage = BufferGetPage(rootbuf);
+ _bt_pageinit(rootpage, BufferGetPageSize(rootbuf));
+ rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
+ rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
+ rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
+ rootopaque->btpo.level = 0;
+
/* NO ELOG(ERROR) till meta is updated */
START_CRIT_SECTION();
metad->btm_root = rootblkno;
- metad->btm_level = 1;
-
- _bt_pageinit(rootpage, BufferGetPageSize(rootbuf));
- rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
- rootopaque->btpo_flags |= (BTP_LEAF | BTP_ROOT);
+ metad->btm_level = 0;
+ metad->btm_fastroot = rootblkno;
+ metad->btm_fastlevel = 0;
/* XLOG stuff */
if (!rel->rd_istemp)
XLogRecData rdata;
xlrec.node = rel->rd_node;
- xlrec.level = 1;
- BlockIdSet(&(xlrec.rootblk), rootblkno);
+ xlrec.rootblk = rootblkno;
+ xlrec.level = 0;
+
rdata.buffer = InvalidBuffer;
rdata.data = (char *) &xlrec;
rdata.len = SizeOfBtreeNewroot;
rdata.next = NULL;
- recptr = XLogInsert(RM_BTREE_ID,
- XLOG_BTREE_NEWROOT | XLOG_BTREE_LEAF,
- &rdata);
+ recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, &rdata);
PageSetLSN(rootpage, recptr);
PageSetSUI(rootpage, ThisStartUpID);
_bt_wrtnorelbuf(rel, rootbuf);
- /* swap write lock for read lock */
+ /*
+ * swap root write lock for read lock. There is no danger of
+ * anyone else accessing the new root page while it's unlocked,
+ * since no one else knows where it is yet.
+ */
LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
LockBuffer(rootbuf, BT_READ);
}
else
{
- rootblkno = metad->btm_root;
+ rootblkno = metad->btm_fastroot;
+
_bt_relbuf(rel, metabuf); /* done with the meta page */
rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
}
/*
- * Race condition: If the root page split between the time we looked
- * at the metadata page and got the root buffer, then we got the wrong
- * buffer. Release it and try again.
+ * By here, we have a pin and read lock on the root page, and no
+ * lock set on the metadata page. Return the root page's buffer.
*/
- rootpage = BufferGetPage(rootbuf);
- rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
+ return rootbuf;
+}
- if (!P_ISROOT(rootopaque))
- {
- /*
- * It happened, but if root page splitter failed to create new
- * root page then we'll go in loop trying to call _bt_getroot
- * again and again.
- */
- if (FixBTree)
- {
- Buffer newrootbuf;
+/*
+ * _bt_gettrueroot() -- Get the true root page of the btree.
+ *
+ * This is the same as the BT_READ case of _bt_getroot(), except
+ * we follow the true-root link not the fast-root link.
+ *
+ * By the time we acquire lock on the root page, it might have been split and
+ * not be the true root anymore. This is okay for the present uses of this
+ * routine; we only really need to be able to move up at least one tree level
+ * from whatever non-root page we were at. If we ever do need to lock the
+ * one true root page, we could loop here, re-reading the metapage on each
+ * failure. (Note that it wouldn't do to hold the lock on the metapage while
+ * moving to the root --- that'd deadlock against any concurrent root split.)
+ */
+Buffer
+_bt_gettrueroot(Relation rel)
+{
+ Buffer metabuf;
+ Page metapg;
+ BTPageOpaque metaopaque;
+ Buffer rootbuf;
+ BlockNumber rootblkno;
+ BTMetaPageData *metad;
- check_parent:;
- if (BTreeInvalidParent(rootopaque)) /* unupdated! */
- {
- LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
- LockBuffer(rootbuf, BT_WRITE);
-
- /* handle concurrent fix of root page */
- if (BTreeInvalidParent(rootopaque)) /* unupdated! */
- {
- elog(WARNING, "bt_getroot[%s]: fixing root page", RelationGetRelationName(rel));
- newrootbuf = _bt_fixroot(rel, rootbuf, true);
- LockBuffer(newrootbuf, BUFFER_LOCK_UNLOCK);
- LockBuffer(newrootbuf, BT_READ);
- rootbuf = newrootbuf;
- rootpage = BufferGetPage(rootbuf);
- rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
- /* New root might be splitted while changing lock */
- if (P_ISROOT(rootopaque))
- return (rootbuf);
- /* rootbuf is read locked */
- goto check_parent;
- }
- else
- {
- /* someone else already fixed root */
- LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
- LockBuffer(rootbuf, BT_READ);
- }
- }
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
+ metapg = BufferGetPage(metabuf);
+ metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
+ metad = BTPageGetMeta(metapg);
- /*
- * Ok, here we have old root page with btpo_parent pointing to
- * upper level - check parent page because of there is good
- * chance that parent is root page.
- */
- newrootbuf = _bt_getbuf(rel, rootopaque->btpo_parent, BT_READ);
- _bt_relbuf(rel, rootbuf);
- rootbuf = newrootbuf;
- rootpage = BufferGetPage(rootbuf);
- rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
- if (P_ISROOT(rootopaque))
- return (rootbuf);
- /* no luck -:( */
- }
+ if (!(metaopaque->btpo_flags & BTP_META) ||
+ metad->btm_magic != BTREE_MAGIC)
+ elog(ERROR, "Index %s is not a btree",
+ RelationGetRelationName(rel));
+
+ if (metad->btm_version != BTREE_VERSION)
+ elog(ERROR, "Version mismatch on %s: version %d file, version %d code",
+ RelationGetRelationName(rel),
+ metad->btm_version, BTREE_VERSION);
- /* try again */
- _bt_relbuf(rel, rootbuf);
- return _bt_getroot(rel, access);
+ /* if no root page initialized yet, fail */
+ if (metad->btm_root == P_NONE)
+ {
+ _bt_relbuf(rel, metabuf);
+ return InvalidBuffer;
}
- /*
- * By here, we have a correct lock on the root block, its reference
- * count is correct, and we have no lock set on the metadata page.
- * Return the root block.
- */
+ rootblkno = metad->btm_root;
+
+ _bt_relbuf(rel, metabuf); /* done with the meta page */
+
+ rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
+
return rootbuf;
}
/*
* _bt_pageinit() -- Initialize a new page.
+ *
+ * On return, the page header is initialized; data space is empty;
+ * special space is zeroed out.
*/
void
_bt_pageinit(Page page, Size size)
{
PageInit(page, size, sizeof(BTPageOpaqueData));
- ((BTPageOpaque) PageGetSpecialPointer(page))->btpo_parent =
- InvalidBlockNumber;
}
/*
* at least the old root page when you call this, you're making a big
* mistake. On exit, metapage data is correct and we no longer have
* a pin or lock on the metapage.
+ *
+ * XXX this is not used for splitting anymore, only in nbtsort.c at the
+ * completion of btree building.
*/
void
-_bt_metaproot(Relation rel, BlockNumber rootbknum, int level)
+_bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level)
{
Buffer metabuf;
Page metap;
metap = BufferGetPage(metabuf);
metaopaque = (BTPageOpaque) PageGetSpecialPointer(metap);
Assert(metaopaque->btpo_flags & BTP_META);
+
+ /* NO ELOG(ERROR) from here till newmeta op is logged */
+ START_CRIT_SECTION();
+
metad = BTPageGetMeta(metap);
metad->btm_root = rootbknum;
- if (level == 0) /* called from _do_insert */
- metad->btm_level += 1;
- else
- metad->btm_level = level; /* called from btsort */
+ metad->btm_level = level;
+ metad->btm_fastroot = rootbknum;
+ metad->btm_fastlevel = level;
+
+ /* XLOG stuff */
+ if (!rel->rd_istemp)
+ {
+ xl_btree_newmeta xlrec;
+ XLogRecPtr recptr;
+ XLogRecData rdata[1];
+
+ xlrec.node = rel->rd_node;
+ xlrec.meta.root = metad->btm_root;
+ xlrec.meta.level = metad->btm_level;
+ xlrec.meta.fastroot = metad->btm_fastroot;
+ xlrec.meta.fastlevel = metad->btm_fastlevel;
+
+ rdata[0].buffer = InvalidBuffer;
+ rdata[0].data = (char *) &xlrec;
+ rdata[0].len = SizeOfBtreeNewmeta;
+ rdata[0].next = NULL;
+
+ recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWMETA, rdata);
+
+ PageSetLSN(metap, recptr);
+ PageSetSUI(metap, ThisStartUpID);
+ }
+
+ END_CRIT_SECTION();
+
_bt_wrtbuf(rel, metabuf);
}
xlrec.target.node = rel->rd_node;
xlrec.target.tid = *tid;
+
rdata[0].buffer = InvalidBuffer;
rdata[0].data = (char *) &xlrec;
rdata[0].len = SizeOfBtreeDelete;
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.94 2002/11/15 01:26:08 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.95 2003/02/21 00:06:21 tgl Exp $
*
*-------------------------------------------------------------------------
*/
-
#include "postgres.h"
#include "access/genam.h"
#include "access/heapam.h"
#include "access/nbtree.h"
#include "catalog/index.h"
-#include "executor/executor.h"
#include "miscadmin.h"
-#include "storage/sinval.h"
-#include "access/xlogutils.h"
/* Working state for btbuild and its callback */
ItemPointerSet(current, blkno, offnum);
}
}
-
-static void
-_bt_restore_page(Page page, char *from, int len)
-{
- BTItemData btdata;
- Size itemsz;
- char *end = from + len;
-
- for (; from < end;)
- {
- memcpy(&btdata, from, sizeof(BTItemData));
- itemsz = IndexTupleDSize(btdata.bti_itup) +
- (sizeof(BTItemData) - sizeof(IndexTupleData));
- itemsz = MAXALIGN(itemsz);
- if (PageAddItem(page, (Item) from, itemsz,
- FirstOffsetNumber, LP_USED) == InvalidOffsetNumber)
- elog(PANIC, "_bt_restore_page: can't add item to page");
- from += itemsz;
- }
-}
-
-static void
-btree_xlog_delete(bool redo, XLogRecPtr lsn, XLogRecord *record)
-{
- xl_btree_delete *xlrec;
- Relation reln;
- Buffer buffer;
- Page page;
-
- if (!redo || (record->xl_info & XLR_BKP_BLOCK_1))
- return;
-
- xlrec = (xl_btree_delete *) XLogRecGetData(record);
- reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->target.node);
- if (!RelationIsValid(reln))
- return;
- buffer = XLogReadBuffer(false, reln,
- ItemPointerGetBlockNumber(&(xlrec->target.tid)));
- if (!BufferIsValid(buffer))
- elog(PANIC, "btree_delete_redo: block unfound");
- page = (Page) BufferGetPage(buffer);
- if (PageIsNew((PageHeader) page))
- elog(PANIC, "btree_delete_redo: uninitialized page");
-
- if (XLByteLE(lsn, PageGetLSN(page)))
- {
- UnlockAndReleaseBuffer(buffer);
- return;
- }
-
- PageIndexTupleDelete(page, ItemPointerGetOffsetNumber(&(xlrec->target.tid)));
-
- PageSetLSN(page, lsn);
- PageSetSUI(page, ThisStartUpID);
- UnlockAndWriteBuffer(buffer);
-
- return;
-}
-
-static void
-btree_xlog_insert(bool redo, XLogRecPtr lsn, XLogRecord *record)
-{
- xl_btree_insert *xlrec;
- Relation reln;
- Buffer buffer;
- Page page;
- BTPageOpaque pageop;
-
- if (redo && (record->xl_info & XLR_BKP_BLOCK_1))
- return;
-
- xlrec = (xl_btree_insert *) XLogRecGetData(record);
- reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->target.node);
- if (!RelationIsValid(reln))
- return;
- buffer = XLogReadBuffer(false, reln,
- ItemPointerGetBlockNumber(&(xlrec->target.tid)));
- if (!BufferIsValid(buffer))
- elog(PANIC, "btree_insert_%sdo: block unfound", (redo) ? "re" : "un");
- page = (Page) BufferGetPage(buffer);
- if (PageIsNew((PageHeader) page))
- elog(PANIC, "btree_insert_%sdo: uninitialized page", (redo) ? "re" : "un");
- pageop = (BTPageOpaque) PageGetSpecialPointer(page);
-
- if (redo)
- {
- if (XLByteLE(lsn, PageGetLSN(page)))
- {
- UnlockAndReleaseBuffer(buffer);
- return;
- }
- if (PageAddItem(page, (Item) ((char *) xlrec + SizeOfBtreeInsert),
- record->xl_len - SizeOfBtreeInsert,
- ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
- LP_USED) == InvalidOffsetNumber)
- elog(PANIC, "btree_insert_redo: failed to add item");
-
- PageSetLSN(page, lsn);
- PageSetSUI(page, ThisStartUpID);
- UnlockAndWriteBuffer(buffer);
- }
- else
- {
- if (XLByteLT(PageGetLSN(page), lsn))
- elog(PANIC, "btree_insert_undo: bad page LSN");
-
- if (!P_ISLEAF(pageop))
- {
- UnlockAndReleaseBuffer(buffer);
- return;
- }
-
- elog(PANIC, "btree_insert_undo: unimplemented");
- }
-
- return;
-}
-
-static void
-btree_xlog_split(bool redo, bool onleft, XLogRecPtr lsn, XLogRecord *record)
-{
- xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record);
- Relation reln;
- BlockNumber blkno;
- Buffer buffer;
- Page page;
- BTPageOpaque pageop;
- char *op = (redo) ? "redo" : "undo";
- bool isleaf = (record->xl_info & XLOG_BTREE_LEAF);
-
- reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->target.node);
- if (!RelationIsValid(reln))
- return;
-
- /* Left (original) sibling */
- blkno = (onleft) ? ItemPointerGetBlockNumber(&(xlrec->target.tid)) :
- BlockIdGetBlockNumber(&(xlrec->otherblk));
- buffer = XLogReadBuffer(false, reln, blkno);
- if (!BufferIsValid(buffer))
- elog(PANIC, "btree_split_%s: lost left sibling", op);
-
- page = (Page) BufferGetPage(buffer);
- if (redo)
- _bt_pageinit(page, BufferGetPageSize(buffer));
- else if (PageIsNew((PageHeader) page))
- elog(PANIC, "btree_split_undo: uninitialized left sibling");
- pageop = (BTPageOpaque) PageGetSpecialPointer(page);
-
- if (redo)
- {
- pageop->btpo_parent = BlockIdGetBlockNumber(&(xlrec->parentblk));
- pageop->btpo_prev = BlockIdGetBlockNumber(&(xlrec->leftblk));
- if (onleft)
- pageop->btpo_next = BlockIdGetBlockNumber(&(xlrec->otherblk));
- else
- pageop->btpo_next = ItemPointerGetBlockNumber(&(xlrec->target.tid));
- pageop->btpo_flags = (isleaf) ? BTP_LEAF : 0;
-
- _bt_restore_page(page, (char *) xlrec + SizeOfBtreeSplit, xlrec->leftlen);
-
- PageSetLSN(page, lsn);
- PageSetSUI(page, ThisStartUpID);
- UnlockAndWriteBuffer(buffer);
- }
- else
-/* undo */
- {
- if (XLByteLT(PageGetLSN(page), lsn))
- elog(PANIC, "btree_split_undo: bad left sibling LSN");
- elog(PANIC, "btree_split_undo: unimplemented");
- }
-
- /* Right (new) sibling */
- blkno = (onleft) ? BlockIdGetBlockNumber(&(xlrec->otherblk)) :
- ItemPointerGetBlockNumber(&(xlrec->target.tid));
- buffer = XLogReadBuffer((redo) ? true : false, reln, blkno);
- if (!BufferIsValid(buffer))
- elog(PANIC, "btree_split_%s: lost right sibling", op);
-
- page = (Page) BufferGetPage(buffer);
- if (redo)
- _bt_pageinit(page, BufferGetPageSize(buffer));
- else if (PageIsNew((PageHeader) page))
- elog(PANIC, "btree_split_undo: uninitialized right sibling");
- pageop = (BTPageOpaque) PageGetSpecialPointer(page);
-
- if (redo)
- {
- pageop->btpo_parent = BlockIdGetBlockNumber(&(xlrec->parentblk));
- pageop->btpo_prev = (onleft) ?
- ItemPointerGetBlockNumber(&(xlrec->target.tid)) :
- BlockIdGetBlockNumber(&(xlrec->otherblk));
- pageop->btpo_next = BlockIdGetBlockNumber(&(xlrec->rightblk));
- pageop->btpo_flags = (isleaf) ? BTP_LEAF : 0;
-
- _bt_restore_page(page,
- (char *) xlrec + SizeOfBtreeSplit + xlrec->leftlen,
- record->xl_len - SizeOfBtreeSplit - xlrec->leftlen);
-
- PageSetLSN(page, lsn);
- PageSetSUI(page, ThisStartUpID);
- UnlockAndWriteBuffer(buffer);
- }
- else
-/* undo */
- {
- if (XLByteLT(PageGetLSN(page), lsn))
- elog(PANIC, "btree_split_undo: bad right sibling LSN");
- elog(PANIC, "btree_split_undo: unimplemented");
- }
-
- if (!redo || (record->xl_info & XLR_BKP_BLOCK_1))
- return;
-
- /* Right (next) page */
- blkno = BlockIdGetBlockNumber(&(xlrec->rightblk));
- if (blkno == P_NONE)
- return;
-
- buffer = XLogReadBuffer(false, reln, blkno);
- if (!BufferIsValid(buffer))
- elog(PANIC, "btree_split_redo: lost next right page");
-
- page = (Page) BufferGetPage(buffer);
- if (PageIsNew((PageHeader) page))
- elog(PANIC, "btree_split_redo: uninitialized next right page");
-
- if (XLByteLE(lsn, PageGetLSN(page)))
- {
- UnlockAndReleaseBuffer(buffer);
- return;
- }
- pageop = (BTPageOpaque) PageGetSpecialPointer(page);
- pageop->btpo_prev = (onleft) ?
- BlockIdGetBlockNumber(&(xlrec->otherblk)) :
- ItemPointerGetBlockNumber(&(xlrec->target.tid));
-
- PageSetLSN(page, lsn);
- PageSetSUI(page, ThisStartUpID);
- UnlockAndWriteBuffer(buffer);
-}
-
-static void
-btree_xlog_newroot(bool redo, XLogRecPtr lsn, XLogRecord *record)
-{
- xl_btree_newroot *xlrec = (xl_btree_newroot *) XLogRecGetData(record);
- Relation reln;
- Buffer buffer;
- Page page;
- BTPageOpaque pageop;
- Buffer metabuf;
- Page metapg;
- BTMetaPageData md;
-
- if (!redo)
- return;
-
- reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->node);
- if (!RelationIsValid(reln))
- return;
- buffer = XLogReadBuffer(true, reln, BlockIdGetBlockNumber(&(xlrec->rootblk)));
- if (!BufferIsValid(buffer))
- elog(PANIC, "btree_newroot_redo: no root page");
- metabuf = XLogReadBuffer(false, reln, BTREE_METAPAGE);
- if (!BufferIsValid(buffer))
- elog(PANIC, "btree_newroot_redo: no metapage");
- page = (Page) BufferGetPage(buffer);
- _bt_pageinit(page, BufferGetPageSize(buffer));
- pageop = (BTPageOpaque) PageGetSpecialPointer(page);
-
- pageop->btpo_flags |= BTP_ROOT;
- pageop->btpo_prev = pageop->btpo_next = P_NONE;
- pageop->btpo_parent = BTREE_METAPAGE;
-
- if (record->xl_info & XLOG_BTREE_LEAF)
- pageop->btpo_flags |= BTP_LEAF;
-
- if (record->xl_len > SizeOfBtreeNewroot)
- _bt_restore_page(page,
- (char *) xlrec + SizeOfBtreeNewroot,
- record->xl_len - SizeOfBtreeNewroot);
-
- PageSetLSN(page, lsn);
- PageSetSUI(page, ThisStartUpID);
- UnlockAndWriteBuffer(buffer);
-
- metapg = BufferGetPage(metabuf);
- _bt_pageinit(metapg, BufferGetPageSize(metabuf));
- md.btm_magic = BTREE_MAGIC;
- md.btm_version = BTREE_VERSION;
- md.btm_root = BlockIdGetBlockNumber(&(xlrec->rootblk));
- md.btm_level = xlrec->level;
- memcpy((char *) BTPageGetMeta(metapg), (char *) &md, sizeof(md));
-
- pageop = (BTPageOpaque) PageGetSpecialPointer(metapg);
- pageop->btpo_flags = BTP_META;
-
- PageSetLSN(metapg, lsn);
- PageSetSUI(metapg, ThisStartUpID);
- UnlockAndWriteBuffer(metabuf);
-}
-
-void
-btree_redo(XLogRecPtr lsn, XLogRecord *record)
-{
- uint8 info = record->xl_info & ~XLR_INFO_MASK;
-
- info &= ~XLOG_BTREE_LEAF;
- if (info == XLOG_BTREE_DELETE)
- btree_xlog_delete(true, lsn, record);
- else if (info == XLOG_BTREE_INSERT)
- btree_xlog_insert(true, lsn, record);
- else if (info == XLOG_BTREE_SPLIT)
- btree_xlog_split(true, false, lsn, record); /* new item on the right */
- else if (info == XLOG_BTREE_SPLEFT)
- btree_xlog_split(true, true, lsn, record); /* new item on the left */
- else if (info == XLOG_BTREE_NEWROOT)
- btree_xlog_newroot(true, lsn, record);
- else
- elog(PANIC, "btree_redo: unknown op code %u", info);
-}
-
-void
-btree_undo(XLogRecPtr lsn, XLogRecord *record)
-{
- uint8 info = record->xl_info & ~XLR_INFO_MASK;
-
- info &= ~XLOG_BTREE_LEAF;
- if (info == XLOG_BTREE_DELETE)
- btree_xlog_delete(false, lsn, record);
- else if (info == XLOG_BTREE_INSERT)
- btree_xlog_insert(false, lsn, record);
- else if (info == XLOG_BTREE_SPLIT)
- btree_xlog_split(false, false, lsn, record); /* new item on the right */
- else if (info == XLOG_BTREE_SPLEFT)
- btree_xlog_split(false, true, lsn, record); /* new item on the left */
- else if (info == XLOG_BTREE_NEWROOT)
- btree_xlog_newroot(false, lsn, record);
- else
- elog(PANIC, "btree_undo: unknown op code %u", info);
-}
-
-static void
-out_target(char *buf, xl_btreetid *target)
-{
- sprintf(buf + strlen(buf), "node %u/%u; tid %u/%u",
- target->node.tblNode, target->node.relNode,
- ItemPointerGetBlockNumber(&(target->tid)),
- ItemPointerGetOffsetNumber(&(target->tid)));
-}
-
-void
-btree_desc(char *buf, uint8 xl_info, char *rec)
-{
- uint8 info = xl_info & ~XLR_INFO_MASK;
-
- info &= ~XLOG_BTREE_LEAF;
- if (info == XLOG_BTREE_INSERT)
- {
- xl_btree_insert *xlrec = (xl_btree_insert *) rec;
-
- strcat(buf, "insert: ");
- out_target(buf, &(xlrec->target));
- }
- else if (info == XLOG_BTREE_DELETE)
- {
- xl_btree_delete *xlrec = (xl_btree_delete *) rec;
-
- strcat(buf, "delete: ");
- out_target(buf, &(xlrec->target));
- }
- else if (info == XLOG_BTREE_SPLIT || info == XLOG_BTREE_SPLEFT)
- {
- xl_btree_split *xlrec = (xl_btree_split *) rec;
-
- sprintf(buf + strlen(buf), "split(%s): ",
- (info == XLOG_BTREE_SPLIT) ? "right" : "left");
- out_target(buf, &(xlrec->target));
- sprintf(buf + strlen(buf), "; oth %u; rgh %u",
- BlockIdGetBlockNumber(&xlrec->otherblk),
- BlockIdGetBlockNumber(&xlrec->rightblk));
- }
- else if (info == XLOG_BTREE_NEWROOT)
- {
- xl_btree_newroot *xlrec = (xl_btree_newroot *) rec;
-
- sprintf(buf + strlen(buf), "root: node %u/%u; blk %u",
- xlrec->node.tblNode, xlrec->node.relNode,
- BlockIdGetBlockNumber(&xlrec->rootblk));
- }
- else
- strcat(buf, "UNKNOWN");
-}
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.72 2002/06/20 20:29:25 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.73 2003/02/21 00:06:21 tgl Exp $
*
*-------------------------------------------------------------------------
*/
return true;
}
+/*
+ * _bt_get_endpoint() -- Find the first or last page on a given tree level
+ *
+ * If the index is empty, we will return InvalidBuffer; any other failure
+ * condition causes elog().
+ *
+ * The returned buffer is pinned and read-locked.
+ */
+Buffer
+_bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
+{
+ Buffer buf;
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber offnum;
+ BlockNumber blkno;
+ BTItem btitem;
+ IndexTuple itup;
+
+ /*
+ * If we are looking for a leaf page, okay to descend from fast root;
+ * otherwise better descend from true root. (There is no point in being
+ * smarter about intermediate levels.)
+ */
+ if (level == 0)
+ buf = _bt_getroot(rel, BT_READ);
+ else
+ buf = _bt_gettrueroot(rel);
+
+ if (!BufferIsValid(buf))
+ {
+ /* empty index... */
+ return InvalidBuffer;
+ }
+
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ for (;;)
+ {
+ /*
+ * If we landed on a deleted page, step right to find a live page
+ * (there must be one). Also, if we want the rightmost page,
+ * step right if needed to get to it (this could happen if the
+ * page split since we obtained a pointer to it).
+ */
+ while (P_ISDELETED(opaque) ||
+ (rightmost && !P_RIGHTMOST(opaque)))
+ {
+ blkno = opaque->btpo_next;
+ if (blkno == P_NONE)
+ elog(ERROR, "_bt_get_endpoint: ran off end of btree");
+ _bt_relbuf(rel, buf);
+ buf = _bt_getbuf(rel, blkno, BT_READ);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ }
+
+ /* Done? */
+ if (opaque->btpo.level == level)
+ break;
+ if (opaque->btpo.level < level)
+ elog(ERROR, "_bt_get_endpoint: btree level %u not found", level);
+
+ /* Step to leftmost or rightmost child page */
+ if (rightmost)
+ offnum = PageGetMaxOffsetNumber(page);
+ else
+ offnum = P_FIRSTDATAKEY(opaque);
+
+ btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
+ itup = &(btitem->bti_itup);
+ blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
+
+ _bt_relbuf(rel, buf);
+ buf = _bt_getbuf(rel, blkno, BT_READ);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ }
+
+ return buf;
+}
+
/*
* _bt_endpoint() -- Find the first or last key in the index.
*
Page page;
BTPageOpaque opaque;
ItemPointer current;
- OffsetNumber offnum,
- maxoff;
+ OffsetNumber maxoff;
OffsetNumber start;
BlockNumber blkno;
BTItem btitem;
* simplified version of _bt_search(). We don't maintain a stack
* since we know we won't need it.
*/
- buf = _bt_getroot(rel, BT_READ);
+ buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
if (!BufferIsValid(buf))
{
blkno = BufferGetBlockNumber(buf);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ Assert(P_ISLEAF(opaque));
- for (;;)
- {
- if (P_ISLEAF(opaque))
- break;
-
- if (ScanDirectionIsForward(dir))
- offnum = P_FIRSTDATAKEY(opaque);
- else
- offnum = PageGetMaxOffsetNumber(page);
-
- btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
- itup = &(btitem->bti_itup);
- blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
-
- _bt_relbuf(rel, buf);
- buf = _bt_getbuf(rel, blkno, BT_READ);
-
- page = BufferGetPage(buf);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
- /*
- * Race condition: If the child page we just stepped onto was just
- * split, we need to make sure we're all the way at the right edge
- * of the tree. See the paper by Lehman and Yao.
- */
- if (ScanDirectionIsBackward(dir) && !P_RIGHTMOST(opaque))
- {
- do
- {
- blkno = opaque->btpo_next;
- _bt_relbuf(rel, buf);
- buf = _bt_getbuf(rel, blkno, BT_READ);
- page = BufferGetPage(buf);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- } while (!P_RIGHTMOST(opaque));
- }
- }
-
- /* okay, we've got the {left,right}-most page in the tree */
maxoff = PageGetMaxOffsetNumber(page);
if (ScanDirectionIsForward(dir))
{
- Assert(P_LEFTMOST(opaque));
+ /* There could be dead pages to the left, so not this: */
+ /* Assert(P_LEFTMOST(opaque)); */
start = P_FIRSTDATAKEY(opaque);
}
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.70 2002/11/15 01:26:08 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.71 2003/02/21 00:06:21 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/nbtree.h"
+#include "miscadmin.h"
#include "utils/tuplesort.h"
BTItem btps_minkey; /* copy of minimum key (first item) on
* page */
OffsetNumber btps_lastoff; /* last item offset loaded */
- int btps_level; /* tree level (0 = leaf) */
+ uint32 btps_level; /* tree level (0 = leaf) */
Size btps_full; /* "full" if less than this much free
* space */
struct BTPageState *btps_next; /* link to parent level, if any */
0)
-static void _bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags);
-static BTPageState *_bt_pagestate(Relation index, int flags, int level);
+static void _bt_blnewpage(Relation index, Buffer *buf, Page *page,
+ uint32 level);
+static BTPageState *_bt_pagestate(Relation index, uint32 level);
static void _bt_slideleft(Relation index, Buffer buf, Page page);
static void _bt_sortaddtup(Page page, Size itemsize,
BTItem btitem, OffsetNumber itup_off);
* allocate a new, clean btree page, not linked to any siblings.
*/
static void
-_bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags)
+_bt_blnewpage(Relation index, Buffer *buf, Page *page, uint32 level)
{
BTPageOpaque opaque;
/* Initialize BT opaque state */
opaque = (BTPageOpaque) PageGetSpecialPointer(*page);
opaque->btpo_prev = opaque->btpo_next = P_NONE;
- opaque->btpo_flags = flags;
+ opaque->btpo.level = level;
+ opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
/* Make the P_HIKEY line pointer appear allocated */
((PageHeader) *page)->pd_lower += sizeof(ItemIdData);
}
+/*
+ * emit a completed btree page, and release the lock and pin on it.
+ * This is essentially _bt_wrtbuf except we also emit a WAL record.
+ */
+static void
+_bt_blwritepage(Relation index, Buffer buf)
+{
+ Page pg = BufferGetPage(buf);
+
+ /* NO ELOG(ERROR) from here till newpage op is logged */
+ START_CRIT_SECTION();
+
+ /* XLOG stuff */
+ /* temp relations are never WAL-logged */
+ if (!index->rd_istemp)
+ {
+ xl_btree_newpage xlrec;
+ XLogRecPtr recptr;
+ XLogRecData rdata[2];
+
+ xlrec.node = index->rd_node;
+ xlrec.blkno = BufferGetBlockNumber(buf);
+
+ rdata[0].buffer = InvalidBuffer;
+ rdata[0].data = (char *) &xlrec;
+ rdata[0].len = SizeOfBtreeNewpage;
+ rdata[0].next = &(rdata[1]);
+
+ /* the record body carries the complete page image */
+ rdata[1].buffer = buf;
+ rdata[1].data = (char *) pg;
+ rdata[1].len = BLCKSZ;
+ rdata[1].next = NULL;
+
+ recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWPAGE, rdata);
+
+ PageSetLSN(pg, recptr);
+ PageSetSUI(pg, ThisStartUpID);
+ }
+
+ END_CRIT_SECTION();
+
+ _bt_wrtbuf(index, buf);
+}
+
/*
* allocate and initialize a new BTPageState. the returned structure
* is suitable for immediate use by _bt_buildadd.
*/
static BTPageState *
-_bt_pagestate(Relation index, int flags, int level)
+_bt_pagestate(Relation index, uint32 level)
{
BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
/* create initial page */
- _bt_blnewpage(index, &(state->btps_buf), &(state->btps_page), flags);
+ _bt_blnewpage(index, &(state->btps_buf), &(state->btps_page), level);
state->btps_minkey = (BTItem) NULL;
/* initialize lastoff so first item goes into P_FIRSTKEY */
ItemId hii;
BTItem obti;
- /* Create new page */
- _bt_blnewpage(index, &nbuf, &npage,
- (state->btps_level > 0) ? 0 : BTP_LEAF);
+ /* Create new page on same level */
+ _bt_blnewpage(index, &nbuf, &npage, state->btps_level);
/*
* We copy the last item on the page into the new page, and then
* btree level.
*/
if (state->btps_next == (BTPageState *) NULL)
- {
- state->btps_next =
- _bt_pagestate(index, 0, state->btps_level + 1);
- }
+ state->btps_next = _bt_pagestate(index, state->btps_level + 1);
+
Assert(state->btps_minkey != NULL);
ItemPointerSet(&(state->btps_minkey->bti_itup.t_tid),
BufferGetBlockNumber(obuf), P_HIKEY);
state->btps_minkey = _bt_formitem(&(obti->bti_itup));
/*
- * Set the sibling links for both pages, and parent links too.
- *
- * It's not necessary to set the parent link at all, because it's
- * only used for handling concurrent root splits, but we may as
- * well do it as a debugging aid. Note we set new page's link as
- * well as old's, because if the new page turns out to be the last
- * of the level, _bt_uppershutdown won't change it. The links may
- * be out of date by the time the build finishes, but that's OK;
- * they need only point to a left-sibling of the true parent. See
- * the README file for more info.
+ * Set the sibling links for both pages.
*/
{
BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
oopaque->btpo_next = BufferGetBlockNumber(nbuf);
nopaque->btpo_prev = BufferGetBlockNumber(obuf);
- nopaque->btpo_next = P_NONE;
- oopaque->btpo_parent = nopaque->btpo_parent =
- BufferGetBlockNumber(state->btps_next->btps_buf);
+ nopaque->btpo_next = P_NONE; /* redundant */
}
/*
* can give up our lock (if we had one; most likely BuildingBtree
* is set, so we aren't locking).
*/
- _bt_wrtbuf(index, obuf);
+ _bt_blwritepage(index, obuf);
/*
* Reset last_off to point to new page
* slid back one slot. Then we can dump out the page.
*/
_bt_slideleft(index, s->btps_buf, s->btps_page);
- _bt_wrtbuf(index, s->btps_buf);
+ _bt_blwritepage(index, s->btps_buf);
}
}
/* When we see first tuple, create first index page */
if (state == NULL)
- state = _bt_pagestate(index, BTP_LEAF, 0);
+ state = _bt_pagestate(index, 0);
if (load1)
{
_bt_freeskey(indexScanKey);
}
else
-/* merge is unnecessary */
{
+ /* merge is unnecessary */
while (bti = (BTItem) tuplesort_getindextuple(btspool->sortstate, true, &should_free), bti != (BTItem) NULL)
{
/* When we see first tuple, create first index page */
if (state == NULL)
- state = _bt_pagestate(index, BTP_LEAF, 0);
+ state = _bt_pagestate(index, 0);
_bt_buildadd(index, state, bti);
if (should_free)
--- /dev/null
+/*-------------------------------------------------------------------------
+ *
+ * nbtxlog.c
+ * WAL replay logic for btrees.
+ *
+ *
+ * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.1 2003/02/21 00:06:21 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/nbtree.h"
+#include "access/xlogutils.h"
+
+
+/*
+ * We must keep track of expected insertions due to page splits, and apply
+ * them manually if they are not seen in the WAL log during replay. This
+ * makes it safe for page insertion to be a multiple-WAL-action process.
+ *
+ * The data structure is a simple linked list --- this should be good enough,
+ * since we don't expect a page split to remain incomplete for long.
+ */
+typedef struct bt_incomplete_split
+{
+ RelFileNode node; /* the index */
+ BlockNumber leftblk; /* left half of split */
+ BlockNumber rightblk; /* right half of split */
+ bool is_root; /* we split the root */
+} bt_incomplete_split;
+
+/* reset to NIL by btree_xlog_startup(); drained by btree_xlog_cleanup() */
+static List *incomplete_splits;
+
+
+/*
+ * Remember an expected parent-level downlink insertion for the split of
+ * leftblk/rightblk, so that replay can finish the split later if the
+ * matching insertion never appears in the WAL stream.
+ */
+static void
+log_incomplete_split(RelFileNode node, BlockNumber leftblk,
+ BlockNumber rightblk, bool is_root)
+{
+ bt_incomplete_split *split = palloc(sizeof(bt_incomplete_split));
+
+ split->node = node;
+ split->leftblk = leftblk;
+ split->rightblk = rightblk;
+ split->is_root = is_root;
+ incomplete_splits = lappend(incomplete_splits, split);
+}
+
+/*
+ * A parent-level insertion has been replayed; if it is the downlink for a
+ * split we are tracking, drop that split from incomplete_splits.
+ *
+ * The split is identified by re-reading the just-inserted item at
+ * insertblk/offnum and matching the child block it points to against the
+ * recorded right-half block number.
+ */
+static void
+forget_matching_split(Relation reln, RelFileNode node,
+ BlockNumber insertblk, OffsetNumber offnum,
+ bool is_root)
+{
+ Buffer buffer;
+ Page page;
+ BTItem btitem;
+ BlockNumber rightblk;
+ List *l;
+
+ /* Get downlink TID from page */
+ buffer = XLogReadBuffer(false, reln, insertblk);
+ if (!BufferIsValid(buffer))
+ elog(PANIC, "forget_matching_split: block unfound");
+ page = (Page) BufferGetPage(buffer);
+ btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
+ rightblk = ItemPointerGetBlockNumber(&(btitem->bti_itup.t_tid));
+ /* a downlink's offset is always P_HIKEY by convention */
+ Assert(ItemPointerGetOffsetNumber(&(btitem->bti_itup.t_tid)) == P_HIKEY);
+ UnlockAndReleaseBuffer(buffer);
+
+ foreach(l, incomplete_splits)
+ {
+ bt_incomplete_split *split = (bt_incomplete_split *) lfirst(l);
+
+ if (RelFileNodeEquals(node, split->node) &&
+ rightblk == split->rightblk)
+ {
+ if (is_root != split->is_root)
+ elog(LOG, "forget_matching_split: fishy is_root data");
+ incomplete_splits = lremove(split, incomplete_splits);
+ break; /* need not look further */
+ }
+ }
+}
+
+/*
+ * Reconstruct a page's items from the flat item data of a WAL record.
+ *
+ * Each item is inserted at FirstOffsetNumber, which implies the input
+ * stream is in reverse page order — NOTE(review): confirm against the
+ * WAL-writing side.
+ */
+static void
+_bt_restore_page(Page page, char *from, int len)
+{
+ BTItemData btdata;
+ Size itemsz;
+ char *end = from + len;
+
+ for (; from < end;)
+ {
+ /* copy header to a local to avoid unaligned access to the stream */
+ memcpy(&btdata, from, sizeof(BTItemData));
+ itemsz = IndexTupleDSize(btdata.bti_itup) +
+ (sizeof(BTItemData) - sizeof(IndexTupleData));
+ itemsz = MAXALIGN(itemsz);
+ if (PageAddItem(page, (Item) from, itemsz,
+ FirstOffsetNumber, LP_USED) == InvalidOffsetNumber)
+ elog(PANIC, "_bt_restore_page: can't add item to page");
+ from += itemsz;
+ }
+}
+
+/*
+ * Rewrite the metapage (block BTREE_METAPAGE) from scratch with the given
+ * root/fastroot locations and levels, stamp it with lsn, and write it out.
+ */
+static void
+_bt_restore_meta(Relation reln, XLogRecPtr lsn,
+ BlockNumber root, uint32 level,
+ BlockNumber fastroot, uint32 fastlevel)
+{
+ Buffer metabuf;
+ Page metapg;
+ BTMetaPageData *md;
+ BTPageOpaque pageop;
+
+ metabuf = XLogReadBuffer(true, reln, BTREE_METAPAGE);
+ if (!BufferIsValid(metabuf))
+ elog(PANIC, "_bt_restore_meta: no metapage");
+
+ metapg = BufferGetPage(metabuf);
+ _bt_pageinit(metapg, BufferGetPageSize(metabuf));
+
+ md = BTPageGetMeta(metapg);
+ md->btm_magic = BTREE_MAGIC;
+ md->btm_version = BTREE_VERSION;
+ md->btm_root = root;
+ md->btm_level = level;
+ md->btm_fastroot = fastroot;
+ md->btm_fastlevel = fastlevel;
+
+ pageop = (BTPageOpaque) PageGetSpecialPointer(metapg);
+ pageop->btpo_flags = BTP_META;
+
+ PageSetLSN(metapg, lsn);
+ PageSetSUI(metapg, ThisStartUpID);
+ UnlockAndWriteBuffer(metabuf);
+}
+
+/*
+ * Replay (redo) or roll back (undo) a btree tuple insertion.
+ *
+ * isleaf: the insertion was into a leaf page
+ * ismeta: the record body also carries new metapage contents
+ *
+ * Undo is only partially implemented here: non-leaf pages are skipped,
+ * and a leaf-page undo PANICs as unimplemented.
+ */
+static void
+btree_xlog_insert(bool redo, bool isleaf, bool ismeta,
+ XLogRecPtr lsn, XLogRecord *record)
+{
+ xl_btree_insert *xlrec = (xl_btree_insert *) XLogRecGetData(record);
+ Relation reln;
+ Buffer buffer;
+ Page page;
+ BTPageOpaque pageop;
+ char *datapos;
+ int datalen;
+ xl_btree_metadata md;
+
+ /* metadata (if any) precedes the tuple data in the record body */
+ datapos = (char *) xlrec + SizeOfBtreeInsert;
+ datalen = record->xl_len - SizeOfBtreeInsert;
+ if (ismeta)
+ {
+ memcpy(&md, datapos, sizeof(xl_btree_metadata));
+ datapos += sizeof(xl_btree_metadata);
+ datalen -= sizeof(xl_btree_metadata);
+ }
+
+ if (redo && (record->xl_info & XLR_BKP_BLOCK_1) && !ismeta &&
+ incomplete_splits == NIL)
+ return; /* nothing to do */
+
+ reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->target.node);
+ if (!RelationIsValid(reln))
+ return;
+
+ if (!redo || !(record->xl_info & XLR_BKP_BLOCK_1))
+ {
+ buffer = XLogReadBuffer(false, reln,
+ ItemPointerGetBlockNumber(&(xlrec->target.tid)));
+ if (!BufferIsValid(buffer))
+ elog(PANIC, "btree_insert_%sdo: block unfound", (redo) ? "re" : "un");
+ page = (Page) BufferGetPage(buffer);
+ if (PageIsNew((PageHeader) page))
+ elog(PANIC, "btree_insert_%sdo: uninitialized page", (redo) ? "re" : "un");
+ pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ if (redo)
+ {
+ /* page already newer than this record? then skip the insert */
+ if (XLByteLE(lsn, PageGetLSN(page)))
+ {
+ UnlockAndReleaseBuffer(buffer);
+ }
+ else
+ {
+ if (PageAddItem(page, (Item) datapos, datalen,
+ ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
+ LP_USED) == InvalidOffsetNumber)
+ elog(PANIC, "btree_insert_redo: failed to add item");
+
+ PageSetLSN(page, lsn);
+ PageSetSUI(page, ThisStartUpID);
+ UnlockAndWriteBuffer(buffer);
+ }
+ }
+ else
+ {
+ if (XLByteLT(PageGetLSN(page), lsn))
+ elog(PANIC, "btree_insert_undo: bad page LSN");
+
+ if (!P_ISLEAF(pageop))
+ {
+ UnlockAndReleaseBuffer(buffer);
+ }
+ else
+ {
+ elog(PANIC, "btree_insert_undo: unimplemented");
+ }
+ }
+ }
+
+ if (redo) /* metapage changes not undoable */
+ {
+ if (ismeta)
+ _bt_restore_meta(reln, lsn,
+ md.root, md.level,
+ md.fastroot, md.fastlevel);
+ }
+
+ /* Forget any split this insertion completes */
+ if (redo && !isleaf && incomplete_splits != NIL)
+ {
+ forget_matching_split(reln, xlrec->target.node,
+ ItemPointerGetBlockNumber(&(xlrec->target.tid)),
+ ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
+ false);
+ }
+}
+
+/*
+ * Replay a page split: rebuild both halves from the record body, fix the
+ * left-link of the page to the right of the split, and register the split
+ * as incomplete until the parent-level downlink insertion is replayed.
+ *
+ * onleft: the new tuple went into the left half (target is the left page)
+ * isroot: the page that was split was the root
+ *
+ * Undo is unimplemented: it PANICs after LSN sanity checks.
+ */
+static void
+btree_xlog_split(bool redo, bool onleft, bool isroot,
+ XLogRecPtr lsn, XLogRecord *record)
+{
+ xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record);
+ Relation reln;
+ BlockNumber targetblk;
+ BlockNumber leftsib;
+ BlockNumber rightsib;
+ Buffer buffer;
+ Page page;
+ BTPageOpaque pageop;
+ char *op = (redo) ? "redo" : "undo";
+
+ reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->target.node);
+ if (!RelationIsValid(reln))
+ return;
+
+ targetblk = ItemPointerGetBlockNumber(&(xlrec->target.tid));
+ leftsib = (onleft) ? targetblk : xlrec->otherblk;
+ rightsib = (onleft) ? xlrec->otherblk : targetblk;
+
+ /* Left (original) sibling */
+ buffer = XLogReadBuffer(false, reln, leftsib);
+ if (!BufferIsValid(buffer))
+ elog(PANIC, "btree_split_%s: lost left sibling", op);
+
+ page = (Page) BufferGetPage(buffer);
+ if (redo)
+ _bt_pageinit(page, BufferGetPageSize(buffer));
+ else if (PageIsNew((PageHeader) page))
+ elog(PANIC, "btree_split_undo: uninitialized left sibling");
+ pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ if (redo)
+ {
+ pageop->btpo_prev = xlrec->leftblk;
+ pageop->btpo_next = rightsib;
+ pageop->btpo.level = xlrec->level;
+ pageop->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0;
+
+ /* left half's items occupy the first leftlen bytes of the body */
+ _bt_restore_page(page,
+ (char *) xlrec + SizeOfBtreeSplit,
+ xlrec->leftlen);
+
+ PageSetLSN(page, lsn);
+ PageSetSUI(page, ThisStartUpID);
+ UnlockAndWriteBuffer(buffer);
+ }
+ else
+ {
+ /* undo */
+ if (XLByteLT(PageGetLSN(page), lsn))
+ elog(PANIC, "btree_split_undo: bad left sibling LSN");
+ elog(PANIC, "btree_split_undo: unimplemented");
+ }
+
+ /* Right (new) sibling */
+ buffer = XLogReadBuffer((redo) ? true : false, reln, rightsib);
+ if (!BufferIsValid(buffer))
+ elog(PANIC, "btree_split_%s: lost right sibling", op);
+
+ page = (Page) BufferGetPage(buffer);
+ if (redo)
+ _bt_pageinit(page, BufferGetPageSize(buffer));
+ else if (PageIsNew((PageHeader) page))
+ elog(PANIC, "btree_split_undo: uninitialized right sibling");
+ pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ if (redo)
+ {
+ pageop->btpo_prev = leftsib;
+ pageop->btpo_next = xlrec->rightblk;
+ pageop->btpo.level = xlrec->level;
+ pageop->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0;
+
+ /* right half's items are the remainder of the record body */
+ _bt_restore_page(page,
+ (char *) xlrec + SizeOfBtreeSplit + xlrec->leftlen,
+ record->xl_len - SizeOfBtreeSplit - xlrec->leftlen);
+
+ PageSetLSN(page, lsn);
+ PageSetSUI(page, ThisStartUpID);
+ UnlockAndWriteBuffer(buffer);
+ }
+ else
+ {
+ /* undo */
+ if (XLByteLT(PageGetLSN(page), lsn))
+ elog(PANIC, "btree_split_undo: bad right sibling LSN");
+ elog(PANIC, "btree_split_undo: unimplemented");
+ }
+
+ /* Fix left-link of right (next) page */
+ if (redo && !(record->xl_info & XLR_BKP_BLOCK_1))
+ {
+ if (xlrec->rightblk != P_NONE)
+ {
+ buffer = XLogReadBuffer(false, reln, xlrec->rightblk);
+ if (!BufferIsValid(buffer))
+ elog(PANIC, "btree_split_redo: lost next right page");
+
+ page = (Page) BufferGetPage(buffer);
+ if (PageIsNew((PageHeader) page))
+ elog(PANIC, "btree_split_redo: uninitialized next right page");
+
+ if (XLByteLE(lsn, PageGetLSN(page)))
+ {
+ UnlockAndReleaseBuffer(buffer);
+ }
+ else
+ {
+ pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+ pageop->btpo_prev = rightsib;
+
+ PageSetLSN(page, lsn);
+ PageSetSUI(page, ThisStartUpID);
+ UnlockAndWriteBuffer(buffer);
+ }
+ }
+ }
+
+ /* Forget any split this insertion completes */
+ if (redo && xlrec->level > 0 && incomplete_splits != NIL)
+ {
+ forget_matching_split(reln, xlrec->target.node,
+ ItemPointerGetBlockNumber(&(xlrec->target.tid)),
+ ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
+ false);
+ }
+
+ /* The job ain't done till the parent link is inserted... */
+ log_incomplete_split(xlrec->target.node,
+ leftsib, rightsib, isroot);
+}
+
+/*
+ * Replay deletion of a single index tuple. Not undoable. Also a no-op
+ * when the record carries XLR_BKP_BLOCK_1 — presumably because the backup
+ * block image already restored the page; see xlog.c.
+ */
+static void
+btree_xlog_delete(bool redo, XLogRecPtr lsn, XLogRecord *record)
+{
+ xl_btree_delete *xlrec;
+ Relation reln;
+ Buffer buffer;
+ Page page;
+
+ if (!redo || (record->xl_info & XLR_BKP_BLOCK_1))
+ return;
+
+ xlrec = (xl_btree_delete *) XLogRecGetData(record);
+ reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->target.node);
+ if (!RelationIsValid(reln))
+ return;
+ buffer = XLogReadBuffer(false, reln,
+ ItemPointerGetBlockNumber(&(xlrec->target.tid)));
+ if (!BufferIsValid(buffer))
+ elog(PANIC, "btree_delete_redo: block unfound");
+ page = (Page) BufferGetPage(buffer);
+ if (PageIsNew((PageHeader) page))
+ elog(PANIC, "btree_delete_redo: uninitialized page");
+
+ /* skip if the page is already newer than this record */
+ if (XLByteLE(lsn, PageGetLSN(page)))
+ {
+ UnlockAndReleaseBuffer(buffer);
+ return;
+ }
+
+ PageIndexTupleDelete(page, ItemPointerGetOffsetNumber(&(xlrec->target.tid)));
+
+ PageSetLSN(page, lsn);
+ PageSetSUI(page, ThisStartUpID);
+ UnlockAndWriteBuffer(buffer);
+}
+
+/*
+ * Replay creation of a new root page: initialize the root block,
+ * restore its items if the record carries any (the root-split case),
+ * and rewrite the metapage to point at the new root. Not undoable.
+ */
+static void
+btree_xlog_newroot(bool redo, XLogRecPtr lsn, XLogRecord *record)
+{
+ xl_btree_newroot *xlrec = (xl_btree_newroot *) XLogRecGetData(record);
+ Relation reln;
+ Buffer buffer;
+ Page page;
+ BTPageOpaque pageop;
+
+ if (!redo)
+ return; /* not undoable */
+
+ reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->node);
+ if (!RelationIsValid(reln))
+ return;
+ buffer = XLogReadBuffer(true, reln, xlrec->rootblk);
+ if (!BufferIsValid(buffer))
+ elog(PANIC, "btree_newroot_redo: no root page");
+
+ page = (Page) BufferGetPage(buffer);
+ _bt_pageinit(page, BufferGetPageSize(buffer));
+ pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ pageop->btpo_flags = BTP_ROOT;
+ pageop->btpo_prev = pageop->btpo_next = P_NONE;
+ pageop->btpo.level = xlrec->level;
+ if (xlrec->level == 0)
+ pageop->btpo_flags |= BTP_LEAF;
+
+ /* extra payload beyond the fixed header is the root page's items */
+ if (record->xl_len > SizeOfBtreeNewroot)
+ _bt_restore_page(page,
+ (char *) xlrec + SizeOfBtreeNewroot,
+ record->xl_len - SizeOfBtreeNewroot);
+
+ PageSetLSN(page, lsn);
+ PageSetSUI(page, ThisStartUpID);
+ UnlockAndWriteBuffer(buffer);
+
+ _bt_restore_meta(reln, lsn,
+ xlrec->rootblk, xlrec->level,
+ xlrec->rootblk, xlrec->level);
+
+ /* Check to see if this satisfies any incomplete insertions */
+ if (record->xl_len > SizeOfBtreeNewroot &&
+ incomplete_splits != NIL)
+ {
+ forget_matching_split(reln, xlrec->node,
+ xlrec->rootblk,
+ P_FIRSTKEY,
+ true);
+ }
+}
+
+/*
+ * Replay an update of the metapage contents. Not undoable.
+ */
+static void
+btree_xlog_newmeta(bool redo, XLogRecPtr lsn, XLogRecord *record)
+{
+ xl_btree_newmeta *xlrec = (xl_btree_newmeta *) XLogRecGetData(record);
+ Relation reln;
+
+ if (!redo)
+ return; /* not undoable */
+
+ reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->node);
+ if (!RelationIsValid(reln))
+ return;
+
+ _bt_restore_meta(reln, lsn,
+ xlrec->meta.root, xlrec->meta.level,
+ xlrec->meta.fastroot, xlrec->meta.fastlevel);
+}
+
+/*
+ * Replay wholesale write of a page image (emitted by _bt_blwritepage
+ * during index build). The record body is the full BLCKSZ page.
+ * Not undoable.
+ */
+static void
+btree_xlog_newpage(bool redo, XLogRecPtr lsn, XLogRecord *record)
+{
+ xl_btree_newpage *xlrec = (xl_btree_newpage *) XLogRecGetData(record);
+ Relation reln;
+ Buffer buffer;
+ Page page;
+
+ if (!redo || (record->xl_info & XLR_BKP_BLOCK_1))
+ return;
+
+ reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->node);
+ if (!RelationIsValid(reln))
+ return;
+ buffer = XLogReadBuffer(true, reln, xlrec->blkno);
+ if (!BufferIsValid(buffer))
+ elog(PANIC, "btree_newpage_redo: block unfound");
+ page = (Page) BufferGetPage(buffer);
+
+ Assert(record->xl_len == SizeOfBtreeNewpage + BLCKSZ);
+ memcpy(page, (char *) xlrec + SizeOfBtreeNewpage, BLCKSZ);
+
+ PageSetLSN(page, lsn);
+ PageSetSUI(page, ThisStartUpID);
+ UnlockAndWriteBuffer(buffer);
+}
+
+
+/*
+ * rm_redo entry point for the btree resource manager: dispatch a WAL
+ * record to the appropriate replay routine.
+ */
+void
+btree_redo(XLogRecPtr lsn, XLogRecord *record)
+{
+ uint8 info = record->xl_info & ~XLR_INFO_MASK;
+
+ switch (info)
+ {
+ case XLOG_BTREE_INSERT_LEAF:
+ btree_xlog_insert(true, true, false, lsn, record);
+ break;
+ case XLOG_BTREE_INSERT_UPPER:
+ btree_xlog_insert(true, false, false, lsn, record);
+ break;
+ case XLOG_BTREE_INSERT_META:
+ btree_xlog_insert(true, false, true, lsn, record);
+ break;
+ case XLOG_BTREE_SPLIT_L:
+ btree_xlog_split(true, true, false, lsn, record);
+ break;
+ case XLOG_BTREE_SPLIT_R:
+ btree_xlog_split(true, false, false, lsn, record);
+ break;
+ case XLOG_BTREE_SPLIT_L_ROOT:
+ btree_xlog_split(true, true, true, lsn, record);
+ break;
+ case XLOG_BTREE_SPLIT_R_ROOT:
+ btree_xlog_split(true, false, true, lsn, record);
+ break;
+ case XLOG_BTREE_DELETE:
+ btree_xlog_delete(true, lsn, record);
+ break;
+ case XLOG_BTREE_DELETE_PAGE:
+ case XLOG_BTREE_DELETE_PAGE_META:
+ /* TODO: replay of page deletion not implemented yet */
+ break;
+ case XLOG_BTREE_NEWROOT:
+ btree_xlog_newroot(true, lsn, record);
+ break;
+ case XLOG_BTREE_NEWMETA:
+ btree_xlog_newmeta(true, lsn, record);
+ break;
+ case XLOG_BTREE_NEWPAGE:
+ btree_xlog_newpage(true, lsn, record);
+ break;
+ default:
+ elog(PANIC, "btree_redo: unknown op code %u", info);
+ }
+}
+
+/*
+ * rm_undo entry point for the btree resource manager. Most record types
+ * are not undoable (the handlers return or PANIC accordingly).
+ */
+void
+btree_undo(XLogRecPtr lsn, XLogRecord *record)
+{
+ uint8 info = record->xl_info & ~XLR_INFO_MASK;
+
+ switch (info)
+ {
+ case XLOG_BTREE_INSERT_LEAF:
+ btree_xlog_insert(false, true, false, lsn, record);
+ break;
+ case XLOG_BTREE_INSERT_UPPER:
+ btree_xlog_insert(false, false, false, lsn, record);
+ break;
+ case XLOG_BTREE_INSERT_META:
+ btree_xlog_insert(false, false, true, lsn, record);
+ break;
+ case XLOG_BTREE_SPLIT_L:
+ btree_xlog_split(false, true, false, lsn, record);
+ break;
+ case XLOG_BTREE_SPLIT_R:
+ btree_xlog_split(false, false, false, lsn, record);
+ break;
+ case XLOG_BTREE_SPLIT_L_ROOT:
+ btree_xlog_split(false, true, true, lsn, record);
+ break;
+ case XLOG_BTREE_SPLIT_R_ROOT:
+ btree_xlog_split(false, false, true, lsn, record);
+ break;
+ case XLOG_BTREE_DELETE:
+ btree_xlog_delete(false, lsn, record);
+ break;
+ case XLOG_BTREE_DELETE_PAGE:
+ case XLOG_BTREE_DELETE_PAGE_META:
+ /* TODO: undo of page deletion not implemented yet */
+ break;
+ case XLOG_BTREE_NEWROOT:
+ btree_xlog_newroot(false, lsn, record);
+ break;
+ case XLOG_BTREE_NEWMETA:
+ btree_xlog_newmeta(false, lsn, record);
+ break;
+ case XLOG_BTREE_NEWPAGE:
+ btree_xlog_newpage(false, lsn, record);
+ break;
+ default:
+ elog(PANIC, "btree_undo: unknown op code %u", info);
+ }
+}
+
+/*
+ * Append a description of a record's target index and TID to buf
+ * (helper for btree_desc).
+ */
+static void
+out_target(char *buf, xl_btreetid *target)
+{
+ sprintf(buf + strlen(buf), "node %u/%u; tid %u/%u",
+ target->node.tblNode, target->node.relNode,
+ ItemPointerGetBlockNumber(&(target->tid)),
+ ItemPointerGetOffsetNumber(&(target->tid)));
+}
+
+/*
+ * rm_desc entry point: append a human-readable description of a btree
+ * WAL record to buf.
+ */
+void
+btree_desc(char *buf, uint8 xl_info, char *rec)
+{
+ uint8 info = xl_info & ~XLR_INFO_MASK;
+
+ switch (info)
+ {
+ case XLOG_BTREE_INSERT_LEAF:
+ {
+ xl_btree_insert *xlrec = (xl_btree_insert *) rec;
+
+ strcat(buf, "insert: ");
+ out_target(buf, &(xlrec->target));
+ break;
+ }
+ case XLOG_BTREE_INSERT_UPPER:
+ {
+ xl_btree_insert *xlrec = (xl_btree_insert *) rec;
+
+ strcat(buf, "insert_upper: ");
+ out_target(buf, &(xlrec->target));
+ break;
+ }
+ case XLOG_BTREE_INSERT_META:
+ {
+ xl_btree_insert *xlrec = (xl_btree_insert *) rec;
+
+ strcat(buf, "insert_meta: ");
+ out_target(buf, &(xlrec->target));
+ break;
+ }
+ case XLOG_BTREE_SPLIT_L:
+ {
+ xl_btree_split *xlrec = (xl_btree_split *) rec;
+
+ strcat(buf, "split_l: ");
+ out_target(buf, &(xlrec->target));
+ sprintf(buf + strlen(buf), "; oth %u; rgh %u",
+ xlrec->otherblk, xlrec->rightblk);
+ break;
+ }
+ case XLOG_BTREE_SPLIT_R:
+ {
+ xl_btree_split *xlrec = (xl_btree_split *) rec;
+
+ strcat(buf, "split_r: ");
+ out_target(buf, &(xlrec->target));
+ sprintf(buf + strlen(buf), "; oth %u; rgh %u",
+ xlrec->otherblk, xlrec->rightblk);
+ break;
+ }
+ case XLOG_BTREE_SPLIT_L_ROOT:
+ {
+ xl_btree_split *xlrec = (xl_btree_split *) rec;
+
+ strcat(buf, "split_l_root: ");
+ out_target(buf, &(xlrec->target));
+ sprintf(buf + strlen(buf), "; oth %u; rgh %u",
+ xlrec->otherblk, xlrec->rightblk);
+ break;
+ }
+ case XLOG_BTREE_SPLIT_R_ROOT:
+ {
+ xl_btree_split *xlrec = (xl_btree_split *) rec;
+
+ strcat(buf, "split_r_root: ");
+ out_target(buf, &(xlrec->target));
+ sprintf(buf + strlen(buf), "; oth %u; rgh %u",
+ xlrec->otherblk, xlrec->rightblk);
+ break;
+ }
+ case XLOG_BTREE_DELETE:
+ {
+ xl_btree_delete *xlrec = (xl_btree_delete *) rec;
+
+ strcat(buf, "delete: ");
+ out_target(buf, &(xlrec->target));
+ break;
+ }
+ case XLOG_BTREE_DELETE_PAGE:
+ case XLOG_BTREE_DELETE_PAGE_META:
+ {
+ xl_btree_delete_page *xlrec = (xl_btree_delete_page *) rec;
+
+ strcat(buf, "delete_page: ");
+ out_target(buf, &(xlrec->target));
+ sprintf(buf + strlen(buf), "; dead %u; left %u; right %u",
+ xlrec->deadblk, xlrec->leftblk, xlrec->rightblk);
+ break;
+ }
+ case XLOG_BTREE_NEWROOT:
+ {
+ xl_btree_newroot *xlrec = (xl_btree_newroot *) rec;
+
+ sprintf(buf + strlen(buf), "newroot: node %u/%u; root %u lev %u",
+ xlrec->node.tblNode, xlrec->node.relNode,
+ xlrec->rootblk, xlrec->level);
+ break;
+ }
+ case XLOG_BTREE_NEWMETA:
+ {
+ xl_btree_newmeta *xlrec = (xl_btree_newmeta *) rec;
+
+ sprintf(buf + strlen(buf), "newmeta: node %u/%u; root %u lev %u fast %u lev %u",
+ xlrec->node.tblNode, xlrec->node.relNode,
+ xlrec->meta.root, xlrec->meta.level,
+ xlrec->meta.fastroot, xlrec->meta.fastlevel);
+ break;
+ }
+ case XLOG_BTREE_NEWPAGE:
+ {
+ xl_btree_newpage *xlrec = (xl_btree_newpage *) rec;
+
+ sprintf(buf + strlen(buf), "newpage: node %u/%u; page %u",
+ xlrec->node.tblNode, xlrec->node.relNode,
+ xlrec->blkno);
+ break;
+ }
+ default:
+ strcat(buf, "UNKNOWN");
+ break;
+ }
+}
+
+/*
+ * rm_startup callback: reset the incomplete-splits list before replay.
+ */
+void
+btree_xlog_startup(void)
+{
+ incomplete_splits = NIL;
+}
+
+/*
+ * rm_cleanup callback: finish any page splits whose parent-level downlink
+ * insertion never appeared in the WAL stream, by performing the insertion
+ * now via _bt_insert_parent().
+ */
+void
+btree_xlog_cleanup(void)
+{
+ List *l;
+
+ foreach(l, incomplete_splits)
+ {
+ bt_incomplete_split *split = (bt_incomplete_split *) lfirst(l);
+ Relation reln;
+ Buffer lbuf,
+ rbuf;
+ Page lpage,
+ rpage;
+ BTPageOpaque lpageop,
+ rpageop;
+ bool is_only;
+
+ reln = XLogOpenRelation(true, RM_BTREE_ID, split->node);
+ if (!RelationIsValid(reln))
+ continue;
+ lbuf = XLogReadBuffer(false, reln, split->leftblk);
+ if (!BufferIsValid(lbuf))
+ elog(PANIC, "btree_xlog_cleanup: left block unfound")
+ lpage = (Page) BufferGetPage(lbuf);
+ lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
+ rbuf = XLogReadBuffer(false, reln, split->rightblk);
+ if (!BufferIsValid(rbuf))
+ elog(PANIC, "btree_xlog_cleanup: right block unfound");
+ rpage = (Page) BufferGetPage(rbuf);
+ rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
+
+ /* if the halves are the only pages on their level, it's an only-page split */
+ is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop);
+
+ _bt_insert_parent(reln, lbuf, rbuf, (BTStack) NULL,
+ split->is_root, is_only);
+ }
+ incomplete_splits = NIL;
+}
*
* Resource managers definition
*
- * $Header: /cvsroot/pgsql/src/backend/access/transam/rmgr.c,v 1.9 2001/08/25 18:52:41 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/transam/rmgr.c,v 1.10 2003/02/21 00:06:22 tgl Exp $
*/
#include "postgres.h"
#include "commands/sequence.h"
-RmgrData RmgrTable[] = {
- {"XLOG", xlog_redo, xlog_undo, xlog_desc},
- {"Transaction", xact_redo, xact_undo, xact_desc},
- {"Storage", smgr_redo, smgr_undo, smgr_desc},
- {"CLOG", clog_redo, clog_undo, clog_desc},
- {"Reserved 4", NULL, NULL, NULL},
- {"Reserved 5", NULL, NULL, NULL},
- {"Reserved 6", NULL, NULL, NULL},
- {"Reserved 7", NULL, NULL, NULL},
- {"Reserved 8", NULL, NULL, NULL},
- {"Reserved 9", NULL, NULL, NULL},
- {"Heap", heap_redo, heap_undo, heap_desc},
- {"Btree", btree_redo, btree_undo, btree_desc},
- {"Hash", hash_redo, hash_undo, hash_desc},
- {"Rtree", rtree_redo, rtree_undo, rtree_desc},
- {"Gist", gist_redo, gist_undo, gist_desc},
- {"Sequence", seq_redo, seq_undo, seq_desc}
+RmgrData RmgrTable[RM_MAX_ID+1] = {
+ /* entries: name, rm_redo, rm_undo, rm_desc, rm_startup, rm_cleanup */
+ {"XLOG", xlog_redo, xlog_undo, xlog_desc, NULL, NULL},
+ {"Transaction", xact_redo, xact_undo, xact_desc, NULL, NULL},
+ {"Storage", smgr_redo, smgr_undo, smgr_desc, NULL, NULL},
+ {"CLOG", clog_redo, clog_undo, clog_desc, NULL, NULL},
+ {"Reserved 4", NULL, NULL, NULL, NULL, NULL},
+ {"Reserved 5", NULL, NULL, NULL, NULL, NULL},
+ {"Reserved 6", NULL, NULL, NULL, NULL, NULL},
+ {"Reserved 7", NULL, NULL, NULL, NULL, NULL},
+ {"Reserved 8", NULL, NULL, NULL, NULL, NULL},
+ {"Reserved 9", NULL, NULL, NULL, NULL, NULL},
+ {"Heap", heap_redo, heap_undo, heap_desc, NULL, NULL},
+ {"Btree", btree_redo, btree_undo, btree_desc,
+ btree_xlog_startup, btree_xlog_cleanup},
+ {"Hash", hash_redo, hash_undo, hash_desc, NULL, NULL},
+ {"Rtree", rtree_redo, rtree_undo, rtree_desc, NULL, NULL},
+ {"Gist", gist_redo, gist_undo, gist_desc, NULL, NULL},
+ {"Sequence", seq_redo, seq_undo, seq_desc, NULL, NULL}
};
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $Header: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v 1.111 2003/01/25 03:06:04 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v 1.112 2003/02/21 00:06:22 tgl Exp $
*
*-------------------------------------------------------------------------
*/
XLogRecPtr WriteRqstPtr;
XLogwrtRqst WriteRqst;
- if (XLOG_DEBUG)
- {
- elog(LOG, "XLogFlush%s%s: request %X/%X; write %X/%X; flush %X/%X",
- (IsBootstrapProcessingMode()) ? "(bootstrap)" : "",
- (InRedo) ? "(redo)" : "",
- record.xlogid, record.xrecoff,
- LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
- LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
- }
-
/* Disabled during REDO */
if (InRedo)
return;
if (XLByteLE(record, LogwrtResult.Flush))
return;
+ if (XLOG_DEBUG)
+ {
+ elog(LOG, "XLogFlush%s: request %X/%X; write %X/%X; flush %X/%X",
+ (IsBootstrapProcessingMode()) ? "(bootstrap)" : "",
+ record.xlogid, record.xrecoff,
+ LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
+ LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
+ }
+
START_CRIT_SECTION();
/*
elog(LOG, "database system was interrupted at %s",
str_time(ControlFile->time));
+ /* This is just to allow attaching to startup process with a debugger */
+#ifdef XLOG_REPLAY_DELAY
+ if (XLOG_DEBUG && ControlFile->state != DB_SHUTDOWNED)
+ sleep(60);
+#endif
+
/*
* Get the last valid checkpoint record. If the latest one according
* to pg_control is broken, try the next-to-last one.
/* REDO */
if (InRecovery)
{
+ int rmid;
+
elog(LOG, "database system was not properly shut down; "
"automatic recovery in progress");
ControlFile->state = DB_IN_RECOVERY;
ControlFile->time = time(NULL);
UpdateControlFile();
+ /* Start up the recovery environment */
XLogInitRelationCache();
+ for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
+ {
+ if (RmgrTable[rmid].rm_startup != NULL)
+ RmgrTable[rmid].rm_startup();
+ }
+
/* Is REDO required ? */
if (XLByteLT(checkPoint.redo, RecPtr))
record = ReadRecord(&(checkPoint.redo), PANIC, buffer);
if (InRecovery)
{
+ int rmid;
+
+ /*
+ * Allow resource managers to do any required cleanup.
+ */
+ for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
+ {
+ if (RmgrTable[rmid].rm_cleanup != NULL)
+ RmgrTable[rmid].rm_cleanup();
+ }
+
+ /* suppress in-transaction check in CreateCheckPoint */
+ MyLastRecPtr.xrecoff = 0;
+ MyXactMadeXLogEntry = false;
+ MyXactMadeTempRelUpdate = false;
+
/*
+ * Perform a new checkpoint to update our recovery activity to disk.
+ *
* In case we had to use the secondary checkpoint, make sure that
* it will still be shown as the secondary checkpoint after this
* CreateCheckPoint operation; we don't want the broken primary
*/
ControlFile->checkPoint = checkPointLoc;
CreateCheckPoint(true, true);
+
+ /*
+ * Close down recovery environment
+ */
XLogCloseRelationCache();
}
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $Id: nbtree.h,v 1.63 2002/07/02 05:48:44 momjian Exp $
+ * $Id: nbtree.h,v 1.64 2003/02/21 00:06:22 tgl Exp $
*
*-------------------------------------------------------------------------
*/
/*
* BTPageOpaqueData -- At the end of every page, we store a pointer
* to both siblings in the tree. This is used to do forward/backward
- * index scans. See Lehman and Yao's paper for more
- * info. In addition, we need to know what type of page this is
- * (leaf or internal), and whether the page is available for reuse.
+ * index scans. The next-page link is also critical for recovery when
+ * a search has navigated to the wrong page due to concurrent page splits
+ * or deletions; see src/backend/access/nbtree/README for more info.
*
- * We also store a back-link to the parent page, but this cannot be trusted
- * very far since it does not get updated when the parent is split.
- * See backend/access/nbtree/README for details.
+ * In addition, we store the page's btree level (counting upwards from
+ * zero at a leaf page) as well as some flag bits indicating the page type
+ * and status. If the page is deleted, we replace the level with the
+ * next-transaction-ID value indicating when it is safe to reclaim the page.
+ *
+ * NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested
+ * instead.
*/
typedef struct BTPageOpaqueData
{
- BlockNumber btpo_prev; /* used for backward index scans */
- BlockNumber btpo_next; /* used for forward index scans */
- BlockNumber btpo_parent; /* pointer to parent, but not updated on
- * parent split */
- uint16 btpo_flags; /* LEAF?, ROOT?, FREE?, META?, REORDER? */
-
+ BlockNumber btpo_prev; /* left sibling, or P_NONE if leftmost */
+ BlockNumber btpo_next; /* right sibling, or P_NONE if rightmost */
+ union
+ {
+ uint32 level; /* tree level --- zero for leaf pages */
+ TransactionId xact; /* next transaction ID, if deleted */
+ } btpo; /* which member is valid depends on BTP_DELETED */
+ uint16 btpo_flags; /* flag bits, see below */
} BTPageOpaqueData;
typedef BTPageOpaqueData *BTPageOpaque;
/* Bits defined in btpo_flags */
-#define BTP_LEAF (1 << 0) /* leaf page, if not internal page */
+#define BTP_LEAF (1 << 0) /* leaf page, i.e. not internal page */
#define BTP_ROOT (1 << 1) /* root page (has no parent) */
-#define BTP_FREE (1 << 2) /* page not in use */
+#define BTP_DELETED (1 << 2) /* page has been deleted from tree */
#define BTP_META (1 << 3) /* meta-page */
-#define BTP_REORDER (1 << 4) /* items need reordering */
/*
* The Meta page is always the first page in the btree index.
* Its primary purpose is to point to the location of the btree root page.
+ * We also point to the "fast" root, which is the current effective root;
+ * see README for discussion.
*/
typedef struct BTMetaPageData
{
- uint32 btm_magic;
- uint32 btm_version;
- BlockNumber btm_root;
- int32 btm_level;
+ uint32 btm_magic; /* should contain BTREE_MAGIC */
+ uint32 btm_version; /* should contain BTREE_VERSION */
+ BlockNumber btm_root; /* current root location */
+ uint32 btm_level; /* tree level of the root page */
+ BlockNumber btm_fastroot; /* current "fast" root location */
+ uint32 btm_fastlevel; /* tree level of the "fast" root page */
} BTMetaPageData;
#define BTPageGetMeta(p) \
#define BTREE_METAPAGE 0 /* first page is meta */
#define BTREE_MAGIC 0x053162 /* magic number of btree pages */
-
-#define BTreeInvalidParent(opaque) \
- (opaque->btpo_parent == InvalidBlockNumber || \
- opaque->btpo_parent == BTREE_METAPAGE)
-
-#define BTREE_VERSION 1
+#define BTREE_VERSION 2 /* current version number */
/*
* We actually need to be able to fit three items on every page,
((PageGetPageSize(page) - \
sizeof(PageHeaderData) - \
MAXALIGN(sizeof(BTPageOpaqueData))) / 3 - sizeof(ItemIdData))
-/*
- * BTScanOpaqueData is used to remember which buffers we're currently
- * examining in the scan. We keep these buffers pinned (but not locked,
- * see nbtree.c) and recorded in the opaque entry of the scan to avoid
- * doing a ReadBuffer() for every tuple in the index.
- *
- * And it's used to remember actual scankey info (we need it
- * if some scankeys evaled at runtime).
- *
- * curHeapIptr & mrkHeapIptr are heap iptr-s from current/marked
- * index tuples: we don't adjust scans on insertions (and, if LLL
- * is ON, don't hold locks on index pages between passes) - we
- * use these pointers to restore index scan positions...
- * - vadim 07/29/98
- */
-
-typedef struct BTScanOpaqueData
-{
- Buffer btso_curbuf;
- Buffer btso_mrkbuf;
- ItemPointerData curHeapIptr;
- ItemPointerData mrkHeapIptr;
- /* these fields are set by _bt_orderkeys(), which see for more info: */
- bool qual_ok; /* false if qual can never be satisfied */
- int numberOfKeys; /* number of scan keys */
- int numberOfRequiredKeys; /* number of keys that must be
- * matched to continue the scan */
- ScanKey keyData; /* array of scan keys */
-} BTScanOpaqueData;
-
-typedef BTScanOpaqueData *BTScanOpaque;
/*
* BTItems are what we store in the btree. Each item is an index tuple,
typedef BTItemData *BTItem;
/*
- * For XLOG: size without alignement. Sizeof works as long as
+ * For XLOG: size without alignment. Sizeof works as long as
* IndexTupleData has exactly 8 bytes.
*/
#define SizeOfBTItem sizeof(BTItemData)
(i1)->bti_itup.t_tid.ip_posid == \
(i2)->bti_itup.t_tid.ip_posid )
-/*
- * BTStackData -- As we descend a tree, we push the (key, pointer)
- * pairs from internal nodes onto a private stack. If we split a
- * leaf, we use this stack to walk back up the tree and insert data
- * into parent nodes (and possibly to split them, too). Lehman and
- * Yao's update algorithm guarantees that under no circumstances can
- * our private stack give us an irredeemably bad picture up the tree.
- * Again, see the paper for details.
- */
-
-typedef struct BTStackData
-{
- BlockNumber bts_blkno;
- OffsetNumber bts_offset;
- BTItemData bts_btitem;
- struct BTStackData *bts_parent;
-} BTStackData;
-
-typedef BTStackData *BTStack;
-
-/*
- * We need to be able to tell the difference between read and write
- * requests for pages, in order to do locking correctly.
- */
-
-#define BT_READ BUFFER_LOCK_SHARE
-#define BT_WRITE BUFFER_LOCK_EXCLUSIVE
-
/*
* In general, the btree code tries to localize its knowledge about
* page layout to a couple of routines. However, we need a special
#define P_RIGHTMOST(opaque) ((opaque)->btpo_next == P_NONE)
#define P_ISLEAF(opaque) ((opaque)->btpo_flags & BTP_LEAF)
#define P_ISROOT(opaque) ((opaque)->btpo_flags & BTP_ROOT)
+#define P_ISDELETED(opaque) ((opaque)->btpo_flags & BTP_DELETED)
/*
* Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
#define P_FIRSTDATAKEY(opaque) (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
/*
+ * XLOG records for btree operations
+ *
* XLOG allows to store some information in high 4 bits of log
* record xl_info field
*/
-#define XLOG_BTREE_DELETE 0x00 /* delete btitem */
-#define XLOG_BTREE_INSERT 0x10 /* add btitem without split */
-#define XLOG_BTREE_SPLIT 0x20 /* add btitem with split */
-#define XLOG_BTREE_SPLEFT 0x30 /* as above + flag that new btitem */
- /* goes to the left sibling */
-#define XLOG_BTREE_NEWROOT 0x40 /* new root page */
-
-#define XLOG_BTREE_LEAF 0x80 /* leaf/internal page was changed */
+#define XLOG_BTREE_INSERT_LEAF 0x00 /* add btitem without split */
+#define XLOG_BTREE_INSERT_UPPER 0x10 /* same, on a non-leaf page */
+#define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */
+#define XLOG_BTREE_SPLIT_L 0x30 /* add btitem with split */
+#define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */
+#define XLOG_BTREE_SPLIT_L_ROOT 0x50 /* add btitem with split of root */
+#define XLOG_BTREE_SPLIT_R_ROOT 0x60 /* as above, new item on right */
+#define XLOG_BTREE_DELETE 0x70 /* delete leaf btitem */
+#define XLOG_BTREE_DELETE_PAGE 0x80 /* delete an entire page */
+#define XLOG_BTREE_DELETE_PAGE_META 0x90 /* same, plus update metapage */
+#define XLOG_BTREE_NEWROOT 0xA0 /* new root page */
+#define XLOG_BTREE_NEWMETA 0xB0 /* update metadata page */
+#define XLOG_BTREE_NEWPAGE 0xC0 /* new index page during build */
/*
- * All what we need to find changed index tuple
+ * All that we need to find the changed index tuple
*/
typedef struct xl_btreetid
{
} xl_btreetid;
/*
- * This is what we need to know about delete
+ * All that we need to regenerate the meta-data page
*/
-typedef struct xl_btree_delete
+typedef struct xl_btree_metadata
{
- xl_btreetid target; /* deleted tuple id */
-} xl_btree_delete;
-
-#define SizeOfBtreeDelete (offsetof(xl_btreetid, tid) + SizeOfIptrData)
+ BlockNumber root;
+ uint32 level;
+ BlockNumber fastroot;
+ uint32 fastlevel;
+} xl_btree_metadata;
/*
- * This is what we need to know about pure (without split) insert
+ * This is what we need to know about a simple (without split) insert.
+ *
+ * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META.
+ * Note that INSERT_META implies it's not a leaf page.
*/
typedef struct xl_btree_insert
{
xl_btreetid target; /* inserted tuple id */
+ /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_INSERT_META */
/* BTITEM FOLLOWS AT END OF STRUCT */
} xl_btree_insert;
/*
* On insert with split we save items of both left and right siblings
- * and restore content of both pages from log record
+ * and restore content of both pages from log record. This way takes less
+ * xlog space than the normal approach, because if we did it the standard way,
+ * XLogInsert would almost always think the right page is new and store its
+ * whole page image.
+ *
+ * Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record.
+ * The _L and _R variants indicate whether the inserted btitem went into the
+ * left or right split page (and thus, whether otherblk is the right or left
+ * page of the split pair). The _ROOT variants indicate that we are splitting
+ * the root page, and thus that a newroot record rather than an insert or
+ * split record should follow. Note that a split record never carries a
+ * metapage update --- we'll do that in the parent-level update.
*/
typedef struct xl_btree_split
{
xl_btreetid target; /* inserted tuple id */
- BlockIdData otherblk; /* second block participated in split: */
+ BlockNumber otherblk; /* second block that participated in split: */
/* first one is stored in target' tid */
- BlockIdData parentblk; /* parent block */
- BlockIdData leftblk; /* prev left block */
- BlockIdData rightblk; /* next right block */
+ BlockNumber leftblk; /* prev/left block */
+ BlockNumber rightblk; /* next/right block */
+ uint32 level; /* tree level of page being split */
uint16 leftlen; /* len of left page items below */
- /* LEFT AND RIGHT PAGES ITEMS FOLLOW AT THE END */
+ /* LEFT AND RIGHT PAGES TUPLES FOLLOW AT THE END */
} xl_btree_split;
#define SizeOfBtreeSplit (offsetof(xl_btree_split, leftlen) + sizeof(uint16))
/*
- * New root log record.
+ * This is what we need to know about the deletion of an individual leaf btitem
+ */
+typedef struct xl_btree_delete
+{
+ xl_btreetid target; /* deleted tuple id */
+} xl_btree_delete;
+
+#define SizeOfBtreeDelete (offsetof(xl_btreetid, tid) + SizeOfIptrData)
+
+/*
+ * This is what we need to know about deletion of a btree page. The target
+ * identifies the tuple removed from the parent page (note that we remove
+ * this tuple's downlink and the *following* tuple's key). Note we do not
+ * store any content for the deleted page --- it is just rewritten as empty
+ * during recovery.
+ */
+typedef struct xl_btree_delete_page
+{
+ xl_btreetid target; /* deleted tuple id in parent page */
+ BlockNumber deadblk; /* child block being deleted */
+ BlockNumber leftblk; /* child block's left sibling, if any */
+ BlockNumber rightblk; /* child block's right sibling */
+ /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_DELETE_PAGE_META */
+} xl_btree_delete_page;
+
+#define SizeOfBtreeDeletePage (offsetof(xl_btree_delete_page, rightblk) + sizeof(BlockNumber))
+
+/*
+ * New root log record. There are zero btitems if this is to establish an
+ * empty root, or two if it is the result of splitting an old root.
+ *
+ * Note that although this implies rewriting the metadata page, we don't need
+ * an xl_btree_metadata record --- the rootblk and level are sufficient.
*/
typedef struct xl_btree_newroot
{
RelFileNode node;
- int32 level;
- BlockIdData rootblk;
+ BlockNumber rootblk; /* location of new root */
+ uint32 level; /* its tree level */
/* 0 or 2 BTITEMS FOLLOW AT END OF STRUCT */
} xl_btree_newroot;
-#define SizeOfBtreeNewroot (offsetof(xl_btree_newroot, rootblk) + sizeof(BlockIdData))
+#define SizeOfBtreeNewroot (offsetof(xl_btree_newroot, level) + sizeof(uint32))
+
+/*
+ * New metapage log record. This is not issued during routine operations;
+ * it's only used when initializing an empty index and at completion of
+ * index build.
+ */
+typedef struct xl_btree_newmeta
+{
+ RelFileNode node;
+ xl_btree_metadata meta;
+} xl_btree_newmeta;
+
+#define SizeOfBtreeNewmeta (sizeof(xl_btree_newmeta))
+
+/*
+ * New index page log record. This is only used while building a new index.
+ */
+typedef struct xl_btree_newpage
+{
+ RelFileNode node;
+ BlockNumber blkno; /* location of new page */
+ /* entire page contents follow at end of record */
+} xl_btree_newpage;
+
+#define SizeOfBtreeNewpage (offsetof(xl_btree_newpage, blkno) + sizeof(BlockNumber))
+
/*
* Operator strategy numbers -- ordering of these is <, <=, =, >=, >
#define BTORDER_PROC 1
+/*
+ * We need to be able to tell the difference between read and write
+ * requests for pages, in order to do locking correctly.
+ */
+
+#define BT_READ BUFFER_LOCK_SHARE
+#define BT_WRITE BUFFER_LOCK_EXCLUSIVE
+
+/*
+ * BTStackData -- As we descend a tree, we push the (location, downlink)
+ * pairs from internal pages onto a private stack. If we split a
+ * leaf, we use this stack to walk back up the tree and insert data
+ * into parent pages (and possibly to split them, too). Lehman and
+ * Yao's update algorithm guarantees that under no circumstances can
+ * our private stack give us an irredeemably bad picture up the tree.
+ * Again, see the paper for details.
+ */
+
+typedef struct BTStackData
+{
+ BlockNumber bts_blkno;
+ OffsetNumber bts_offset;
+ BTItemData bts_btitem;
+ struct BTStackData *bts_parent;
+} BTStackData;
+
+typedef BTStackData *BTStack;
+
+/*
+ * BTScanOpaqueData is used to remember which buffers we're currently
+ * examining in the scan. We keep these buffers pinned (but not locked,
+ * see nbtree.c) and recorded in the opaque entry of the scan to avoid
+ * doing a ReadBuffer() for every tuple in the index.
+ *
+ * And it's used to remember actual scankey info (we need it
+ * if some scankeys evaled at runtime).
+ *
+ * curHeapIptr & mrkHeapIptr are heap iptr-s from current/marked
+ * index tuples: we don't adjust scans on insertions (and, if LLL
+ * is ON, don't hold locks on index pages between passes) - we
+ * use these pointers to restore index scan positions...
+ * - vadim 07/29/98
+ */
+
+typedef struct BTScanOpaqueData
+{
+ Buffer btso_curbuf;
+ Buffer btso_mrkbuf;
+ ItemPointerData curHeapIptr;
+ ItemPointerData mrkHeapIptr;
+ /* these fields are set by _bt_orderkeys(), which see for more info: */
+ bool qual_ok; /* false if qual can never be satisfied */
+ int numberOfKeys; /* number of scan keys */
+ int numberOfRequiredKeys; /* number of keys that must be
+ * matched to continue the scan */
+ ScanKey keyData; /* array of scan keys */
+} BTScanOpaqueData;
+
+typedef BTScanOpaqueData *BTScanOpaque;
+
/*
* prototypes for functions in nbtree.c (external entry points for btree)
*/
extern Datum btrestrpos(PG_FUNCTION_ARGS);
extern Datum btbulkdelete(PG_FUNCTION_ARGS);
-extern void btree_redo(XLogRecPtr lsn, XLogRecord *record);
-extern void btree_undo(XLogRecPtr lsn, XLogRecord *record);
-extern void btree_desc(char *buf, uint8 xl_info, char *rec);
-
/*
* prototypes for functions in nbtinsert.c
*/
extern InsertIndexResult _bt_doinsert(Relation rel, BTItem btitem,
bool index_is_unique, Relation heapRel);
+extern void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
+ BTStack stack, bool is_root, bool is_only);
/*
* prototypes for functions in nbtpage.c
*/
extern void _bt_metapinit(Relation rel);
extern Buffer _bt_getroot(Relation rel, int access);
+extern Buffer _bt_gettrueroot(Relation rel);
extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
extern void _bt_relbuf(Relation rel, Buffer buf);
extern void _bt_wrtbuf(Relation rel, Buffer buf);
extern void _bt_wrtnorelbuf(Relation rel, Buffer buf);
extern void _bt_pageinit(Page page, Size size);
-extern void _bt_metaproot(Relation rel, BlockNumber rootbknum, int level);
+extern void _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level);
extern void _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid);
/*
extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
extern bool _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);
+extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);
/*
* prototypes for functions in nbtstrat.c
extern void _bt_spool(BTItem btitem, BTSpool *btspool);
extern void _bt_leafbuild(BTSpool *btspool, BTSpool *spool2);
+/*
+ * prototypes for functions in nbtxlog.c
+ */
+extern void btree_redo(XLogRecPtr lsn, XLogRecord *record);
+extern void btree_undo(XLogRecPtr lsn, XLogRecord *record);
+extern void btree_desc(char *buf, uint8 xl_info, char *rec);
+extern void btree_xlog_startup(void);
+extern void btree_xlog_cleanup(void);
+
#endif /* NBTREE_H */
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $Id: xlog.h,v 1.40 2002/11/15 02:44:57 momjian Exp $
+ * $Id: xlog.h,v 1.41 2003/02/21 00:06:22 tgl Exp $
*/
#ifndef XLOG_H
#define XLOG_H
*/
typedef struct RmgrData
{
- char *rm_name;
+ const char *rm_name;
void (*rm_redo) (XLogRecPtr lsn, XLogRecord *rptr);
void (*rm_undo) (XLogRecPtr lsn, XLogRecord *rptr);
void (*rm_desc) (char *buf, uint8 xl_info, char *rec);
+ void (*rm_startup) (void);
+ void (*rm_cleanup) (void);
} RmgrData;
extern RmgrData RmgrTable[];
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $Id: catversion.h,v 1.177 2003/02/16 02:30:39 tgl Exp $
+ * $Id: catversion.h,v 1.178 2003/02/21 00:06:22 tgl Exp $
*
*-------------------------------------------------------------------------
*/
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 200302151
+#define CATALOG_VERSION_NO 200302171
#endif