granicus.if.org Git - postgresql/blob - src/include/access/nbtree.h

   1 /*-------------------------------------------------------------------------
   2  *
   3  * nbtree.h
   4  *        header file for postgres btree access method implementation.
   5  *
   6  *
   7  * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  * src/include/access/nbtree.h
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14 #ifndef NBTREE_H
  15 #define NBTREE_H
  16
  17 #include "access/genam.h"
  18 #include "access/itup.h"
  19 #include "access/sdir.h"
  20 #include "access/xlog.h"
  21 #include "access/xlogutils.h"
  22 #include "catalog/pg_index.h"
  23
  24 /* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */
  25 typedef uint16 BTCycleId;
  26
  27 /*
  28  *      BTPageOpaqueData -- At the end of every page, we store a pointer
  29  *      to both siblings in the tree.  This is used to do forward/backward
  30  *      index scans.  The next-page link is also critical for recovery when
  31  *      a search has navigated to the wrong page due to concurrent page splits
  32  *      or deletions; see src/backend/access/nbtree/README for more info.
  33  *
  34  *      In addition, we store the page's btree level (counting upwards from
  35  *      zero at a leaf page) as well as some flag bits indicating the page type
  36  *      and status.  If the page is deleted, we replace the level with the
  37  *      next-transaction-ID value indicating when it is safe to reclaim the page.
  38  *
  39  *      We also store a "vacuum cycle ID".      When a page is split while VACUUM is
  40  *      processing the index, a nonzero value associated with the VACUUM run is
  41  *      stored into both halves of the split page.      (If VACUUM is not running,
  42  *      both pages receive zero cycleids.)      This allows VACUUM to detect whether
  43  *      a page was split since it started, with a small probability of false match
  44  *      if the page was last split some exact multiple of MAX_BT_CYCLE_ID VACUUMs
  45  *      ago.  Also, during a split, the BTP_SPLIT_END flag is cleared in the left
  46  *      (original) page, and set in the right page, but only if the next page
  47  *      to its right has a different cycleid.
  48  *
  49  *      NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested
  50  *      instead.
  51  */
  52
  53 typedef struct BTPageOpaqueData
  54 {
  55         BlockNumber btpo_prev;          /* left sibling, or P_NONE if leftmost */
  56         BlockNumber btpo_next;          /* right sibling, or P_NONE if rightmost */
  57         union
  58         {
  59                 uint32          level;          /* tree level --- zero for leaf pages */
  60                 TransactionId xact;             /* next transaction ID, if deleted */
  61         }                       btpo;
  62         uint16          btpo_flags;             /* flag bits, see below */
  63         BTCycleId       btpo_cycleid;   /* vacuum cycle ID of latest split */
  64 } BTPageOpaqueData;
  65
  66 typedef BTPageOpaqueData *BTPageOpaque;
  67
  68 /* Bits defined in btpo_flags */
  69 #define BTP_LEAF                (1 << 0)        /* leaf page, i.e. not internal page */
  70 #define BTP_ROOT                (1 << 1)        /* root page (has no parent) */
  71 #define BTP_DELETED             (1 << 2)        /* page has been deleted from tree */
  72 #define BTP_META                (1 << 3)        /* meta-page */
  73 #define BTP_HALF_DEAD   (1 << 4)        /* empty, but still in tree */
  74 #define BTP_SPLIT_END   (1 << 5)        /* rightmost page of split group */
  75 #define BTP_HAS_GARBAGE (1 << 6)        /* page has LP_DEAD tuples */
  76
  77 /*
  78  * The max allowed value of a cycle ID is a bit less than 64K.  This is
  79  * for convenience of pg_filedump and similar utilities: we want to use
  80  * the last 2 bytes of special space as an index type indicator, and
  81  * restricting cycle ID lets btree use that space for vacuum cycle IDs
  82  * while still allowing index type to be identified.
  83  */
  84 #define MAX_BT_CYCLE_ID         0xFF7F
  85
  86
  87 /*
  88  * The Meta page is always the first page in the btree index.
  89  * Its primary purpose is to point to the location of the btree root page.
  90  * We also point to the "fast" root, which is the current effective root;
  91  * see README for discussion.
  92  */
  93
  94 typedef struct BTMetaPageData
  95 {
  96         uint32          btm_magic;              /* should contain BTREE_MAGIC */
  97         uint32          btm_version;    /* should contain BTREE_VERSION */
  98         BlockNumber btm_root;           /* current root location */
  99         uint32          btm_level;              /* tree level of the root page */
 100         BlockNumber btm_fastroot;       /* current "fast" root location */
 101         uint32          btm_fastlevel;  /* tree level of the "fast" root page */
 102 } BTMetaPageData;
 103
 104 #define BTPageGetMeta(p) \
 105         ((BTMetaPageData *) PageGetContents(p))
 106
 107 #define BTREE_METAPAGE  0               /* first page is meta */
 108 #define BTREE_MAGIC             0x053162        /* magic number of btree pages */
 109 #define BTREE_VERSION   2               /* current version number */
 110
 111 /*
 112  * Maximum size of a btree index entry, including its tuple header.
 113  *
 114  * We actually need to be able to fit three items on every page,
 115  * so restrict any one item to 1/3 the per-page available space.
 116  */
 117 #define BTMaxItemSize(page) \
 118         MAXALIGN_DOWN((PageGetPageSize(page) - \
 119                                    MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
 120                                    MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
 121
 122 /*
 123  * The leaf-page fillfactor defaults to 90% but is user-adjustable.
 124  * For pages above the leaf level, we use a fixed 70% fillfactor.
 125  * The fillfactor is applied during index build and when splitting
 126  * a rightmost page; when splitting non-rightmost pages we try to
 127  * divide the data equally.
 128  */
 129 #define BTREE_MIN_FILLFACTOR            10
 130 #define BTREE_DEFAULT_FILLFACTOR        90
 131 #define BTREE_NONLEAF_FILLFACTOR        70
 132
 133 /*
 134  *      Test whether two btree entries are "the same".
 135  *
 136  *      Old comments:
 137  *      In addition, we must guarantee that all tuples in the index are unique,
 138  *      in order to satisfy some assumptions in Lehman and Yao.  The way that we
 139  *      do this is by generating a new OID for every insertion that we do in the
 140  *      tree.  This adds eight bytes to the size of btree index tuples.  Note
 141  *      that we do not use the OID as part of a composite key; the OID only
 142  *      serves as a unique identifier for a given index tuple (logical position
 143  *      within a page).
 144  *
 145  *      New comments:
 146  *      actually, we must guarantee that all tuples in A LEVEL
 147  *      are unique, not in ALL INDEX. So, we can use the t_tid
 148  *      as unique identifier for a given index tuple (logical position
 149  *      within a level). - vadim 04/09/97
 150  */
 151 #define BTTidSame(i1, i2)       \
 152         ( (i1).ip_blkid.bi_hi == (i2).ip_blkid.bi_hi && \
 153           (i1).ip_blkid.bi_lo == (i2).ip_blkid.bi_lo && \
 154           (i1).ip_posid == (i2).ip_posid )
 155 #define BTEntrySame(i1, i2) \
 156         BTTidSame((i1)->t_tid, (i2)->t_tid)
 157
 158
 159 /*
 160  *      In general, the btree code tries to localize its knowledge about
 161  *      page layout to a couple of routines.  However, we need a special
 162  *      value to indicate "no page number" in those places where we expect
 163  *      page numbers.  We can use zero for this because we never need to
 164  *      make a pointer to the metadata page.
 165  */
 166
 167 #define P_NONE                  0
 168
 169 /*
 170  * Macros to test whether a page is leftmost or rightmost on its tree level,
 171  * as well as other state info kept in the opaque data.
 172  */
 173 #define P_LEFTMOST(opaque)              ((opaque)->btpo_prev == P_NONE)
 174 #define P_RIGHTMOST(opaque)             ((opaque)->btpo_next == P_NONE)
 175 #define P_ISLEAF(opaque)                ((opaque)->btpo_flags & BTP_LEAF)
 176 #define P_ISROOT(opaque)                ((opaque)->btpo_flags & BTP_ROOT)
 177 #define P_ISDELETED(opaque)             ((opaque)->btpo_flags & BTP_DELETED)
 178 #define P_ISHALFDEAD(opaque)    ((opaque)->btpo_flags & BTP_HALF_DEAD)
 179 #define P_IGNORE(opaque)                ((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD))
 180 #define P_HAS_GARBAGE(opaque)   ((opaque)->btpo_flags & BTP_HAS_GARBAGE)
 181
 182 /*
 183  *      Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
 184  *      page.  The high key is not a data key, but gives info about what range of
 185  *      keys is supposed to be on this page.  The high key on a page is required
 186  *      to be greater than or equal to any data key that appears on the page.
 187  *      If we find ourselves trying to insert a key > high key, we know we need
 188  *      to move right (this should only happen if the page was split since we
 189  *      examined the parent page).
 190  *
 191  *      Our insertion algorithm guarantees that we can use the initial least key
 192  *      on our right sibling as the high key.  Once a page is created, its high
 193  *      key changes only if the page is split.
 194  *
 195  *      On a non-rightmost page, the high key lives in item 1 and data items
 196  *      start in item 2.  Rightmost pages have no high key, so we store data
 197  *      items beginning in item 1.
 198  */
 199
 200 #define P_HIKEY                         ((OffsetNumber) 1)
 201 #define P_FIRSTKEY                      ((OffsetNumber) 2)
 202 #define P_FIRSTDATAKEY(opaque)  (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
 203
 204 /*
 205  * XLOG records for btree operations
 206  *
 207  * XLOG allows to store some information in high 4 bits of log
 208  * record xl_info field
 209  */
 210 #define XLOG_BTREE_INSERT_LEAF  0x00    /* add index tuple without split */
 211 #define XLOG_BTREE_INSERT_UPPER 0x10    /* same, on a non-leaf page */
 212 #define XLOG_BTREE_INSERT_META  0x20    /* same, plus update metapage */
 213 #define XLOG_BTREE_SPLIT_L              0x30    /* add index tuple with split */
 214 #define XLOG_BTREE_SPLIT_R              0x40    /* as above, new item on right */
 215 #define XLOG_BTREE_SPLIT_L_ROOT 0x50    /* add tuple with split of root */
 216 #define XLOG_BTREE_SPLIT_R_ROOT 0x60    /* as above, new item on right */
 217 #define XLOG_BTREE_DELETE               0x70    /* delete leaf index tuples for a page */
 218 #define XLOG_BTREE_DELETE_PAGE  0x80    /* delete an entire page */
 219 #define XLOG_BTREE_DELETE_PAGE_META 0x90                /* same, and update metapage */
 220 #define XLOG_BTREE_NEWROOT              0xA0    /* new root page */
 221 #define XLOG_BTREE_DELETE_PAGE_HALF 0xB0                /* page deletion that makes
 222                                                                                                  * parent half-dead */
 223 #define XLOG_BTREE_VACUUM               0xC0    /* delete entries on a page during
 224                                                                                  * vacuum */
 225 #define XLOG_BTREE_REUSE_PAGE   0xD0    /* old page is about to be reused from
 226                                                                                  * FSM */
 227
 228 /*
 229  * All that we need to find changed index tuple
 230  */
 231 typedef struct xl_btreetid
 232 {
 233         RelFileNode node;
 234         ItemPointerData tid;            /* changed tuple id */
 235 } xl_btreetid;
 236
 237 /*
 238  * All that we need to regenerate the meta-data page
 239  */
 240 typedef struct xl_btree_metadata
 241 {
 242         BlockNumber root;
 243         uint32          level;
 244         BlockNumber fastroot;
 245         uint32          fastlevel;
 246 } xl_btree_metadata;
 247
 248 /*
 249  * This is what we need to know about simple (without split) insert.
 250  *
 251  * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META.
 252  * Note that INSERT_META implies it's not a leaf page.
 253  */
 254 typedef struct xl_btree_insert
 255 {
 256         xl_btreetid target;                     /* inserted tuple id */
 257         /* BlockNumber downlink field FOLLOWS IF NOT XLOG_BTREE_INSERT_LEAF */
 258         /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_INSERT_META */
 259         /* INDEX TUPLE FOLLOWS AT END OF STRUCT */
 260 } xl_btree_insert;
 261
 262 #define SizeOfBtreeInsert       (offsetof(xl_btreetid, tid) + SizeOfIptrData)
 263
 264 /*
 265  * On insert with split, we save all the items going into the right sibling
 266  * so that we can restore it completely from the log record.  This way takes
 267  * less xlog space than the normal approach, because if we did it standardly,
 268  * XLogInsert would almost always think the right page is new and store its
 269  * whole page image.  The left page, however, is handled in the normal
 270  * incremental-update fashion.
 271  *
 272  * Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record.
 273  * The _L and _R variants indicate whether the inserted tuple went into the
 274  * left or right split page (and thus, whether newitemoff and the new item
 275  * are stored or not).  The _ROOT variants indicate that we are splitting
 276  * the root page, and thus that a newroot record rather than an insert or
 277  * split record should follow.  Note that a split record never carries a
 278  * metapage update --- we'll do that in the parent-level update.
 279  */
 280 typedef struct xl_btree_split
 281 {
 282         RelFileNode node;
 283         BlockNumber leftsib;            /* orig page / new left page */
 284         BlockNumber rightsib;           /* new right page */
 285         BlockNumber rnext;                      /* next block (orig page's rightlink) */
 286         uint32          level;                  /* tree level of page being split */
 287         OffsetNumber firstright;        /* first item moved to right page */
 288
 289         /*
 290          * If level > 0, BlockIdData downlink follows.  (We use BlockIdData rather
 291          * than BlockNumber for alignment reasons: SizeOfBtreeSplit is only 16-bit
 292          * aligned.)
 293          *
 294          * If level > 0, an IndexTuple representing the HIKEY of the left page
 295          * follows.  We don't need this on leaf pages, because it's the same as
 296          * the leftmost key in the new right page.      Also, it's suppressed if
 297          * XLogInsert chooses to store the left page's whole page image.
 298          *
 299          * In the _L variants, next are OffsetNumber newitemoff and the new item.
 300          * (In the _R variants, the new item is one of the right page's tuples.)
 301          * The new item, but not newitemoff, is suppressed if XLogInsert chooses
 302          * to store the left page's whole page image.
 303          *
 304          * Last are the right page's tuples in the form used by _bt_restore_page.
 305          */
 306 } xl_btree_split;
 307
 308 #define SizeOfBtreeSplit        (offsetof(xl_btree_split, firstright) + sizeof(OffsetNumber))
 309
 310 /*
 311  * This is what we need to know about delete of individual leaf index tuples.
 312  * The WAL record can represent deletion of any number of index tuples on a
 313  * single index page when *not* executed by VACUUM.
 314  */
 315 typedef struct xl_btree_delete
 316 {
 317         RelFileNode node;                       /* RelFileNode of the index */
 318         BlockNumber block;
 319         RelFileNode hnode;                      /* RelFileNode of the heap the index currently
 320                                                                  * points at */
 321         int                     nitems;
 322
 323         /* TARGET OFFSET NUMBERS FOLLOW AT THE END */
 324 } xl_btree_delete;
 325
 326 #define SizeOfBtreeDelete       (offsetof(xl_btree_delete, nitems) + sizeof(int))
 327
 328 /*
 329  * This is what we need to know about page reuse within btree.
 330  */
 331 typedef struct xl_btree_reuse_page
 332 {
 333         RelFileNode node;
 334         BlockNumber block;
 335         TransactionId latestRemovedXid;
 336 } xl_btree_reuse_page;
 337
 338 #define SizeOfBtreeReusePage    (sizeof(xl_btree_reuse_page))
 339
 340 /*
 341  * This is what we need to know about vacuum of individual leaf index tuples.
 342  * The WAL record can represent deletion of any number of index tuples on a
 343  * single index page when executed by VACUUM.
 344  *
 345  * The correctness requirement for applying these changes during recovery is
 346  * that we must do one of these two things for every block in the index:
 347  *              * lock the block for cleanup and apply any required changes
 348  *              * EnsureBlockUnpinned()
 349  * The purpose of this is to ensure that no index scans started before we
 350  * finish scanning the index are still running by the time we begin to remove
 351  * heap tuples.
 352  *
 353  * Any changes to any one block are registered on just one WAL record. All
 354  * blocks that we need to run EnsureBlockUnpinned() are listed as a block range
 355  * starting from the last block vacuumed through until this one. Individual
 356  * block numbers aren't given.
 357  *
 358  * Note that the *last* WAL record in any vacuum of an index is allowed to
 359  * have a zero length array of offsets. Earlier records must have at least one.
 360  */
 361 typedef struct xl_btree_vacuum
 362 {
 363         RelFileNode node;
 364         BlockNumber block;
 365         BlockNumber lastBlockVacuumed;
 366
 367         /* TARGET OFFSET NUMBERS FOLLOW */
 368 } xl_btree_vacuum;
 369
 370 #define SizeOfBtreeVacuum       (offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber))
 371
 372 /*
 373  * This is what we need to know about deletion of a btree page.  The target
 374  * identifies the tuple removed from the parent page (note that we remove
 375  * this tuple's downlink and the *following* tuple's key).      Note we do not
 376  * store any content for the deleted page --- it is just rewritten as empty
 377  * during recovery, apart from resetting the btpo.xact.
 378  */
 379 typedef struct xl_btree_delete_page
 380 {
 381         xl_btreetid target;                     /* deleted tuple id in parent page */
 382         BlockNumber deadblk;            /* child block being deleted */
 383         BlockNumber leftblk;            /* child block's left sibling, if any */
 384         BlockNumber rightblk;           /* child block's right sibling */
 385         TransactionId btpo_xact;        /* value of btpo.xact for use in recovery */
 386         /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_DELETE_PAGE_META */
 387 } xl_btree_delete_page;
 388
 389 #define SizeOfBtreeDeletePage   (offsetof(xl_btree_delete_page, btpo_xact) + sizeof(TransactionId))
 390
 391 /*
 392  * New root log record.  There are zero tuples if this is to establish an
 393  * empty root, or two if it is the result of splitting an old root.
 394  *
 395  * Note that although this implies rewriting the metadata page, we don't need
 396  * an xl_btree_metadata record --- the rootblk and level are sufficient.
 397  */
 398 typedef struct xl_btree_newroot
 399 {
 400         RelFileNode node;
 401         BlockNumber rootblk;            /* location of new root */
 402         uint32          level;                  /* its tree level */
 403         /* 0 or 2 INDEX TUPLES FOLLOW AT END OF STRUCT */
 404 } xl_btree_newroot;
 405
 406 #define SizeOfBtreeNewroot      (offsetof(xl_btree_newroot, level) + sizeof(uint32))
 407
 408
 409 /*
 410  *      Operator strategy numbers for B-tree have been moved to access/skey.h,
 411  *      because many places need to use them in ScanKeyInit() calls.
 412  *
 413  *      The strategy numbers are chosen so that we can commute them by
 414  *      subtraction, thus:
 415  */
 416 #define BTCommuteStrategyNumber(strat)  (BTMaxStrategyNumber + 1 - (strat))
 417
 418 /*
 419  *      When a new operator class is declared, we require that the user
 420  *      supply us with an amproc procedure (BTORDER_PROC) for determining
 421  *      whether, for two keys a and b, a < b, a = b, or a > b.  This routine
 422  *      must return < 0, 0, > 0, respectively, in these three cases.  (It must
 423  *      not return INT_MIN, since we may negate the result before using it.)
 424  *
 425  *      To facilitate accelerated sorting, an operator class may choose to
 426  *      offer a second procedure (BTSORTSUPPORT_PROC).  For full details, see
 427  *      src/include/utils/sortsupport.h.
 428  */
 429
 430 #define BTORDER_PROC            1
 431 #define BTSORTSUPPORT_PROC      2
 432
 433 /*
 434  *      We need to be able to tell the difference between read and write
 435  *      requests for pages, in order to do locking correctly.
 436  */
 437
 438 #define BT_READ                 BUFFER_LOCK_SHARE
 439 #define BT_WRITE                BUFFER_LOCK_EXCLUSIVE
 440
 441 /*
 442  *      BTStackData -- As we descend a tree, we push the (location, downlink)
 443  *      pairs from internal pages onto a private stack.  If we split a
 444  *      leaf, we use this stack to walk back up the tree and insert data
 445  *      into parent pages (and possibly to split them, too).  Lehman and
 446  *      Yao's update algorithm guarantees that under no circumstances can
 447  *      our private stack give us an irredeemably bad picture up the tree.
 448  *      Again, see the paper for details.
 449  */
 450
 451 typedef struct BTStackData
 452 {
 453         BlockNumber bts_blkno;
 454         OffsetNumber bts_offset;
 455         IndexTupleData bts_btentry;
 456         struct BTStackData *bts_parent;
 457 } BTStackData;
 458
 459 typedef BTStackData *BTStack;
 460
 461 /*
 462  * BTScanOpaqueData is the btree-private state needed for an indexscan.
 463  * This consists of preprocessed scan keys (see _bt_preprocess_keys() for
 464  * details of the preprocessing), information about the current location
 465  * of the scan, and information about the marked location, if any.      (We use
 466  * BTScanPosData to represent the data needed for each of current and marked
 467  * locations.)  In addition we can remember some known-killed index entries
 468  * that must be marked before we can move off the current page.
 469  *
 470  * Index scans work a page at a time: we pin and read-lock the page, identify
 471  * all the matching items on the page and save them in BTScanPosData, then
 472  * release the read-lock while returning the items to the caller for
 473  * processing.  This approach minimizes lock/unlock traffic.  Note that we
 474  * keep the pin on the index page until the caller is done with all the items
 475  * (this is needed for VACUUM synchronization, see nbtree/README).      When we
 476  * are ready to step to the next page, if the caller has told us any of the
 477  * items were killed, we re-lock the page to mark them killed, then unlock.
 478  * Finally we drop the pin and step to the next page in the appropriate
 479  * direction.
 480  *
 481  * If we are doing an index-only scan, we save the entire IndexTuple for each
 482  * matched item, otherwise only its heap TID and offset.  The IndexTuples go
 483  * into a separate workspace array; each BTScanPosItem stores its tuple's
 484  * offset within that array.
 485  */
 486
 487 typedef struct BTScanPosItem    /* what we remember about each match */
 488 {
 489         ItemPointerData heapTid;        /* TID of referenced heap item */
 490         OffsetNumber indexOffset;       /* index item's location within page */
 491         LocationIndex tupleOffset;      /* IndexTuple's offset in workspace, if any */
 492 } BTScanPosItem;
 493
 494 typedef struct BTScanPosData
 495 {
 496         Buffer          buf;                    /* if valid, the buffer is pinned */
 497
 498         BlockNumber nextPage;           /* page's right link when we scanned it */
 499
 500         /*
 501          * moreLeft and moreRight track whether we think there may be matching
 502          * index entries to the left and right of the current page, respectively.
 503          * We can clear the appropriate one of these flags when _bt_checkkeys()
 504          * returns continuescan = false.
 505          */
 506         bool            moreLeft;
 507         bool            moreRight;
 508
 509         /*
 510          * If we are doing an index-only scan, nextTupleOffset is the first free
 511          * location in the associated tuple storage workspace.
 512          */
 513         int                     nextTupleOffset;
 514
 515         /*
 516          * The items array is always ordered in index order (ie, increasing
 517          * indexoffset).  When scanning backwards it is convenient to fill the
 518          * array back-to-front, so we start at the last slot and fill downwards.
 519          * Hence we need both a first-valid-entry and a last-valid-entry counter.
 520          * itemIndex is a cursor showing which entry was last returned to caller.
 521          */
 522         int                     firstItem;              /* first valid index in items[] */
 523         int                     lastItem;               /* last valid index in items[] */
 524         int                     itemIndex;              /* current index in items[] */
 525
 526         BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */
 527 } BTScanPosData;
 528
 529 typedef BTScanPosData *BTScanPos;
 530
 531 #define BTScanPosIsValid(scanpos) BufferIsValid((scanpos).buf)
 532
 533 /* We need one of these for each equality-type SK_SEARCHARRAY scan key */
 534 typedef struct BTArrayKeyInfo
 535 {
 536         int                     scan_key;               /* index of associated key in arrayKeyData */
 537         int                     cur_elem;               /* index of current element in elem_values */
 538         int                     mark_elem;              /* index of marked element in elem_values */
 539         int                     num_elems;              /* number of elems in current array value */
 540         Datum      *elem_values;        /* array of num_elems Datums */
 541 } BTArrayKeyInfo;
 542
 543 typedef struct BTScanOpaqueData
 544 {
 545         /* these fields are set by _bt_preprocess_keys(): */
 546         bool            qual_ok;                /* false if qual can never be satisfied */
 547         int                     numberOfKeys;   /* number of preprocessed scan keys */
 548         ScanKey         keyData;                /* array of preprocessed scan keys */
 549
 550         /* workspace for SK_SEARCHARRAY support */
 551         ScanKey         arrayKeyData;   /* modified copy of scan->keyData */
 552         int                     numArrayKeys;   /* number of equality-type array keys (-1 if
 553                                                                  * there are any unsatisfiable array keys) */
 554         BTArrayKeyInfo *arrayKeys;      /* info about each equality-type array key */
 555         MemoryContext arrayContext; /* scan-lifespan context for array data */
 556
 557         /* info about killed items if any (killedItems is NULL if never used) */
 558         int                *killedItems;        /* currPos.items indexes of killed items */
 559         int                     numKilled;              /* number of currently stored items */
 560
 561         /*
 562          * If we are doing an index-only scan, these are the tuple storage
 563          * workspaces for the currPos and markPos respectively.  Each is of size
 564          * BLCKSZ, so it can hold as much as a full page's worth of tuples.
 565          */
 566         char       *currTuples;         /* tuple storage for currPos */
 567         char       *markTuples;         /* tuple storage for markPos */
 568
 569         /*
 570          * If the marked position is on the same page as current position, we
 571          * don't use markPos, but just keep the marked itemIndex in markItemIndex
 572          * (all the rest of currPos is valid for the mark position). Hence, to
 573          * determine if there is a mark, first look at markItemIndex, then at
 574          * markPos.
 575          */
 576         int                     markItemIndex;  /* itemIndex, or -1 if not valid */
 577
 578         /* keep these last in struct for efficiency */
 579         BTScanPosData currPos;          /* current position data */
 580         BTScanPosData markPos;          /* marked position, if any */
 581 } BTScanOpaqueData;
 582
 583 typedef BTScanOpaqueData *BTScanOpaque;
 584
 585 /*
 586  * We use some private sk_flags bits in preprocessed scan keys.  We're allowed
 587  * to use bits 16-31 (see skey.h).      The uppermost bits are copied from the
 588  * index's indoption[] array entry for the index attribute.
 589  */
 590 #define SK_BT_REQFWD    0x00010000              /* required to continue forward scan */
 591 #define SK_BT_REQBKWD   0x00020000              /* required to continue backward scan */
 592 #define SK_BT_INDOPTION_SHIFT  24               /* must clear the above bits */
 593 #define SK_BT_DESC                      (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
 594 #define SK_BT_NULLS_FIRST       (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)
 595
 596 /*
 597  * prototypes for functions in nbtree.c (external entry points for btree)
 598  */
 599 extern Datum btbuild(PG_FUNCTION_ARGS);
 600 extern Datum btbuildempty(PG_FUNCTION_ARGS);
 601 extern Datum btinsert(PG_FUNCTION_ARGS);
 602 extern Datum btbeginscan(PG_FUNCTION_ARGS);
 603 extern Datum btgettuple(PG_FUNCTION_ARGS);
 604 extern Datum btgetbitmap(PG_FUNCTION_ARGS);
 605 extern Datum btrescan(PG_FUNCTION_ARGS);
 606 extern Datum btendscan(PG_FUNCTION_ARGS);
 607 extern Datum btmarkpos(PG_FUNCTION_ARGS);
 608 extern Datum btrestrpos(PG_FUNCTION_ARGS);
 609 extern Datum btbulkdelete(PG_FUNCTION_ARGS);
 610 extern Datum btvacuumcleanup(PG_FUNCTION_ARGS);
 611 extern Datum btcanreturn(PG_FUNCTION_ARGS);
 612 extern Datum btoptions(PG_FUNCTION_ARGS);
 613
 614 /*
 615  * prototypes for functions in nbtinsert.c
 616  */
 617 extern bool _bt_doinsert(Relation rel, IndexTuple itup,
 618                          IndexUniqueCheck checkUnique, Relation heapRel);
 619 extern Buffer _bt_getstackbuf(Relation rel, BTStack stack, int access);
 620 extern void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
 621                                   BTStack stack, bool is_root, bool is_only);
 622
 623 /*
 624  * prototypes for functions in nbtpage.c
 625  */
 626 extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level);
 627 extern Buffer _bt_getroot(Relation rel, int access);
 628 extern Buffer _bt_gettrueroot(Relation rel);
 629 extern int      _bt_getrootheight(Relation rel);
 630 extern void _bt_checkpage(Relation rel, Buffer buf);
 631 extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
 632 extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
 633                                  BlockNumber blkno, int access);
 634 extern void _bt_relbuf(Relation rel, Buffer buf);
 635 extern void _bt_pageinit(Page page, Size size);
 636 extern bool _bt_page_recyclable(Page page);
 637 extern void _bt_delitems_delete(Relation rel, Buffer buf,
 638                                         OffsetNumber *itemnos, int nitems, Relation heapRel);
 639 extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
 640                                         OffsetNumber *itemnos, int nitems,
 641                                         BlockNumber lastBlockVacuumed);
 642 extern int      _bt_pagedel(Relation rel, Buffer buf, BTStack stack);
 643
 644 /*
 645  * prototypes for functions in nbtsearch.c
 646  */
 647 extern BTStack _bt_search(Relation rel,
 648                    int keysz, ScanKey scankey, bool nextkey,
 649                    Buffer *bufP, int access);
 650 extern Buffer _bt_moveright(Relation rel, Buffer buf, int keysz,
 651                           ScanKey scankey, bool nextkey, int access);
 652 extern OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz,
 653                         ScanKey scankey, bool nextkey);
 654 extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
 655                         Page page, OffsetNumber offnum);
 656 extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
 657 extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
 658 extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);
 659
 660 /*
 661  * prototypes for functions in nbtutils.c
 662  */
 663 extern ScanKey _bt_mkscankey(Relation rel, IndexTuple itup);
 664 extern ScanKey _bt_mkscankey_nodata(Relation rel);
 665 extern void _bt_freeskey(ScanKey skey);
 666 extern void _bt_freestack(BTStack stack);
 667 extern void _bt_preprocess_array_keys(IndexScanDesc scan);
 668 extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir);
 669 extern bool _bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir);
 670 extern void _bt_mark_array_keys(IndexScanDesc scan);
 671 extern void _bt_restore_array_keys(IndexScanDesc scan);
 672 extern void _bt_preprocess_keys(IndexScanDesc scan);
 673 extern IndexTuple _bt_checkkeys(IndexScanDesc scan,
 674                           Page page, OffsetNumber offnum,
 675                           ScanDirection dir, bool *continuescan);
 676 extern void _bt_killitems(IndexScanDesc scan, bool haveLock);
 677 extern BTCycleId _bt_vacuum_cycleid(Relation rel);
 678 extern BTCycleId _bt_start_vacuum(Relation rel);
 679 extern void _bt_end_vacuum(Relation rel);
 680 extern void _bt_end_vacuum_callback(int code, Datum arg);
 681 extern Size BTreeShmemSize(void);
 682 extern void BTreeShmemInit(void);
 683
 684 /*
 685  * prototypes for functions in nbtsort.c
 686  */
 687 typedef struct BTSpool BTSpool; /* opaque type known only within nbtsort.c */
 688
 689 extern BTSpool *_bt_spoolinit(Relation heap, Relation index,
 690                           bool isunique, bool isdead);
 691 extern void _bt_spooldestroy(BTSpool *btspool);
 692 extern void _bt_spool(IndexTuple itup, BTSpool *btspool);
 693 extern void _bt_leafbuild(BTSpool *btspool, BTSpool *spool2);
 694
 695 /*
 696  * prototypes for functions in nbtxlog.c
 697  */
 698 extern void btree_redo(XLogRecPtr lsn, XLogRecord *record);
 699 extern void btree_desc(StringInfo buf, uint8 xl_info, char *rec);
 700 extern void btree_xlog_startup(void);
 701 extern void btree_xlog_cleanup(void);
 702 extern bool btree_safe_restartpoint(void);
 703
 704 #endif   /* NBTREE_H */