From 8da9a226369e9ceec7cef1ab7a16cdc0adb4d657 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Tue, 14 Feb 2017 15:37:59 -0500 Subject: [PATCH] Split index xlog headers from other private index headers. The xlog-specific headers need to be included in both frontend code - specifically, pg_waldump - and the backend, but the remainder of the private headers for each index are only needed by the backend. By splitting the xlog stuff out into separate headers, pg_waldump pulls in fewer backend headers, which is a good thing. Patch by me, reviewed by Michael Paquier and Andres Freund, per a complaint from Dilip Kumar. Discussion: http://postgr.es/m/CA+TgmoZ=F=GkxV0YEv-A8tb+AEGy_Qa7GSiJ8deBKFATnzfEug@mail.gmail.com --- src/backend/access/gin/ginbtree.c | 1 + src/backend/access/gin/gindatapage.c | 1 + src/backend/access/gin/ginentrypage.c | 1 + src/backend/access/gin/ginfast.c | 1 + src/backend/access/gin/gininsert.c | 1 + src/backend/access/gin/ginutil.c | 1 + src/backend/access/gin/ginvacuum.c | 1 + src/backend/access/gin/ginxlog.c | 1 + src/backend/access/gist/gistbuild.c | 1 + src/backend/access/gist/gistxlog.c | 1 + src/backend/access/nbtree/nbtinsert.c | 1 + src/backend/access/nbtree/nbtpage.c | 1 + src/backend/access/nbtree/nbtxlog.c | 1 + src/backend/access/rmgrdesc/gindesc.c | 2 +- src/backend/access/rmgrdesc/gistdesc.c | 2 +- src/backend/access/rmgrdesc/nbtdesc.c | 2 +- src/backend/access/rmgrdesc/spgdesc.c | 2 +- src/backend/access/spgist/spgdoinsert.c | 1 + src/backend/access/spgist/spginsert.c | 1 + src/backend/access/spgist/spgvacuum.c | 1 + src/backend/access/spgist/spgxlog.c | 1 + src/backend/access/transam/rmgr.c | 8 +- src/bin/pg_waldump/rmgrdesc.c | 8 +- src/include/access/gin.h | 8 - src/include/access/gin_private.h | 509 +----------------------- src/include/access/ginblock.h | 329 +++++++++++++++ src/include/access/ginxlog.h | 217 ++++++++++ src/include/access/gist_private.h | 55 +-- src/include/access/gistxlog.h | 69 ++++ 
src/include/access/hash_xlog.h | 2 +- src/include/access/nbtree.h | 234 ----------- src/include/access/nbtxlog.h | 255 ++++++++++++ src/include/access/spgist.h | 8 - src/include/access/spgist_private.h | 230 ----------- src/include/access/spgxlog.h | 257 ++++++++++++ 35 files changed, 1159 insertions(+), 1055 deletions(-) create mode 100644 src/include/access/ginblock.h create mode 100644 src/include/access/ginxlog.h create mode 100644 src/include/access/gistxlog.h create mode 100644 src/include/access/nbtxlog.h create mode 100644 src/include/access/spgxlog.h diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c index e5e83b9331..538ad5bb58 100644 --- a/src/backend/access/gin/ginbtree.c +++ b/src/backend/access/gin/ginbtree.c @@ -15,6 +15,7 @@ #include "postgres.h" #include "access/gin_private.h" +#include "access/ginxlog.h" #include "access/xloginsert.h" #include "miscadmin.h" #include "utils/memutils.h" diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c index 3ede711351..ad62d4e0e9 100644 --- a/src/backend/access/gin/gindatapage.c +++ b/src/backend/access/gin/gindatapage.c @@ -15,6 +15,7 @@ #include "postgres.h" #include "access/gin_private.h" +#include "access/ginxlog.h" #include "access/xloginsert.h" #include "lib/ilist.h" #include "miscadmin.h" diff --git a/src/backend/access/gin/ginentrypage.c b/src/backend/access/gin/ginentrypage.c index 446be55f69..8c9859ce8e 100644 --- a/src/backend/access/gin/ginentrypage.c +++ b/src/backend/access/gin/ginentrypage.c @@ -15,6 +15,7 @@ #include "postgres.h" #include "access/gin_private.h" +#include "access/ginxlog.h" #include "access/xloginsert.h" #include "miscadmin.h" #include "utils/rel.h" diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c index 85031e2921..0d5bb70cc9 100644 --- a/src/backend/access/gin/ginfast.c +++ b/src/backend/access/gin/ginfast.c @@ -19,6 +19,7 @@ #include "postgres.h" #include "access/gin_private.h" 
+#include "access/ginxlog.h" #include "access/xloginsert.h" #include "access/xlog.h" #include "commands/vacuum.h" diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 3d3b9e0840..d90faae65d 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -15,6 +15,7 @@ #include "postgres.h" #include "access/gin_private.h" +#include "access/ginxlog.h" #include "access/xloginsert.h" #include "catalog/index.h" #include "miscadmin.h" diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c index 02d920bb9d..a98d4fc397 100644 --- a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -15,6 +15,7 @@ #include "postgres.h" #include "access/gin_private.h" +#include "access/ginxlog.h" #include "access/reloptions.h" #include "access/xloginsert.h" #include "catalog/pg_collation.h" diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c index ddd168bcc6..c9ccfeece8 100644 --- a/src/backend/access/gin/ginvacuum.c +++ b/src/backend/access/gin/ginvacuum.c @@ -15,6 +15,7 @@ #include "postgres.h" #include "access/gin_private.h" +#include "access/ginxlog.h" #include "access/xloginsert.h" #include "commands/vacuum.h" #include "miscadmin.h" diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index 2995e7b06a..7ba04e324f 100644 --- a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -15,6 +15,7 @@ #include "access/bufmask.h" #include "access/gin_private.h" +#include "access/ginxlog.h" #include "access/xlogutils.h" #include "utils/memutils.h" diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index b65926f97a..f1f08bb3d8 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -18,6 +18,7 @@ #include "access/genam.h" #include "access/gist_private.h" +#include "access/gistxlog.h" #include "access/xloginsert.h" #include 
"catalog/index.h" #include "miscadmin.h" diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index cbda9e705c..4f4fe8fab5 100644 --- a/src/backend/access/gist/gistxlog.c +++ b/src/backend/access/gist/gistxlog.c @@ -15,6 +15,7 @@ #include "access/bufmask.h" #include "access/gist_private.h" +#include "access/gistxlog.h" #include "access/xloginsert.h" #include "access/xlogutils.h" #include "utils/memutils.h" diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 883d70da11..6dca8109fd 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -17,6 +17,7 @@ #include "access/heapam.h" #include "access/nbtree.h" +#include "access/nbtxlog.h" #include "access/transam.h" #include "access/xloginsert.h" #include "miscadmin.h" diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index da74f79b40..f815fd40b2 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -23,6 +23,7 @@ #include "postgres.h" #include "access/nbtree.h" +#include "access/nbtxlog.h" #include "access/transam.h" #include "access/xlog.h" #include "access/xloginsert.h" diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index a9ca279d81..ac60db0d49 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -17,6 +17,7 @@ #include "access/bufmask.h" #include "access/heapam_xlog.h" #include "access/nbtree.h" +#include "access/nbtxlog.h" #include "access/transam.h" #include "access/xlog.h" #include "access/xlogutils.h" diff --git a/src/backend/access/rmgrdesc/gindesc.c b/src/backend/access/rmgrdesc/gindesc.c index d4ed7f9c0a..b22fdd48f3 100644 --- a/src/backend/access/rmgrdesc/gindesc.c +++ b/src/backend/access/rmgrdesc/gindesc.c @@ -14,7 +14,7 @@ */ #include "postgres.h" -#include "access/gin_private.h" +#include "access/ginxlog.h" #include 
"access/xlogutils.h" #include "lib/stringinfo.h" #include "storage/relfilenode.h" diff --git a/src/backend/access/rmgrdesc/gistdesc.c b/src/backend/access/rmgrdesc/gistdesc.c index bf34578bf7..dc0506913c 100644 --- a/src/backend/access/rmgrdesc/gistdesc.c +++ b/src/backend/access/rmgrdesc/gistdesc.c @@ -14,7 +14,7 @@ */ #include "postgres.h" -#include "access/gist_private.h" +#include "access/gistxlog.h" #include "lib/stringinfo.h" #include "storage/relfilenode.h" diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c index 96ec936a88..fbde9d6555 100644 --- a/src/backend/access/rmgrdesc/nbtdesc.c +++ b/src/backend/access/rmgrdesc/nbtdesc.c @@ -14,7 +14,7 @@ */ #include "postgres.h" -#include "access/nbtree.h" +#include "access/nbtxlog.h" void btree_desc(StringInfo buf, XLogReaderState *record) diff --git a/src/backend/access/rmgrdesc/spgdesc.c b/src/backend/access/rmgrdesc/spgdesc.c index 1a229d94f3..24d6cb58fd 100644 --- a/src/backend/access/rmgrdesc/spgdesc.c +++ b/src/backend/access/rmgrdesc/spgdesc.c @@ -14,7 +14,7 @@ */ #include "postgres.h" -#include "access/spgist_private.h" +#include "access/spgxlog.h" void spg_desc(StringInfo buf, XLogReaderState *record) diff --git a/src/backend/access/spgist/spgdoinsert.c b/src/backend/access/spgist/spgdoinsert.c index 748e568a62..90c6534139 100644 --- a/src/backend/access/spgist/spgdoinsert.c +++ b/src/backend/access/spgist/spgdoinsert.c @@ -17,6 +17,7 @@ #include "access/genam.h" #include "access/spgist_private.h" +#include "access/spgxlog.h" #include "access/xloginsert.h" #include "miscadmin.h" #include "storage/bufmgr.h" diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 14f8a9ee8e..00a0ab4438 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -18,6 +18,7 @@ #include "access/genam.h" #include "access/spgist_private.h" +#include "access/spgxlog.h" #include "access/xlog.h" #include 
"access/xloginsert.h" #include "catalog/index.h" diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index cc6b000a62..cce9b3f618 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -17,6 +17,7 @@ #include "access/genam.h" #include "access/spgist_private.h" +#include "access/spgxlog.h" #include "access/transam.h" #include "access/xloginsert.h" #include "catalog/storage_xlog.h" diff --git a/src/backend/access/spgist/spgxlog.c b/src/backend/access/spgist/spgxlog.c index 596b266ba6..c007601efd 100644 --- a/src/backend/access/spgist/spgxlog.c +++ b/src/backend/access/spgist/spgxlog.c @@ -16,6 +16,7 @@ #include "access/bufmask.h" #include "access/spgist_private.h" +#include "access/spgxlog.h" #include "access/transam.h" #include "access/xlog.h" #include "access/xlogutils.h" diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c index eae75242fe..9368b56c4c 100644 --- a/src/backend/access/transam/rmgr.c +++ b/src/backend/access/transam/rmgr.c @@ -9,15 +9,15 @@ #include "access/clog.h" #include "access/commit_ts.h" -#include "access/gin.h" -#include "access/gist_private.h" +#include "access/ginxlog.h" +#include "access/gistxlog.h" #include "access/generic_xlog.h" #include "access/hash_xlog.h" #include "access/heapam_xlog.h" #include "access/brin_xlog.h" #include "access/multixact.h" -#include "access/nbtree.h" -#include "access/spgist.h" +#include "access/nbtxlog.h" +#include "access/spgxlog.h" #include "access/xact.h" #include "access/xlog_internal.h" #include "catalog/storage_xlog.h" diff --git a/src/bin/pg_waldump/rmgrdesc.c b/src/bin/pg_waldump/rmgrdesc.c index dd40ba07c7..852d8ca4b1 100644 --- a/src/bin/pg_waldump/rmgrdesc.c +++ b/src/bin/pg_waldump/rmgrdesc.c @@ -12,14 +12,14 @@ #include "access/clog.h" #include "access/commit_ts.h" #include "access/generic_xlog.h" -#include "access/gin.h" -#include "access/gist_private.h" +#include "access/ginxlog.h" 
+#include "access/gistxlog.h" #include "access/hash_xlog.h" #include "access/heapam_xlog.h" #include "access/multixact.h" -#include "access/nbtree.h" +#include "access/nbtxlog.h" #include "access/rmgr.h" -#include "access/spgist.h" +#include "access/spgxlog.h" #include "access/xact.h" #include "access/xlog_internal.h" #include "catalog/storage_xlog.h" diff --git a/src/include/access/gin.h b/src/include/access/gin.h index e5d67305d9..bd9e8833de 100644 --- a/src/include/access/gin.h +++ b/src/include/access/gin.h @@ -73,12 +73,4 @@ extern int gin_pending_list_limit; extern void ginGetStats(Relation index, GinStatsData *stats); extern void ginUpdateStats(Relation index, const GinStatsData *stats); -/* ginxlog.c */ -extern void gin_redo(XLogReaderState *record); -extern void gin_desc(StringInfo buf, XLogReaderState *record); -extern const char *gin_identify(uint8 info); -extern void gin_xlog_startup(void); -extern void gin_xlog_cleanup(void); -extern void gin_mask(char *pagedata, BlockNumber blkno); - #endif /* GIN_H */ diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h index 7e1557ea85..34e7339f05 100644 --- a/src/include/access/gin_private.h +++ b/src/include/access/gin_private.h @@ -12,309 +12,12 @@ #include "access/amapi.h" #include "access/gin.h" +#include "access/ginblock.h" #include "access/itup.h" #include "fmgr.h" #include "storage/bufmgr.h" #include "lib/rbtree.h" - -/* - * Page opaque data in an inverted index page. - * - * Note: GIN does not include a page ID word as do the other index types. - * This is OK because the opaque data is only 8 bytes and so can be reliably - * distinguished by size. Revisit this if the size ever increases. - * Further note: as of 9.2, SP-GiST also uses 8-byte special space, as does - * BRIN as of 9.5. This is still OK, as long as GIN isn't using all of the - * high-order bits in its flags word, because that way the flags word cannot - * match the page IDs used by SP-GiST and BRIN. 
- */ -typedef struct GinPageOpaqueData -{ - BlockNumber rightlink; /* next page if any */ - OffsetNumber maxoff; /* number of PostingItems on GIN_DATA & - * ~GIN_LEAF page. On GIN_LIST page, number of - * heap tuples. */ - uint16 flags; /* see bit definitions below */ -} GinPageOpaqueData; - -typedef GinPageOpaqueData *GinPageOpaque; - -#define GIN_DATA (1 << 0) -#define GIN_LEAF (1 << 1) -#define GIN_DELETED (1 << 2) -#define GIN_META (1 << 3) -#define GIN_LIST (1 << 4) -#define GIN_LIST_FULLROW (1 << 5) /* makes sense only on GIN_LIST page */ -#define GIN_INCOMPLETE_SPLIT (1 << 6) /* page was split, but parent not - * updated */ -#define GIN_COMPRESSED (1 << 7) - -/* Page numbers of fixed-location pages */ -#define GIN_METAPAGE_BLKNO (0) -#define GIN_ROOT_BLKNO (1) - -typedef struct GinMetaPageData -{ - /* - * Pointers to head and tail of pending list, which consists of GIN_LIST - * pages. These store fast-inserted entries that haven't yet been moved - * into the regular GIN structure. - */ - BlockNumber head; - BlockNumber tail; - - /* - * Free space in bytes in the pending list's tail page. - */ - uint32 tailFreeSize; - - /* - * We store both number of pages and number of heap tuples that are in the - * pending list. - */ - BlockNumber nPendingPages; - int64 nPendingHeapTuples; - - /* - * Statistics for planner use (accurate as of last VACUUM) - */ - BlockNumber nTotalPages; - BlockNumber nEntryPages; - BlockNumber nDataPages; - int64 nEntries; - - /* - * GIN version number (ideally this should have been at the front, but too - * late now. Don't move it!) - * - * Currently 2 (for indexes initialized in 9.4 or later) - * - * Version 1 (indexes initialized in version 9.1, 9.2 or 9.3), is - * compatible, but may contain uncompressed posting tree (leaf) pages and - * posting lists. They will be converted to compressed format when - * modified. 
- * - * Version 0 (indexes initialized in 9.0 or before) is compatible but may - * be missing null entries, including both null keys and placeholders. - * Reject full-index-scan attempts on such indexes. - */ - int32 ginVersion; -} GinMetaPageData; - -#define GIN_CURRENT_VERSION 2 - -#define GinPageGetMeta(p) \ - ((GinMetaPageData *) PageGetContents(p)) - -/* - * Macros for accessing a GIN index page's opaque data - */ -#define GinPageGetOpaque(page) ( (GinPageOpaque) PageGetSpecialPointer(page) ) - -#define GinPageIsLeaf(page) ( (GinPageGetOpaque(page)->flags & GIN_LEAF) != 0 ) -#define GinPageSetLeaf(page) ( GinPageGetOpaque(page)->flags |= GIN_LEAF ) -#define GinPageSetNonLeaf(page) ( GinPageGetOpaque(page)->flags &= ~GIN_LEAF ) -#define GinPageIsData(page) ( (GinPageGetOpaque(page)->flags & GIN_DATA) != 0 ) -#define GinPageSetData(page) ( GinPageGetOpaque(page)->flags |= GIN_DATA ) -#define GinPageIsList(page) ( (GinPageGetOpaque(page)->flags & GIN_LIST) != 0 ) -#define GinPageSetList(page) ( GinPageGetOpaque(page)->flags |= GIN_LIST ) -#define GinPageHasFullRow(page) ( (GinPageGetOpaque(page)->flags & GIN_LIST_FULLROW) != 0 ) -#define GinPageSetFullRow(page) ( GinPageGetOpaque(page)->flags |= GIN_LIST_FULLROW ) -#define GinPageIsCompressed(page) ( (GinPageGetOpaque(page)->flags & GIN_COMPRESSED) != 0 ) -#define GinPageSetCompressed(page) ( GinPageGetOpaque(page)->flags |= GIN_COMPRESSED ) - -#define GinPageIsDeleted(page) ( (GinPageGetOpaque(page)->flags & GIN_DELETED) != 0 ) -#define GinPageSetDeleted(page) ( GinPageGetOpaque(page)->flags |= GIN_DELETED) -#define GinPageSetNonDeleted(page) ( GinPageGetOpaque(page)->flags &= ~GIN_DELETED) -#define GinPageIsIncompleteSplit(page) ( (GinPageGetOpaque(page)->flags & GIN_INCOMPLETE_SPLIT) != 0 ) - -#define GinPageRightMost(page) ( GinPageGetOpaque(page)->rightlink == InvalidBlockNumber) - -/* - * We use our own ItemPointerGet(BlockNumber|OffsetNumber) - * to avoid Asserts, since sometimes the ip_posid isn't "valid" 
- */ -#define GinItemPointerGetBlockNumber(pointer) \ - BlockIdGetBlockNumber(&(pointer)->ip_blkid) - -#define GinItemPointerGetOffsetNumber(pointer) \ - ((pointer)->ip_posid) - -/* - * Special-case item pointer values needed by the GIN search logic. - * MIN: sorts less than any valid item pointer - * MAX: sorts greater than any valid item pointer - * LOSSY PAGE: indicates a whole heap page, sorts after normal item - * pointers for that page - * Note that these are all distinguishable from an "invalid" item pointer - * (which is InvalidBlockNumber/0) as well as from all normal item - * pointers (which have item numbers in the range 1..MaxHeapTuplesPerPage). - */ -#define ItemPointerSetMin(p) \ - ItemPointerSet((p), (BlockNumber)0, (OffsetNumber)0) -#define ItemPointerIsMin(p) \ - (GinItemPointerGetOffsetNumber(p) == (OffsetNumber)0 && \ - GinItemPointerGetBlockNumber(p) == (BlockNumber)0) -#define ItemPointerSetMax(p) \ - ItemPointerSet((p), InvalidBlockNumber, (OffsetNumber)0xffff) -#define ItemPointerIsMax(p) \ - (GinItemPointerGetOffsetNumber(p) == (OffsetNumber)0xffff && \ - GinItemPointerGetBlockNumber(p) == InvalidBlockNumber) -#define ItemPointerSetLossyPage(p, b) \ - ItemPointerSet((p), (b), (OffsetNumber)0xffff) -#define ItemPointerIsLossyPage(p) \ - (GinItemPointerGetOffsetNumber(p) == (OffsetNumber)0xffff && \ - GinItemPointerGetBlockNumber(p) != InvalidBlockNumber) - -/* - * Posting item in a non-leaf posting-tree page - */ -typedef struct -{ - /* We use BlockIdData not BlockNumber to avoid padding space wastage */ - BlockIdData child_blkno; - ItemPointerData key; -} PostingItem; - -#define PostingItemGetBlockNumber(pointer) \ - BlockIdGetBlockNumber(&(pointer)->child_blkno) - -#define PostingItemSetBlockNumber(pointer, blockNumber) \ - BlockIdSet(&((pointer)->child_blkno), (blockNumber)) - -/* - * Category codes to distinguish placeholder nulls from ordinary NULL keys. 
- * Note that the datatype size and the first two code values are chosen to be - * compatible with the usual usage of bool isNull flags. - * - * GIN_CAT_EMPTY_QUERY is never stored in the index; and notice that it is - * chosen to sort before not after regular key values. - */ -typedef signed char GinNullCategory; - -#define GIN_CAT_NORM_KEY 0 /* normal, non-null key value */ -#define GIN_CAT_NULL_KEY 1 /* null key value */ -#define GIN_CAT_EMPTY_ITEM 2 /* placeholder for zero-key item */ -#define GIN_CAT_NULL_ITEM 3 /* placeholder for null item */ -#define GIN_CAT_EMPTY_QUERY (-1) /* placeholder for full-scan query */ - -/* - * Access macros for null category byte in entry tuples - */ -#define GinCategoryOffset(itup,ginstate) \ - (IndexInfoFindDataOffset((itup)->t_info) + \ - ((ginstate)->oneCol ? 0 : sizeof(int16))) -#define GinGetNullCategory(itup,ginstate) \ - (*((GinNullCategory *) ((char*)(itup) + GinCategoryOffset(itup,ginstate)))) -#define GinSetNullCategory(itup,ginstate,c) \ - (*((GinNullCategory *) ((char*)(itup) + GinCategoryOffset(itup,ginstate))) = (c)) - -/* - * Access macros for leaf-page entry tuples (see discussion in README) - */ -#define GinGetNPosting(itup) GinItemPointerGetOffsetNumber(&(itup)->t_tid) -#define GinSetNPosting(itup,n) ItemPointerSetOffsetNumber(&(itup)->t_tid,n) -#define GIN_TREE_POSTING ((OffsetNumber)0xffff) -#define GinIsPostingTree(itup) (GinGetNPosting(itup) == GIN_TREE_POSTING) -#define GinSetPostingTree(itup, blkno) ( GinSetNPosting((itup),GIN_TREE_POSTING), ItemPointerSetBlockNumber(&(itup)->t_tid, blkno) ) -#define GinGetPostingTree(itup) GinItemPointerGetBlockNumber(&(itup)->t_tid) - -#define GIN_ITUP_COMPRESSED (1U << 31) -#define GinGetPostingOffset(itup) (GinItemPointerGetBlockNumber(&(itup)->t_tid) & (~GIN_ITUP_COMPRESSED)) -#define GinSetPostingOffset(itup,n) ItemPointerSetBlockNumber(&(itup)->t_tid,(n)|GIN_ITUP_COMPRESSED) -#define GinGetPosting(itup) ((Pointer) ((char*)(itup) + GinGetPostingOffset(itup))) 
-#define GinItupIsCompressed(itup) ((GinItemPointerGetBlockNumber(&(itup)->t_tid) & GIN_ITUP_COMPRESSED) != 0) - -/* - * Maximum size of an item on entry tree page. Make sure that we fit at least - * three items on each page. (On regular B-tree indexes, we must fit at least - * three items: two data items and the "high key". In GIN entry tree, we don't - * currently store the high key explicitly, we just use the rightmost item on - * the page, so it would actually be enough to fit two items.) - */ -#define GinMaxItemSize \ - Min(INDEX_SIZE_MASK, \ - MAXALIGN_DOWN(((BLCKSZ - \ - MAXALIGN(SizeOfPageHeaderData + 3 * sizeof(ItemIdData)) - \ - MAXALIGN(sizeof(GinPageOpaqueData))) / 3))) - -/* - * Access macros for non-leaf entry tuples - */ -#define GinGetDownlink(itup) GinItemPointerGetBlockNumber(&(itup)->t_tid) -#define GinSetDownlink(itup,blkno) ItemPointerSet(&(itup)->t_tid, blkno, InvalidOffsetNumber) - - -/* - * Data (posting tree) pages - * - * Posting tree pages don't store regular tuples. Non-leaf pages contain - * PostingItems, which are pairs of ItemPointers and child block numbers. - * Leaf pages contain GinPostingLists and an uncompressed array of item - * pointers. - * - * In a leaf page, the compressed posting lists are stored after the regular - * page header, one after each other. Although we don't store regular tuples, - * pd_lower is used to indicate the end of the posting lists. After that, free - * space follows. This layout is compatible with the "standard" heap and - * index page layout described in bufpage.h, so that we can e.g set buffer_std - * when writing WAL records. - * - * In the special space is the GinPageOpaque struct. 
- */ -#define GinDataLeafPageGetPostingList(page) \ - (GinPostingList *) ((PageGetContents(page) + MAXALIGN(sizeof(ItemPointerData)))) -#define GinDataLeafPageGetPostingListSize(page) \ - (((PageHeader) page)->pd_lower - MAXALIGN(SizeOfPageHeaderData) - MAXALIGN(sizeof(ItemPointerData))) - -#define GinDataLeafPageIsEmpty(page) \ - (GinPageIsCompressed(page) ? (GinDataLeafPageGetPostingListSize(page) == 0) : (GinPageGetOpaque(page)->maxoff < FirstOffsetNumber)) - -#define GinDataLeafPageGetFreeSpace(page) PageGetExactFreeSpace(page) - -#define GinDataPageGetRightBound(page) ((ItemPointer) PageGetContents(page)) -/* - * Pointer to the data portion of a posting tree page. For internal pages, - * that's the beginning of the array of PostingItems. For compressed leaf - * pages, the first compressed posting list. For uncompressed (pre-9.4) leaf - * pages, it's the beginning of the ItemPointer array. - */ -#define GinDataPageGetData(page) \ - (PageGetContents(page) + MAXALIGN(sizeof(ItemPointerData))) -/* non-leaf pages contain PostingItems */ -#define GinDataPageGetPostingItem(page, i) \ - ((PostingItem *) (GinDataPageGetData(page) + ((i)-1) * sizeof(PostingItem))) - -/* - * Note: there is no GinDataPageGetDataSize macro, because before version - * 9.4, we didn't set pd_lower on data pages. There can be pages in the index - * that were binary-upgraded from earlier versions and still have an invalid - * pd_lower, so we cannot trust it in general. Compressed posting tree leaf - * pages are new in 9.4, however, so we can trust them; see - * GinDataLeafPageGetPostingListSize. 
- */ -#define GinDataPageSetDataSize(page, size) \ - { \ - Assert(size <= GinDataPageMaxDataSize); \ - ((PageHeader) page)->pd_lower = (size) + MAXALIGN(SizeOfPageHeaderData) + MAXALIGN(sizeof(ItemPointerData)); \ - } - -#define GinNonLeafDataPageGetFreeSpace(page) \ - (GinDataPageMaxDataSize - \ - GinPageGetOpaque(page)->maxoff * sizeof(PostingItem)) - -#define GinDataPageMaxDataSize \ - (BLCKSZ - MAXALIGN(SizeOfPageHeaderData) \ - - MAXALIGN(sizeof(ItemPointerData)) \ - - MAXALIGN(sizeof(GinPageOpaqueData))) - -/* - * List pages - */ -#define GinListPageSize \ - ( BLCKSZ - SizeOfPageHeaderData - MAXALIGN(sizeof(GinPageOpaqueData)) ) - /* * Storage type for GIN's reloptions */ @@ -380,216 +83,6 @@ typedef struct GinState } GinState; -/* - * A compressed posting list. - * - * Note: This requires 2-byte alignment. - */ -typedef struct -{ - ItemPointerData first; /* first item in this posting list (unpacked) */ - uint16 nbytes; /* number of bytes that follow */ - unsigned char bytes[FLEXIBLE_ARRAY_MEMBER]; /* varbyte encoded items */ -} GinPostingList; - -#define SizeOfGinPostingList(plist) (offsetof(GinPostingList, bytes) + SHORTALIGN((plist)->nbytes) ) -#define GinNextPostingListSegment(cur) ((GinPostingList *) (((char *) (cur)) + SizeOfGinPostingList((cur)))) - - -/* XLog stuff */ - -#define XLOG_GIN_CREATE_INDEX 0x00 - -#define XLOG_GIN_CREATE_PTREE 0x10 - -typedef struct ginxlogCreatePostingTree -{ - uint32 size; - /* A compressed posting list follows */ -} ginxlogCreatePostingTree; - -/* - * The format of the insertion record varies depending on the page type. - * ginxlogInsert is the common part between all variants. - * - * Backup Blk 0: target page - * Backup Blk 1: left child, if this insertion finishes an incomplete split - */ - -#define XLOG_GIN_INSERT 0x20 - -typedef struct -{ - uint16 flags; /* GIN_INSERT_ISLEAF and/or GIN_INSERT_ISDATA */ - - /* - * FOLLOWS: - * - * 1. 
if not leaf page, block numbers of the left and right child pages - * whose split this insertion finishes, as BlockIdData[2] (beware of - * adding fields in this struct that would make them not 16-bit aligned) - * - * 2. a ginxlogInsertEntry or ginxlogRecompressDataLeaf struct, depending - * on tree type. - * - * NB: the below structs are only 16-bit aligned when appended to a - * ginxlogInsert struct! Beware of adding fields to them that require - * stricter alignment. - */ -} ginxlogInsert; - -typedef struct -{ - OffsetNumber offset; - bool isDelete; - IndexTupleData tuple; /* variable length */ -} ginxlogInsertEntry; - - -typedef struct -{ - uint16 nactions; - - /* Variable number of 'actions' follow */ -} ginxlogRecompressDataLeaf; - -/* - * Note: this struct is currently not used in code, and only acts as - * documentation. The WAL record format is as specified here, but the code - * uses straight access through a Pointer and memcpy to read/write these. - */ -typedef struct -{ - uint8 segno; /* segment this action applies to */ - char type; /* action type (see below) */ - - /* - * Action-specific data follows. For INSERT and REPLACE actions that is a - * GinPostingList struct. For ADDITEMS, a uint16 for the number of items - * added, followed by the items themselves as ItemPointers. DELETE actions - * have no further data. 
- */ -} ginxlogSegmentAction; - -/* Action types */ -#define GIN_SEGMENT_UNMODIFIED 0 /* no action (not used in WAL records) */ -#define GIN_SEGMENT_DELETE 1 /* a whole segment is removed */ -#define GIN_SEGMENT_INSERT 2 /* a whole segment is added */ -#define GIN_SEGMENT_REPLACE 3 /* a segment is replaced */ -#define GIN_SEGMENT_ADDITEMS 4 /* items are added to existing segment */ - -typedef struct -{ - OffsetNumber offset; - PostingItem newitem; -} ginxlogInsertDataInternal; - -/* - * Backup Blk 0: new left page (= original page, if not root split) - * Backup Blk 1: new right page - * Backup Blk 2: original page / new root page, if root split - * Backup Blk 3: left child, if this insertion completes an earlier split - */ -#define XLOG_GIN_SPLIT 0x30 - -typedef struct ginxlogSplit -{ - RelFileNode node; - BlockNumber rrlink; /* right link, or root's blocknumber if root - * split */ - BlockNumber leftChildBlkno; /* valid on a non-leaf split */ - BlockNumber rightChildBlkno; - uint16 flags; /* see below */ -} ginxlogSplit; - -/* - * Flags used in ginxlogInsert and ginxlogSplit records - */ -#define GIN_INSERT_ISDATA 0x01 /* for both insert and split records */ -#define GIN_INSERT_ISLEAF 0x02 /* ditto */ -#define GIN_SPLIT_ROOT 0x04 /* only for split records */ - -/* - * Vacuum simply WAL-logs the whole page, when anything is modified. This - * is functionally identical to heap_newpage records, but is kept separate for - * debugging purposes. (When inspecting the WAL stream, it's easier to see - * what's going on when GIN vacuum records are marked as such, not as heap - * records.) This is currently only used for entry tree leaf pages. - */ -#define XLOG_GIN_VACUUM_PAGE 0x40 - -/* - * Vacuuming posting tree leaf page is WAL-logged like recompression caused - * by insertion. 
- */ -#define XLOG_GIN_VACUUM_DATA_LEAF_PAGE 0x90 - -typedef struct ginxlogVacuumDataLeafPage -{ - ginxlogRecompressDataLeaf data; -} ginxlogVacuumDataLeafPage; - -/* - * Backup Blk 0: deleted page - * Backup Blk 1: parent - * Backup Blk 2: left sibling - */ -#define XLOG_GIN_DELETE_PAGE 0x50 - -typedef struct ginxlogDeletePage -{ - OffsetNumber parentOffset; - BlockNumber rightLink; -} ginxlogDeletePage; - -#define XLOG_GIN_UPDATE_META_PAGE 0x60 - -/* - * Backup Blk 0: metapage - * Backup Blk 1: tail page - */ -typedef struct ginxlogUpdateMeta -{ - RelFileNode node; - GinMetaPageData metadata; - BlockNumber prevTail; - BlockNumber newRightlink; - int32 ntuples; /* if ntuples > 0 then metadata.tail was - * updated with that many tuples; else new sub - * list was inserted */ - /* array of inserted tuples follows */ -} ginxlogUpdateMeta; - -#define XLOG_GIN_INSERT_LISTPAGE 0x70 - -typedef struct ginxlogInsertListPage -{ - BlockNumber rightlink; - int32 ntuples; - /* array of inserted tuples follows */ -} ginxlogInsertListPage; - -/* - * Backup Blk 0: metapage - * Backup Blk 1 to (ndeleted + 1): deleted pages - */ - -#define XLOG_GIN_DELETE_LISTPAGE 0x80 - -/* - * The WAL record for deleting list pages must contain a block reference to - * all the deleted pages, so the number of pages that can be deleted in one - * record is limited by XLR_MAX_BLOCK_ID. (block_id 0 is used for the - * metapage.) 
- */ -#define GIN_NDELETE_AT_ONCE Min(16, XLR_MAX_BLOCK_ID - 1) -typedef struct ginxlogDeleteListPages -{ - GinMetaPageData metadata; - int32 ndeleted; -} ginxlogDeleteListPages; - - /* ginutil.c */ extern bytea *ginoptions(Datum reloptions, bool validate); extern void initGinState(GinState *state, Relation index); diff --git a/src/include/access/ginblock.h b/src/include/access/ginblock.h new file mode 100644 index 0000000000..a3fb0560dd --- /dev/null +++ b/src/include/access/ginblock.h @@ -0,0 +1,329 @@ +/*-------------------------------------------------------------------------- + * ginblock.h + * details of structures stored in GIN index blocks + * + * Copyright (c) 2006-2017, PostgreSQL Global Development Group + * + * src/include/access/ginblock.h + *-------------------------------------------------------------------------- + */ +#ifndef GINBLOCK_H +#define GINBLOCK_H + +#include "storage/block.h" +#include "storage/itemptr.h" +#include "storage/off.h" + +/* + * Page opaque data in an inverted index page. + * + * Note: GIN does not include a page ID word as do the other index types. + * This is OK because the opaque data is only 8 bytes and so can be reliably + * distinguished by size. Revisit this if the size ever increases. + * Further note: as of 9.2, SP-GiST also uses 8-byte special space, as does + * BRIN as of 9.5. This is still OK, as long as GIN isn't using all of the + * high-order bits in its flags word, because that way the flags word cannot + * match the page IDs used by SP-GiST and BRIN. + */ +typedef struct GinPageOpaqueData +{ + BlockNumber rightlink; /* next page if any */ + OffsetNumber maxoff; /* number of PostingItems on GIN_DATA & + * ~GIN_LEAF page. On GIN_LIST page, number of + * heap tuples. 
*/ + uint16 flags; /* see bit definitions below */ +} GinPageOpaqueData; + +typedef GinPageOpaqueData *GinPageOpaque; + +#define GIN_DATA (1 << 0) +#define GIN_LEAF (1 << 1) +#define GIN_DELETED (1 << 2) +#define GIN_META (1 << 3) +#define GIN_LIST (1 << 4) +#define GIN_LIST_FULLROW (1 << 5) /* makes sense only on GIN_LIST page */ +#define GIN_INCOMPLETE_SPLIT (1 << 6) /* page was split, but parent not + * updated */ +#define GIN_COMPRESSED (1 << 7) /* compressed posting tree leaf page (9.4+) */ + +/* Page numbers of fixed-location pages */ +#define GIN_METAPAGE_BLKNO (0) +#define GIN_ROOT_BLKNO (1) + +typedef struct GinMetaPageData +{ + /* + * Pointers to head and tail of pending list, which consists of GIN_LIST + * pages. These store fast-inserted entries that haven't yet been moved + * into the regular GIN structure. + */ + BlockNumber head; + BlockNumber tail; + + /* + * Free space in bytes in the pending list's tail page. + */ + uint32 tailFreeSize; + + /* + * We store both number of pages and number of heap tuples that are in the + * pending list. + */ + BlockNumber nPendingPages; + int64 nPendingHeapTuples; + + /* + * Statistics for planner use (accurate as of last VACUUM) + */ + BlockNumber nTotalPages; + BlockNumber nEntryPages; + BlockNumber nDataPages; + int64 nEntries; + + /* + * GIN version number (ideally this should have been at the front, but too + * late now. Don't move it!) + * + * Currently 2 (for indexes initialized in 9.4 or later) + * + * Version 1 (indexes initialized in version 9.1, 9.2 or 9.3) is + * compatible, but may contain uncompressed posting tree (leaf) pages and + * posting lists. They will be converted to compressed format when + * modified. + * + * Version 0 (indexes initialized in 9.0 or before) is compatible but may + * be missing null entries, including both null keys and placeholders. + * Reject full-index-scan attempts on such indexes.
+ */ + int32 ginVersion; +} GinMetaPageData; + +#define GIN_CURRENT_VERSION 2 + +#define GinPageGetMeta(p) \ + ((GinMetaPageData *) PageGetContents(p)) + +/* + * Macros for accessing a GIN index page's opaque data + */ +#define GinPageGetOpaque(page) ( (GinPageOpaque) PageGetSpecialPointer(page) ) + +#define GinPageIsLeaf(page) ( (GinPageGetOpaque(page)->flags & GIN_LEAF) != 0 ) +#define GinPageSetLeaf(page) ( GinPageGetOpaque(page)->flags |= GIN_LEAF ) +#define GinPageSetNonLeaf(page) ( GinPageGetOpaque(page)->flags &= ~GIN_LEAF ) +#define GinPageIsData(page) ( (GinPageGetOpaque(page)->flags & GIN_DATA) != 0 ) +#define GinPageSetData(page) ( GinPageGetOpaque(page)->flags |= GIN_DATA ) +#define GinPageIsList(page) ( (GinPageGetOpaque(page)->flags & GIN_LIST) != 0 ) +#define GinPageSetList(page) ( GinPageGetOpaque(page)->flags |= GIN_LIST ) +#define GinPageHasFullRow(page) ( (GinPageGetOpaque(page)->flags & GIN_LIST_FULLROW) != 0 ) +#define GinPageSetFullRow(page) ( GinPageGetOpaque(page)->flags |= GIN_LIST_FULLROW ) +#define GinPageIsCompressed(page) ( (GinPageGetOpaque(page)->flags & GIN_COMPRESSED) != 0 ) +#define GinPageSetCompressed(page) ( GinPageGetOpaque(page)->flags |= GIN_COMPRESSED ) + +#define GinPageIsDeleted(page) ( (GinPageGetOpaque(page)->flags & GIN_DELETED) != 0 ) +#define GinPageSetDeleted(page) ( GinPageGetOpaque(page)->flags |= GIN_DELETED) +#define GinPageSetNonDeleted(page) ( GinPageGetOpaque(page)->flags &= ~GIN_DELETED) +#define GinPageIsIncompleteSplit(page) ( (GinPageGetOpaque(page)->flags & GIN_INCOMPLETE_SPLIT) != 0 ) + +#define GinPageRightMost(page) ( GinPageGetOpaque(page)->rightlink == InvalidBlockNumber) + +/* + * We use our own ItemPointerGet(BlockNumber|OffsetNumber) + * to avoid Asserts, since sometimes the ip_posid isn't "valid" + */ +#define GinItemPointerGetBlockNumber(pointer) \ + BlockIdGetBlockNumber(&(pointer)->ip_blkid) + +#define GinItemPointerGetOffsetNumber(pointer) \ + ((pointer)->ip_posid) + +/* + * Special-case item 
pointer values needed by the GIN search logic. + * MIN: sorts less than any valid item pointer + * MAX: sorts greater than any valid item pointer + * LOSSY PAGE: indicates a whole heap page, sorts after normal item + * pointers for that page + * Note that these are all distinguishable from an "invalid" item pointer + * (which is InvalidBlockNumber/0) as well as from all normal item + * pointers (which have item numbers in the range 1..MaxHeapTuplesPerPage). + */ +#define ItemPointerSetMin(p) \ + ItemPointerSet((p), (BlockNumber)0, (OffsetNumber)0) +#define ItemPointerIsMin(p) \ + (GinItemPointerGetOffsetNumber(p) == (OffsetNumber)0 && \ + GinItemPointerGetBlockNumber(p) == (BlockNumber)0) +#define ItemPointerSetMax(p) \ + ItemPointerSet((p), InvalidBlockNumber, (OffsetNumber)0xffff) +#define ItemPointerIsMax(p) \ + (GinItemPointerGetOffsetNumber(p) == (OffsetNumber)0xffff && \ + GinItemPointerGetBlockNumber(p) == InvalidBlockNumber) +#define ItemPointerSetLossyPage(p, b) \ + ItemPointerSet((p), (b), (OffsetNumber)0xffff) +#define ItemPointerIsLossyPage(p) \ + (GinItemPointerGetOffsetNumber(p) == (OffsetNumber)0xffff && \ + GinItemPointerGetBlockNumber(p) != InvalidBlockNumber) + +/* + * Posting item in a non-leaf posting-tree page + */ +typedef struct +{ + /* We use BlockIdData not BlockNumber to avoid padding space wastage */ + BlockIdData child_blkno; + ItemPointerData key; +} PostingItem; + +#define PostingItemGetBlockNumber(pointer) \ + BlockIdGetBlockNumber(&(pointer)->child_blkno) + +#define PostingItemSetBlockNumber(pointer, blockNumber) \ + BlockIdSet(&((pointer)->child_blkno), (blockNumber)) + +/* + * Category codes to distinguish placeholder nulls from ordinary NULL keys. + * Note that the datatype size and the first two code values are chosen to be + * compatible with the usual usage of bool isNull flags. + * + * GIN_CAT_EMPTY_QUERY is never stored in the index; and notice that it is + * chosen to sort before not after regular key values. 
+ */ +typedef signed char GinNullCategory; + +#define GIN_CAT_NORM_KEY 0 /* normal, non-null key value */ +#define GIN_CAT_NULL_KEY 1 /* null key value */ +#define GIN_CAT_EMPTY_ITEM 2 /* placeholder for zero-key item */ +#define GIN_CAT_NULL_ITEM 3 /* placeholder for null item */ +#define GIN_CAT_EMPTY_QUERY (-1) /* placeholder for full-scan query */ + +/* + * Access macros for null category byte in entry tuples + */ +#define GinCategoryOffset(itup,ginstate) \ + (IndexInfoFindDataOffset((itup)->t_info) + \ + ((ginstate)->oneCol ? 0 : sizeof(int16))) +#define GinGetNullCategory(itup,ginstate) \ + (*((GinNullCategory *) ((char*)(itup) + GinCategoryOffset(itup,ginstate)))) +#define GinSetNullCategory(itup,ginstate,c) \ + (*((GinNullCategory *) ((char*)(itup) + GinCategoryOffset(itup,ginstate))) = (c)) + +/* + * Access macros for leaf-page entry tuples (see discussion in README) + */ +#define GinGetNPosting(itup) GinItemPointerGetOffsetNumber(&(itup)->t_tid) +#define GinSetNPosting(itup,n) ItemPointerSetOffsetNumber(&(itup)->t_tid,n) +#define GIN_TREE_POSTING ((OffsetNumber)0xffff) +#define GinIsPostingTree(itup) (GinGetNPosting(itup) == GIN_TREE_POSTING) +#define GinSetPostingTree(itup, blkno) ( GinSetNPosting((itup),GIN_TREE_POSTING), ItemPointerSetBlockNumber(&(itup)->t_tid, blkno) ) +#define GinGetPostingTree(itup) GinItemPointerGetBlockNumber(&(itup)->t_tid) + +#define GIN_ITUP_COMPRESSED (1U << 31) +#define GinGetPostingOffset(itup) (GinItemPointerGetBlockNumber(&(itup)->t_tid) & (~GIN_ITUP_COMPRESSED)) +#define GinSetPostingOffset(itup,n) ItemPointerSetBlockNumber(&(itup)->t_tid,(n)|GIN_ITUP_COMPRESSED) +#define GinGetPosting(itup) ((Pointer) ((char*)(itup) + GinGetPostingOffset(itup))) +#define GinItupIsCompressed(itup) ((GinItemPointerGetBlockNumber(&(itup)->t_tid) & GIN_ITUP_COMPRESSED) != 0) + +/* + * Maximum size of an item on entry tree page. Make sure that we fit at least + * three items on each page. 
(On regular B-tree indexes, we must fit at least + * three items: two data items and the "high key". In GIN entry tree, we don't + * currently store the high key explicitly, we just use the rightmost item on + * the page, so it would actually be enough to fit two items.) + */ +#define GinMaxItemSize \ + Min(INDEX_SIZE_MASK, \ + MAXALIGN_DOWN(((BLCKSZ - \ + MAXALIGN(SizeOfPageHeaderData + 3 * sizeof(ItemIdData)) - \ + MAXALIGN(sizeof(GinPageOpaqueData))) / 3))) + +/* + * Access macros for non-leaf entry tuples + */ +#define GinGetDownlink(itup) GinItemPointerGetBlockNumber(&(itup)->t_tid) +#define GinSetDownlink(itup,blkno) ItemPointerSet(&(itup)->t_tid, blkno, InvalidOffsetNumber) + + +/* + * Data (posting tree) pages + * + * Posting tree pages don't store regular tuples. Non-leaf pages contain + * PostingItems, which are pairs of ItemPointers and child block numbers. + * Leaf pages contain GinPostingLists and an uncompressed array of item + * pointers. + * + * In a leaf page, the compressed posting lists are stored after the regular + * page header, one after another. Although we don't store regular tuples, + * pd_lower is used to indicate the end of the posting lists. After that, free + * space follows. This layout is compatible with the "standard" heap and + * index page layout described in bufpage.h, so that we can e.g. set buffer_std + * when writing WAL records. + * + * In the special space is the GinPageOpaque struct. + */ +#define GinDataLeafPageGetPostingList(page) \ + (GinPostingList *) ((PageGetContents(page) + MAXALIGN(sizeof(ItemPointerData)))) +#define GinDataLeafPageGetPostingListSize(page) \ + (((PageHeader) page)->pd_lower - MAXALIGN(SizeOfPageHeaderData) - MAXALIGN(sizeof(ItemPointerData))) + +#define GinDataLeafPageIsEmpty(page) \ + (GinPageIsCompressed(page) ?
(GinDataLeafPageGetPostingListSize(page) == 0) : (GinPageGetOpaque(page)->maxoff < FirstOffsetNumber)) + +#define GinDataLeafPageGetFreeSpace(page) PageGetExactFreeSpace(page) + +#define GinDataPageGetRightBound(page) ((ItemPointer) PageGetContents(page)) +/* + * Pointer to the data portion of a posting tree page. For internal pages, + * that's the beginning of the array of PostingItems. For compressed leaf + * pages, the first compressed posting list. For uncompressed (pre-9.4) leaf + * pages, it's the beginning of the ItemPointer array. + */ +#define GinDataPageGetData(page) \ + (PageGetContents(page) + MAXALIGN(sizeof(ItemPointerData))) +/* non-leaf pages contain PostingItems */ +#define GinDataPageGetPostingItem(page, i) \ + ((PostingItem *) (GinDataPageGetData(page) + ((i)-1) * sizeof(PostingItem))) + +/* + * Note: there is no GinDataPageGetDataSize macro, because before version + * 9.4, we didn't set pd_lower on data pages. There can be pages in the index + * that were binary-upgraded from earlier versions and still have an invalid + * pd_lower, so we cannot trust it in general. Compressed posting tree leaf + * pages are new in 9.4, however, so we can trust them; see + * GinDataLeafPageGetPostingListSize. NB: the Set macro is a bare { } block. + */ +#define GinDataPageSetDataSize(page, size) \ + { \ + Assert(size <= GinDataPageMaxDataSize); \ + ((PageHeader) page)->pd_lower = (size) + MAXALIGN(SizeOfPageHeaderData) + MAXALIGN(sizeof(ItemPointerData)); \ + } + +#define GinNonLeafDataPageGetFreeSpace(page) \ + (GinDataPageMaxDataSize - \ + GinPageGetOpaque(page)->maxoff * sizeof(PostingItem)) + +#define GinDataPageMaxDataSize \ + (BLCKSZ - MAXALIGN(SizeOfPageHeaderData) \ + - MAXALIGN(sizeof(ItemPointerData)) \ + - MAXALIGN(sizeof(GinPageOpaqueData))) + +/* + * List pages + */ +#define GinListPageSize \ + ( BLCKSZ - SizeOfPageHeaderData - MAXALIGN(sizeof(GinPageOpaqueData)) ) + +/* + * A compressed posting list. + * + * Note: This requires 2-byte alignment.
+ */ +typedef struct +{ + ItemPointerData first; /* first item in this posting list (unpacked) */ + uint16 nbytes; /* number of bytes that follow */ + unsigned char bytes[FLEXIBLE_ARRAY_MEMBER]; /* varbyte encoded items */ +} GinPostingList; + +#define SizeOfGinPostingList(plist) (offsetof(GinPostingList, bytes) + SHORTALIGN((plist)->nbytes) ) +#define GinNextPostingListSegment(cur) ((GinPostingList *) (((char *) (cur)) + SizeOfGinPostingList((cur)))) + +#endif /* GINBLOCK_H */ diff --git a/src/include/access/ginxlog.h b/src/include/access/ginxlog.h new file mode 100644 index 0000000000..8decc42cdb --- /dev/null +++ b/src/include/access/ginxlog.h @@ -0,0 +1,217 @@ +/*-------------------------------------------------------------------------- + * ginxlog.h + * header file for postgres inverted index xlog implementation. + * + * Copyright (c) 2006-2017, PostgreSQL Global Development Group + * + * src/include/access/ginxlog.h + *-------------------------------------------------------------------------- + */ +#ifndef GINXLOG_H +#define GINXLOG_H + +#include "access/ginblock.h" +#include "access/itup.h" +#include "access/xlogreader.h" +#include "lib/stringinfo.h" +#include "storage/off.h" + +#define XLOG_GIN_CREATE_INDEX 0x00 + +#define XLOG_GIN_CREATE_PTREE 0x10 + +typedef struct ginxlogCreatePostingTree +{ + uint32 size; + /* A compressed posting list follows */ +} ginxlogCreatePostingTree; + +/* + * The format of the insertion record varies depending on the page type. + * ginxlogInsert is the common part between all variants. + * + * Backup Blk 0: target page + * Backup Blk 1: left child, if this insertion finishes an incomplete split + */ + +#define XLOG_GIN_INSERT 0x20 + +typedef struct +{ + uint16 flags; /* GIN_INSERT_ISLEAF and/or GIN_INSERT_ISDATA */ + + /* + * FOLLOWS: + * + * 1. 
if not leaf page, block numbers of the left and right child pages + * whose split this insertion finishes, as BlockIdData[2] (beware of + * adding fields in this struct that would make them not 16-bit aligned) + * + * 2. a ginxlogInsertEntry or ginxlogRecompressDataLeaf struct, depending + * on tree type. + * + * NB: the below structs are only 16-bit aligned when appended to a + * ginxlogInsert struct! Beware of adding fields to them that require + * stricter alignment. + */ +} ginxlogInsert; + +typedef struct +{ + OffsetNumber offset; + bool isDelete; + IndexTupleData tuple; /* variable length */ +} ginxlogInsertEntry; + + +typedef struct +{ + uint16 nactions; + + /* Variable number of 'actions' follow */ +} ginxlogRecompressDataLeaf; + +/* + * Note: this struct is currently not used in code, and only acts as + * documentation. The WAL record format is as specified here, but the code + * uses straight access through a Pointer and memcpy to read/write these. + */ +typedef struct +{ + uint8 segno; /* segment this action applies to */ + char type; /* action type (see below) */ + + /* + * Action-specific data follows. For INSERT and REPLACE actions that is a + * GinPostingList struct. For ADDITEMS, a uint16 for the number of items + * added, followed by the items themselves as ItemPointers. DELETE actions + * have no further data. 
+ */ +} ginxlogSegmentAction; + +/* Action types */ +#define GIN_SEGMENT_UNMODIFIED 0 /* no action (not used in WAL records) */ +#define GIN_SEGMENT_DELETE 1 /* a whole segment is removed */ +#define GIN_SEGMENT_INSERT 2 /* a whole segment is added */ +#define GIN_SEGMENT_REPLACE 3 /* a segment is replaced */ +#define GIN_SEGMENT_ADDITEMS 4 /* items are added to existing segment */ + +typedef struct +{ + OffsetNumber offset; + PostingItem newitem; +} ginxlogInsertDataInternal; + +/* + * Backup Blk 0: new left page (= original page, if not root split) + * Backup Blk 1: new right page + * Backup Blk 2: original page / new root page, if root split + * Backup Blk 3: left child, if this insertion completes an earlier split + */ +#define XLOG_GIN_SPLIT 0x30 + +typedef struct ginxlogSplit +{ + RelFileNode node; + BlockNumber rrlink; /* right link, or root's blocknumber if root + * split */ + BlockNumber leftChildBlkno; /* valid on a non-leaf split */ + BlockNumber rightChildBlkno; + uint16 flags; /* see below */ +} ginxlogSplit; + +/* + * Flags used in ginxlogInsert and ginxlogSplit records + */ +#define GIN_INSERT_ISDATA 0x01 /* for both insert and split records */ +#define GIN_INSERT_ISLEAF 0x02 /* ditto */ +#define GIN_SPLIT_ROOT 0x04 /* only for split records */ + +/* + * Vacuum simply WAL-logs the whole page, when anything is modified. This + * is functionally identical to heap_newpage records, but is kept separate for + * debugging purposes. (When inspecting the WAL stream, it's easier to see + * what's going on when GIN vacuum records are marked as such, not as heap + * records.) This is currently only used for entry tree leaf pages. + */ +#define XLOG_GIN_VACUUM_PAGE 0x40 + +/* + * Vacuuming posting tree leaf page is WAL-logged like recompression caused + * by insertion. 
+ */ +#define XLOG_GIN_VACUUM_DATA_LEAF_PAGE 0x90 + +typedef struct ginxlogVacuumDataLeafPage +{ + ginxlogRecompressDataLeaf data; +} ginxlogVacuumDataLeafPage; + +/* + * Backup Blk 0: deleted page + * Backup Blk 1: parent + * Backup Blk 2: left sibling + */ +#define XLOG_GIN_DELETE_PAGE 0x50 + +typedef struct ginxlogDeletePage +{ + OffsetNumber parentOffset; + BlockNumber rightLink; +} ginxlogDeletePage; + +#define XLOG_GIN_UPDATE_META_PAGE 0x60 + +/* + * Backup Blk 0: metapage + * Backup Blk 1: tail page + */ +typedef struct ginxlogUpdateMeta +{ + RelFileNode node; + GinMetaPageData metadata; + BlockNumber prevTail; + BlockNumber newRightlink; + int32 ntuples; /* if ntuples > 0 then metadata.tail was + * updated with that many tuples; else new sub + * list was inserted */ + /* array of inserted tuples follows */ +} ginxlogUpdateMeta; + +#define XLOG_GIN_INSERT_LISTPAGE 0x70 + +typedef struct ginxlogInsertListPage +{ + BlockNumber rightlink; + int32 ntuples; + /* array of inserted tuples follows */ +} ginxlogInsertListPage; + +/* + * Backup Blk 0: metapage + * Backup Blk 1 to (ndeleted + 1): deleted pages + */ + +#define XLOG_GIN_DELETE_LISTPAGE 0x80 + +/* + * The WAL record for deleting list pages must contain a block reference to + * all the deleted pages, so the number of pages that can be deleted in one + * record is limited by XLR_MAX_BLOCK_ID. (block_id 0 is used for the + * metapage.) 
+ */ +#define GIN_NDELETE_AT_ONCE Min(16, XLR_MAX_BLOCK_ID - 1) +typedef struct ginxlogDeleteListPages +{ + GinMetaPageData metadata; + int32 ndeleted; +} ginxlogDeleteListPages; + +extern void gin_redo(XLogReaderState *record); +extern void gin_desc(StringInfo buf, XLogReaderState *record); +extern const char *gin_identify(uint8 info); +extern void gin_xlog_startup(void); +extern void gin_xlog_cleanup(void); +extern void gin_mask(char *pagedata, BlockNumber blkno); + +#endif /* GINXLOG_H */ diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h index 873c52a1a2..5b3303056b 100644 --- a/src/include/access/gist_private.h +++ b/src/include/access/gist_private.h @@ -17,7 +17,6 @@ #include "access/amapi.h" #include "access/gist.h" #include "access/itup.h" -#include "access/xlogreader.h" #include "fmgr.h" #include "lib/pairingheap.h" #include "storage/bufmgr.h" @@ -177,51 +176,7 @@ typedef struct GISTScanOpaqueData typedef GISTScanOpaqueData *GISTScanOpaque; - -/* XLog stuff */ - -#define XLOG_GIST_PAGE_UPDATE 0x00 - /* #define XLOG_GIST_NEW_ROOT 0x20 */ /* not used anymore */ -#define XLOG_GIST_PAGE_SPLIT 0x30 - /* #define XLOG_GIST_INSERT_COMPLETE 0x40 */ /* not used anymore */ -#define XLOG_GIST_CREATE_INDEX 0x50 - /* #define XLOG_GIST_PAGE_DELETE 0x60 */ /* not used anymore */ - -/* - * Backup Blk 0: updated page. - * Backup Blk 1: If this operation completes a page split, by inserting a - * downlink for the split page, the left half of the split - */ -typedef struct gistxlogPageUpdate -{ - /* number of deleted offsets */ - uint16 ntodelete; - uint16 ntoinsert; - - /* - * In payload of blk 0 : 1. todelete OffsetNumbers 2. 
tuples to insert - */ -} gistxlogPageUpdate; - -/* - * Backup Blk 0: If this operation completes a page split, by inserting a - * downlink for the split page, the left half of the split - * Backup Blk 1 - npage: split pages (1 is the original page) - */ -typedef struct gistxlogPageSplit -{ - BlockNumber origrlink; /* rightlink of the page before split */ - GistNSN orignsn; /* NSN of the page before split */ - bool origleaf; /* was splitted page a leaf page? */ - - uint16 npage; /* # of pages in the split */ - bool markfollowright; /* set F_FOLLOW_RIGHT flags */ - - /* - * follow: 1. gistxlogPage and array of IndexTupleData per page - */ -} gistxlogPageSplit; - +/* despite the name, gistxlogPage is not part of any xlog record */ typedef struct gistxlogPage { BlockNumber blkno; @@ -454,14 +409,6 @@ extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup, int len, GISTSTATE *giststate); -/* gistxlog.c */ -extern void gist_redo(XLogReaderState *record); -extern void gist_desc(StringInfo buf, XLogReaderState *record); -extern const char *gist_identify(uint8 info); -extern void gist_xlog_startup(void); -extern void gist_xlog_cleanup(void); -extern void gist_mask(char *pagedata, BlockNumber blkno); - extern XLogRecPtr gistXLogUpdate(Buffer buffer, OffsetNumber *todelete, int ntodelete, IndexTuple *itup, int ntup, diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h new file mode 100644 index 0000000000..3b126eca2a --- /dev/null +++ b/src/include/access/gistxlog.h @@ -0,0 +1,69 @@ +/*------------------------------------------------------------------------- + * + * gistxlog.h + * gist xlog routines + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/gistxlog.h + * + 
*------------------------------------------------------------------------- + */ +#ifndef GIST_XLOG_H +#define GIST_XLOG_H + +#include "access/gist.h" +#include "access/xlogreader.h" +#include "lib/stringinfo.h" + +#define XLOG_GIST_PAGE_UPDATE 0x00 + /* #define XLOG_GIST_NEW_ROOT 0x20 */ /* not used anymore */ +#define XLOG_GIST_PAGE_SPLIT 0x30 + /* #define XLOG_GIST_INSERT_COMPLETE 0x40 */ /* not used anymore */ +#define XLOG_GIST_CREATE_INDEX 0x50 + /* #define XLOG_GIST_PAGE_DELETE 0x60 */ /* not used anymore */ + +/* + * Backup Blk 0: updated page. + * Backup Blk 1: If this operation completes a page split, by inserting a + * downlink for the split page, the left half of the split + */ +typedef struct gistxlogPageUpdate +{ + /* number of deleted offsets */ + uint16 ntodelete; + uint16 ntoinsert; + + /* + * In payload of blk 0 : 1. todelete OffsetNumbers 2. tuples to insert + */ +} gistxlogPageUpdate; + +/* + * Backup Blk 0: If this operation completes a page split, by inserting a + * downlink for the split page, the left half of the split + * Backup Blk 1 - npage: split pages (1 is the original page) + */ +typedef struct gistxlogPageSplit +{ + BlockNumber origrlink; /* rightlink of the page before split */ + GistNSN orignsn; /* NSN of the page before split */ + bool origleaf; /* was the split page a leaf page? */ + + uint16 npage; /* # of pages in the split */ + bool markfollowright; /* set F_FOLLOW_RIGHT flags */ + + /* + * follow: 1.
gistxlogPage and array of IndexTupleData per page + */ +} gistxlogPageSplit; + +extern void gist_redo(XLogReaderState *record); +extern void gist_desc(StringInfo buf, XLogReaderState *record); +extern const char *gist_identify(uint8 info); +extern void gist_xlog_startup(void); +extern void gist_xlog_cleanup(void); +extern void gist_mask(char *pagedata, BlockNumber blkno); + +#endif /* GIST_XLOG_H */ diff --git a/src/include/access/hash_xlog.h b/src/include/access/hash_xlog.h index ed3c37f5db..cc231632e1 100644 --- a/src/include/access/hash_xlog.h +++ b/src/include/access/hash_xlog.h @@ -14,8 +14,8 @@ #ifndef HASH_XLOG_H #define HASH_XLOG_H -#include "access/hash.h" #include "access/xlogreader.h" +#include "lib/stringinfo.h" extern void hash_redo(XLogReaderState *record); diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 344ef9933c..25a1dc818c 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -204,232 +204,6 @@ typedef struct BTMetaPageData #define P_FIRSTKEY ((OffsetNumber) 2) #define P_FIRSTDATAKEY(opaque) (P_RIGHTMOST(opaque) ?
P_HIKEY : P_FIRSTKEY) -/* - * XLOG records for btree operations - * - * XLOG allows to store some information in high 4 bits of log - * record xl_info field - */ -#define XLOG_BTREE_INSERT_LEAF 0x00 /* add index tuple without split */ -#define XLOG_BTREE_INSERT_UPPER 0x10 /* same, on a non-leaf page */ -#define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */ -#define XLOG_BTREE_SPLIT_L 0x30 /* add index tuple with split */ -#define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */ -#define XLOG_BTREE_SPLIT_L_ROOT 0x50 /* add tuple with split of root */ -#define XLOG_BTREE_SPLIT_R_ROOT 0x60 /* as above, new item on right */ -#define XLOG_BTREE_DELETE 0x70 /* delete leaf index tuples for a page */ -#define XLOG_BTREE_UNLINK_PAGE 0x80 /* delete a half-dead page */ -#define XLOG_BTREE_UNLINK_PAGE_META 0x90 /* same, and update metapage */ -#define XLOG_BTREE_NEWROOT 0xA0 /* new root page */ -#define XLOG_BTREE_MARK_PAGE_HALFDEAD 0xB0 /* mark a leaf as half-dead */ -#define XLOG_BTREE_VACUUM 0xC0 /* delete entries on a page during - * vacuum */ -#define XLOG_BTREE_REUSE_PAGE 0xD0 /* old page is about to be reused from - * FSM */ - -/* - * All that we need to regenerate the meta-data page - */ -typedef struct xl_btree_metadata -{ - BlockNumber root; - uint32 level; - BlockNumber fastroot; - uint32 fastlevel; -} xl_btree_metadata; - -/* - * This is what we need to know about simple (without split) insert. - * - * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META. - * Note that INSERT_META implies it's not a leaf page. 
- * - * Backup Blk 0: original page (data contains the inserted tuple) - * Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META - * Backup Blk 2: xl_btree_metadata, if INSERT_META - */ -typedef struct xl_btree_insert -{ - OffsetNumber offnum; -} xl_btree_insert; - -#define SizeOfBtreeInsert (offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber)) - -/* - * On insert with split, we save all the items going into the right sibling - * so that we can restore it completely from the log record. This way takes - * less xlog space than the normal approach, because if we did it standardly, - * XLogInsert would almost always think the right page is new and store its - * whole page image. The left page, however, is handled in the normal - * incremental-update fashion. - * - * Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record. - * The _L and _R variants indicate whether the inserted tuple went into the - * left or right split page (and thus, whether newitemoff and the new item - * are stored or not). The _ROOT variants indicate that we are splitting - * the root page, and thus that a newroot record rather than an insert or - * split record should follow. Note that a split record never carries a - * metapage update --- we'll do that in the parent-level update. - * - * Backup Blk 0: original page / new left page - * - * The left page's data portion contains the new item, if it's the _L variant. - * (In the _R variants, the new item is one of the right page's tuples.) - * If level > 0, an IndexTuple representing the HIKEY of the left page - * follows. We don't need this on leaf pages, because it's the same as the - * leftmost key in the new right page. - * - * Backup Blk 1: new right page - * - * The right page's data portion contains the right page's tuples in the - * form used by _bt_restore_page. 
- * - * Backup Blk 2: next block (orig page's rightlink), if any - * Backup Blk 3: child's left sibling, if non-leaf split - */ -typedef struct xl_btree_split -{ - uint32 level; /* tree level of page being split */ - OffsetNumber firstright; /* first item moved to right page */ - OffsetNumber newitemoff; /* new item's offset (if placed on left page) */ -} xl_btree_split; - -#define SizeOfBtreeSplit (offsetof(xl_btree_split, newitemoff) + sizeof(OffsetNumber)) - -/* - * This is what we need to know about delete of individual leaf index tuples. - * The WAL record can represent deletion of any number of index tuples on a - * single index page when *not* executed by VACUUM. - * - * Backup Blk 0: index page - */ -typedef struct xl_btree_delete -{ - RelFileNode hnode; /* RelFileNode of the heap the index currently - * points at */ - int nitems; - - /* TARGET OFFSET NUMBERS FOLLOW AT THE END */ -} xl_btree_delete; - -#define SizeOfBtreeDelete (offsetof(xl_btree_delete, nitems) + sizeof(int)) - -/* - * This is what we need to know about page reuse within btree. - */ -typedef struct xl_btree_reuse_page -{ - RelFileNode node; - BlockNumber block; - TransactionId latestRemovedXid; -} xl_btree_reuse_page; - -#define SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page)) - -/* - * This is what we need to know about vacuum of individual leaf index tuples. - * The WAL record can represent deletion of any number of index tuples on a - * single index page when executed by VACUUM. - * - * For MVCC scans, lastBlockVacuumed will be set to InvalidBlockNumber. 
- * For a non-MVCC index scans there is an additional correctness requirement - * for applying these changes during recovery, which is that we must do one - * of these two things for every block in the index: - * * lock the block for cleanup and apply any required changes - * * EnsureBlockUnpinned() - * The purpose of this is to ensure that no index scans started before we - * finish scanning the index are still running by the time we begin to remove - * heap tuples. - * - * Any changes to any one block are registered on just one WAL record. All - * blocks that we need to run EnsureBlockUnpinned() are listed as a block range - * starting from the last block vacuumed through until this one. Individual - * block numbers aren't given. - * - * Note that the *last* WAL record in any vacuum of an index is allowed to - * have a zero length array of offsets. Earlier records must have at least one. - */ -typedef struct xl_btree_vacuum -{ - BlockNumber lastBlockVacuumed; - - /* TARGET OFFSET NUMBERS FOLLOW */ -} xl_btree_vacuum; - -#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber)) - -/* - * This is what we need to know about marking an empty branch for deletion. - * The target identifies the tuple removed from the parent page (note that we - * remove this tuple's downlink and the *following* tuple's key). Note that - * the leaf page is empty, so we don't need to store its content --- it is - * just reinitialized during recovery using the rest of the fields. 
- * - * Backup Blk 0: leaf block - * Backup Blk 1: top parent - */ -typedef struct xl_btree_mark_page_halfdead -{ - OffsetNumber poffset; /* deleted tuple id in parent page */ - - /* information needed to recreate the leaf page: */ - BlockNumber leafblk; /* leaf block ultimately being deleted */ - BlockNumber leftblk; /* leaf block's left sibling, if any */ - BlockNumber rightblk; /* leaf block's right sibling */ - BlockNumber topparent; /* topmost internal page in the branch */ -} xl_btree_mark_page_halfdead; - -#define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber)) - -/* - * This is what we need to know about deletion of a btree page. Note we do - * not store any content for the deleted page --- it is just rewritten as empty - * during recovery, apart from resetting the btpo.xact. - * - * Backup Blk 0: target block being deleted - * Backup Blk 1: target block's left sibling, if any - * Backup Blk 2: target block's right sibling - * Backup Blk 3: leaf block (if different from target) - * Backup Blk 4: metapage (if rightsib becomes new fast root) - */ -typedef struct xl_btree_unlink_page -{ - BlockNumber leftsib; /* target block's left sibling, if any */ - BlockNumber rightsib; /* target block's right sibling */ - - /* - * Information needed to recreate the leaf page, when target is an - * internal page. - */ - BlockNumber leafleftsib; - BlockNumber leafrightsib; - BlockNumber topparent; /* next child down in the branch */ - - TransactionId btpo_xact; /* value of btpo.xact for use in recovery */ - /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */ -} xl_btree_unlink_page; - -#define SizeOfBtreeUnlinkPage (offsetof(xl_btree_unlink_page, btpo_xact) + sizeof(TransactionId)) - -/* - * New root log record. There are zero tuples if this is to establish an - * empty root, or two if it is the result of splitting an old root. 
- * - * Note that although this implies rewriting the metadata page, we don't need - * an xl_btree_metadata record --- the rootblk and level are sufficient. - * - * Backup Blk 0: new root page (2 tuples as payload, if splitting old root) - * Backup Blk 1: left child (if splitting an old root) - * Backup Blk 2: metapage - */ -typedef struct xl_btree_newroot -{ - BlockNumber rootblk; /* location of new root (redundant with blk 0) */ - uint32 level; /* its tree level */ -} xl_btree_newroot; - -#define SizeOfBtreeNewroot (offsetof(xl_btree_newroot, level) + sizeof(uint32)) - /* * Operator strategy numbers for B-tree have been moved to access/stratnum.h, @@ -769,12 +543,4 @@ extern void _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull); extern void _bt_leafbuild(BTSpool *btspool, BTSpool *spool2); -/* - * prototypes for functions in nbtxlog.c - */ -extern void btree_redo(XLogReaderState *record); -extern void btree_desc(StringInfo buf, XLogReaderState *record); -extern const char *btree_identify(uint8 info); -extern void btree_mask(char *pagedata, BlockNumber blkno); - #endif /* NBTREE_H */ diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h new file mode 100644 index 0000000000..d6a3085923 --- /dev/null +++ b/src/include/access/nbtxlog.h @@ -0,0 +1,255 @@ +/*------------------------------------------------------------------------- + * + * nbtxlog.h + * header file for postgres btree xlog routines + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/nbtxlog.h + * + *------------------------------------------------------------------------- + */ +#ifndef NBTXLOG_H +#define NBTXLOG_H + +#include "access/xlogreader.h" +#include "lib/stringinfo.h" +#include "storage/off.h" + +/* + * XLOG records for btree operations + * + * XLOG allows storing some information in the high 4 bits of the log + * record's xl_info field
+ */ +#define XLOG_BTREE_INSERT_LEAF 0x00 /* add index tuple without split */ +#define XLOG_BTREE_INSERT_UPPER 0x10 /* same, on a non-leaf page */ +#define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */ +#define XLOG_BTREE_SPLIT_L 0x30 /* add index tuple with split */ +#define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */ +#define XLOG_BTREE_SPLIT_L_ROOT 0x50 /* add tuple with split of root */ +#define XLOG_BTREE_SPLIT_R_ROOT 0x60 /* as above, new item on right */ +#define XLOG_BTREE_DELETE 0x70 /* delete leaf index tuples for a page */ +#define XLOG_BTREE_UNLINK_PAGE 0x80 /* delete a half-dead page */ +#define XLOG_BTREE_UNLINK_PAGE_META 0x90 /* same, and update metapage */ +#define XLOG_BTREE_NEWROOT 0xA0 /* new root page */ +#define XLOG_BTREE_MARK_PAGE_HALFDEAD 0xB0 /* mark a leaf as half-dead */ +#define XLOG_BTREE_VACUUM 0xC0 /* delete entries on a page during + * vacuum */ +#define XLOG_BTREE_REUSE_PAGE 0xD0 /* old page is about to be reused from + * FSM */ + +/* + * All that we need to regenerate the meta-data page + */ +typedef struct xl_btree_metadata +{ + BlockNumber root; + uint32 level; + BlockNumber fastroot; + uint32 fastlevel; +} xl_btree_metadata; + +/* + * This is what we need to know about simple (without split) insert. + * + * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META. + * Note that INSERT_META implies it's not a leaf page. + * + * Backup Blk 0: original page (data contains the inserted tuple) + * Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META + * Backup Blk 2: xl_btree_metadata, if INSERT_META + */ +typedef struct xl_btree_insert +{ + OffsetNumber offnum; +} xl_btree_insert; + +#define SizeOfBtreeInsert (offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber)) + +/* + * On insert with split, we save all the items going into the right sibling + * so that we can restore it completely from the log record. 
This way takes + * less xlog space than the normal approach, because if we did it standardly, + * XLogInsert would almost always think the right page is new and store its + * whole page image. The left page, however, is handled in the normal + * incremental-update fashion. + * + * Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record. + * The _L and _R variants indicate whether the inserted tuple went into the + * left or right split page (and thus, whether newitemoff and the new item + * are stored or not). The _ROOT variants indicate that we are splitting + * the root page, and thus that a newroot record rather than an insert or + * split record should follow. Note that a split record never carries a + * metapage update --- we'll do that in the parent-level update. + * + * Backup Blk 0: original page / new left page + * + * The left page's data portion contains the new item, if it's the _L variant. + * (In the _R variants, the new item is one of the right page's tuples.) + * If level > 0, an IndexTuple representing the HIKEY of the left page + * follows. We don't need this on leaf pages, because it's the same as the + * leftmost key in the new right page. + * + * Backup Blk 1: new right page + * + * The right page's data portion contains the right page's tuples in the + * form used by _bt_restore_page. + * + * Backup Blk 2: next block (orig page's rightlink), if any + * Backup Blk 3: child's left sibling, if non-leaf split + */ +typedef struct xl_btree_split +{ + uint32 level; /* tree level of page being split */ + OffsetNumber firstright; /* first item moved to right page */ + OffsetNumber newitemoff; /* new item's offset (if placed on left page) */ +} xl_btree_split; + +#define SizeOfBtreeSplit (offsetof(xl_btree_split, newitemoff) + sizeof(OffsetNumber)) + +/* + * This is what we need to know about delete of individual leaf index tuples. 
+ * The WAL record can represent deletion of any number of index tuples on a + * single index page when *not* executed by VACUUM. + * + * Backup Blk 0: index page + */ +typedef struct xl_btree_delete +{ + RelFileNode hnode; /* RelFileNode of the heap the index currently + * points at */ + int nitems; + + /* TARGET OFFSET NUMBERS FOLLOW AT THE END */ +} xl_btree_delete; + +#define SizeOfBtreeDelete (offsetof(xl_btree_delete, nitems) + sizeof(int)) + +/* + * This is what we need to know about page reuse within btree. + */ +typedef struct xl_btree_reuse_page +{ + RelFileNode node; + BlockNumber block; + TransactionId latestRemovedXid; +} xl_btree_reuse_page; + +#define SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page)) + +/* + * This is what we need to know about vacuum of individual leaf index tuples. + * The WAL record can represent deletion of any number of index tuples on a + * single index page when executed by VACUUM. + * + * For MVCC scans, lastBlockVacuumed will be set to InvalidBlockNumber. + * For non-MVCC index scans there is an additional correctness requirement + * for applying these changes during recovery, which is that we must do one + * of these two things for every block in the index: + * * lock the block for cleanup and apply any required changes + * * EnsureBlockUnpinned() + * The purpose of this is to ensure that no index scans started before we + * finish scanning the index are still running by the time we begin to remove + * heap tuples. + * + * Any changes to any one block are registered on just one WAL record. All + * blocks that we need to run EnsureBlockUnpinned() are listed as a block range + * starting from the last block vacuumed through until this one. Individual + * block numbers aren't given. + * + * Note that the *last* WAL record in any vacuum of an index is allowed to + * have a zero-length array of offsets. Earlier records must have at least one.
+ */ +typedef struct xl_btree_vacuum +{ + BlockNumber lastBlockVacuumed; + + /* TARGET OFFSET NUMBERS FOLLOW */ +} xl_btree_vacuum; + +#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber)) + +/* + * This is what we need to know about marking an empty branch for deletion. + * The target identifies the tuple removed from the parent page (note that we + * remove this tuple's downlink and the *following* tuple's key). Note that + * the leaf page is empty, so we don't need to store its content --- it is + * just reinitialized during recovery using the rest of the fields. + * + * Backup Blk 0: leaf block + * Backup Blk 1: top parent + */ +typedef struct xl_btree_mark_page_halfdead +{ + OffsetNumber poffset; /* deleted tuple id in parent page */ + + /* information needed to recreate the leaf page: */ + BlockNumber leafblk; /* leaf block ultimately being deleted */ + BlockNumber leftblk; /* leaf block's left sibling, if any */ + BlockNumber rightblk; /* leaf block's right sibling */ + BlockNumber topparent; /* topmost internal page in the branch */ +} xl_btree_mark_page_halfdead; + +#define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber)) + +/* + * This is what we need to know about deletion of a btree page. Note we do + * not store any content for the deleted page --- it is just rewritten as empty + * during recovery, apart from resetting the btpo.xact. + * + * Backup Blk 0: target block being deleted + * Backup Blk 1: target block's left sibling, if any + * Backup Blk 2: target block's right sibling + * Backup Blk 3: leaf block (if different from target) + * Backup Blk 4: metapage (if rightsib becomes new fast root) + */ +typedef struct xl_btree_unlink_page +{ + BlockNumber leftsib; /* target block's left sibling, if any */ + BlockNumber rightsib; /* target block's right sibling */ + + /* + * Information needed to recreate the leaf page, when target is an + * internal page. 
+ */ + BlockNumber leafleftsib; + BlockNumber leafrightsib; + BlockNumber topparent; /* next child down in the branch */ + + TransactionId btpo_xact; /* value of btpo.xact for use in recovery */ + /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */ +} xl_btree_unlink_page; + +#define SizeOfBtreeUnlinkPage (offsetof(xl_btree_unlink_page, btpo_xact) + sizeof(TransactionId)) + +/* + * New root log record. There are zero tuples if this is to establish an + * empty root, or two if it is the result of splitting an old root. + * + * Note that although this implies rewriting the metadata page, we don't need + * an xl_btree_metadata record --- the rootblk and level are sufficient. + * + * Backup Blk 0: new root page (2 tuples as payload, if splitting old root) + * Backup Blk 1: left child (if splitting an old root) + * Backup Blk 2: metapage + */ +typedef struct xl_btree_newroot +{ + BlockNumber rootblk; /* location of new root (redundant with blk 0) */ + uint32 level; /* its tree level */ +} xl_btree_newroot; + +#define SizeOfBtreeNewroot (offsetof(xl_btree_newroot, level) + sizeof(uint32)) + + +/* + * prototypes for functions in nbtxlog.c + */ +extern void btree_redo(XLogReaderState *record); +extern void btree_desc(StringInfo buf, XLogReaderState *record); +extern const char *btree_identify(uint8 info); +extern void btree_mask(char *pagedata, BlockNumber blkno); + +#endif /* NBTXLOG_H */ diff --git a/src/include/access/spgist.h b/src/include/access/spgist.h index ee7480ad39..9dca8fde7d 100644 --- a/src/include/access/spgist.h +++ b/src/include/access/spgist.h @@ -214,12 +214,4 @@ extern IndexBulkDeleteResult *spgvacuumcleanup(IndexVacuumInfo *info, /* spgvalidate.c */ extern bool spgvalidate(Oid opclassoid); -/* spgxlog.c */ -extern void spg_redo(XLogReaderState *record); -extern void spg_desc(StringInfo buf, XLogReaderState *record); -extern const char *spg_identify(uint8 info); -extern void spg_xlog_startup(void); -extern void spg_xlog_cleanup(void); -extern
void spg_mask(char *pagedata, BlockNumber blkno); - #endif /* SPGIST_H */ diff --git a/src/include/access/spgist_private.h b/src/include/access/spgist_private.h index b2979a9d43..e42079b09f 100644 --- a/src/include/access/spgist_private.h +++ b/src/include/access/spgist_private.h @@ -354,242 +354,12 @@ typedef SpGistDeadTupleData *SpGistDeadTuple; * XLOG stuff */ -/* XLOG record types for SPGiST */ -#define XLOG_SPGIST_CREATE_INDEX 0x00 -#define XLOG_SPGIST_ADD_LEAF 0x10 -#define XLOG_SPGIST_MOVE_LEAFS 0x20 -#define XLOG_SPGIST_ADD_NODE 0x30 -#define XLOG_SPGIST_SPLIT_TUPLE 0x40 -#define XLOG_SPGIST_PICKSPLIT 0x50 -#define XLOG_SPGIST_VACUUM_LEAF 0x60 -#define XLOG_SPGIST_VACUUM_ROOT 0x70 -#define XLOG_SPGIST_VACUUM_REDIRECT 0x80 - -/* - * Some redo functions need an SpGistState, although only a few of its fields - * need to be valid. spgxlogState carries the required info in xlog records. - * (See fillFakeState in spgxlog.c for more comments.) - */ -typedef struct spgxlogState -{ - TransactionId myXid; - bool isBuild; -} spgxlogState; - #define STORE_STATE(s, d) \ do { \ (d).myXid = (s)->myXid; \ (d).isBuild = (s)->isBuild; \ } while(0) -/* - * Backup Blk 0: destination page for leaf tuple - * Backup Blk 1: parent page (if any) - */ -typedef struct spgxlogAddLeaf -{ - bool newPage; /* init dest page? */ - bool storesNulls; /* page is in the nulls tree? */ - OffsetNumber offnumLeaf; /* offset where leaf tuple gets placed */ - OffsetNumber offnumHeadLeaf; /* offset of head tuple in chain, if any */ - - OffsetNumber offnumParent; /* where the parent downlink is, if any */ - uint16 nodeI; - - /* new leaf tuple follows (unaligned!) */ -} spgxlogAddLeaf; - -/* - * Backup Blk 0: source leaf page - * Backup Blk 1: destination leaf page - * Backup Blk 2: parent page - */ -typedef struct spgxlogMoveLeafs -{ - uint16 nMoves; /* number of tuples moved from source page */ - bool newPage; /* init dest page? */ - bool replaceDead; /* are we replacing a DEAD source tuple? 
*/ - bool storesNulls; /* pages are in the nulls tree? */ - - /* where the parent downlink is */ - OffsetNumber offnumParent; - uint16 nodeI; - - spgxlogState stateSrc; - - /*---------- - * data follows: - * array of deleted tuple numbers, length nMoves - * array of inserted tuple numbers, length nMoves + 1 or 1 - * list of leaf tuples, length nMoves + 1 or 1 (unaligned!) - * - * Note: if replaceDead is true then there is only one inserted tuple - * number and only one leaf tuple in the data, because we are not copying - * the dead tuple from the source - *---------- - */ - OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; -} spgxlogMoveLeafs; - -#define SizeOfSpgxlogMoveLeafs offsetof(spgxlogMoveLeafs, offsets) - -/* - * Backup Blk 0: original page - * Backup Blk 1: where new tuple goes, if not same place - * Backup Blk 2: where parent downlink is, if updated and different from - * the old and new - */ -typedef struct spgxlogAddNode -{ - /* - * Offset of the original inner tuple, in the original page (on backup - * block 0). - */ - OffsetNumber offnum; - - /* - * Offset of the new tuple, on the new page (on backup block 1). Invalid, - * if we overwrote the old tuple in the original page). - */ - OffsetNumber offnumNew; - bool newPage; /* init new page? */ - - /*---- - * Where is the parent downlink? parentBlk indicates which page it's on, - * and offnumParent is the offset within the page. The possible values for - * parentBlk are: - * - * 0: parent == original page - * 1: parent == new page - * 2: parent == different page (blk ref 2) - * -1: parent not updated - *---- - */ - int8 parentBlk; - OffsetNumber offnumParent; /* offset within the parent page */ - - uint16 nodeI; - - spgxlogState stateSrc; - - /* - * updated inner tuple follows (unaligned!) 
- */ -} spgxlogAddNode; - -/* - * Backup Blk 0: where the prefix tuple goes - * Backup Blk 1: where the postfix tuple goes (if different page) - */ -typedef struct spgxlogSplitTuple -{ - /* where the prefix tuple goes */ - OffsetNumber offnumPrefix; - - /* where the postfix tuple goes */ - OffsetNumber offnumPostfix; - bool newPage; /* need to init that page? */ - bool postfixBlkSame; /* was postfix tuple put on same page as - * prefix? */ - - /* - * new prefix inner tuple follows, then new postfix inner tuple (both are - * unaligned!) - */ -} spgxlogSplitTuple; - -/* - * Buffer references in the rdata array are: - * Backup Blk 0: Src page (only if not root) - * Backup Blk 1: Dest page (if used) - * Backup Blk 2: Inner page - * Backup Blk 3: Parent page (if any, and different from Inner) - */ -typedef struct spgxlogPickSplit -{ - bool isRootSplit; - - uint16 nDelete; /* n to delete from Src */ - uint16 nInsert; /* n to insert on Src and/or Dest */ - bool initSrc; /* re-init the Src page? */ - bool initDest; /* re-init the Dest page? */ - - /* where to put new inner tuple */ - OffsetNumber offnumInner; - bool initInner; /* re-init the Inner page? */ - - bool storesNulls; /* pages are in the nulls tree? */ - - /* where the parent downlink is, if any */ - bool innerIsParent; /* is parent the same as inner page? */ - OffsetNumber offnumParent; - uint16 nodeI; - - spgxlogState stateSrc; - - /*---------- - * data follows: - * array of deleted tuple numbers, length nDelete - * array of inserted tuple numbers, length nInsert - * array of page selector bytes for inserted tuples, length nInsert - * new inner tuple (unaligned!) - * list of leaf tuples, length nInsert (unaligned!) 
- *---------- - */ - OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; -} spgxlogPickSplit; - -#define SizeOfSpgxlogPickSplit offsetof(spgxlogPickSplit, offsets) - -typedef struct spgxlogVacuumLeaf -{ - uint16 nDead; /* number of tuples to become DEAD */ - uint16 nPlaceholder; /* number of tuples to become PLACEHOLDER */ - uint16 nMove; /* number of tuples to move */ - uint16 nChain; /* number of tuples to re-chain */ - - spgxlogState stateSrc; - - /*---------- - * data follows: - * tuple numbers to become DEAD - * tuple numbers to become PLACEHOLDER - * tuple numbers to move from (and replace with PLACEHOLDER) - * tuple numbers to move to (replacing what is there) - * tuple numbers to update nextOffset links of - * tuple numbers to insert in nextOffset links - *---------- - */ - OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; -} spgxlogVacuumLeaf; - -#define SizeOfSpgxlogVacuumLeaf offsetof(spgxlogVacuumLeaf, offsets) - -typedef struct spgxlogVacuumRoot -{ - /* vacuum a root page when it is also a leaf */ - uint16 nDelete; /* number of tuples to delete */ - - spgxlogState stateSrc; - - /* offsets of tuples to delete follow */ - OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; -} spgxlogVacuumRoot; - -#define SizeOfSpgxlogVacuumRoot offsetof(spgxlogVacuumRoot, offsets) - -typedef struct spgxlogVacuumRedirect -{ - uint16 nToPlaceholder; /* number of redirects to make placeholders */ - OffsetNumber firstPlaceholder; /* first placeholder tuple to remove */ - TransactionId newestRedirectXid; /* newest XID of removed redirects */ - - /* offsets of redirect tuples to make placeholders follow */ - OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; -} spgxlogVacuumRedirect; - -#define SizeOfSpgxlogVacuumRedirect offsetof(spgxlogVacuumRedirect, offsets) - /* * The "flags" argument for SpGistGetBuffer should be either GBUF_LEAF to * get a leaf page, or GBUF_INNER_PARITY(blockNumber) to get an inner diff --git a/src/include/access/spgxlog.h b/src/include/access/spgxlog.h new file mode 100644 
index 0000000000..ff597f75db --- /dev/null +++ b/src/include/access/spgxlog.h @@ -0,0 +1,257 @@ +/*------------------------------------------------------------------------- + * + * spgxlog.h + * xlog declarations for SP-GiST access method. + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/spgxlog.h + * + *------------------------------------------------------------------------- + */ +#ifndef SPGXLOG_H +#define SPGXLOG_H + +#include "access/xlogreader.h" +#include "lib/stringinfo.h" +#include "storage/off.h" + +/* XLOG record types for SPGiST */ +#define XLOG_SPGIST_CREATE_INDEX 0x00 +#define XLOG_SPGIST_ADD_LEAF 0x10 +#define XLOG_SPGIST_MOVE_LEAFS 0x20 +#define XLOG_SPGIST_ADD_NODE 0x30 +#define XLOG_SPGIST_SPLIT_TUPLE 0x40 +#define XLOG_SPGIST_PICKSPLIT 0x50 +#define XLOG_SPGIST_VACUUM_LEAF 0x60 +#define XLOG_SPGIST_VACUUM_ROOT 0x70 +#define XLOG_SPGIST_VACUUM_REDIRECT 0x80 + +/* + * Some redo functions need an SpGistState, although only a few of its fields + * need to be valid. spgxlogState carries the required info in xlog records. + * (See fillFakeState in spgxlog.c for more comments.) + */ +typedef struct spgxlogState +{ + TransactionId myXid; + bool isBuild; +} spgxlogState; + +/* + * Backup Blk 0: destination page for leaf tuple + * Backup Blk 1: parent page (if any) + */ +typedef struct spgxlogAddLeaf +{ + bool newPage; /* init dest page? */ + bool storesNulls; /* page is in the nulls tree? */ + OffsetNumber offnumLeaf; /* offset where leaf tuple gets placed */ + OffsetNumber offnumHeadLeaf; /* offset of head tuple in chain, if any */ + + OffsetNumber offnumParent; /* where the parent downlink is, if any */ + uint16 nodeI; + + /* new leaf tuple follows (unaligned!) 
*/ +} spgxlogAddLeaf; + +/* + * Backup Blk 0: source leaf page + * Backup Blk 1: destination leaf page + * Backup Blk 2: parent page + */ +typedef struct spgxlogMoveLeafs +{ + uint16 nMoves; /* number of tuples moved from source page */ + bool newPage; /* init dest page? */ + bool replaceDead; /* are we replacing a DEAD source tuple? */ + bool storesNulls; /* pages are in the nulls tree? */ + + /* where the parent downlink is */ + OffsetNumber offnumParent; + uint16 nodeI; + + spgxlogState stateSrc; + + /*---------- + * data follows: + * array of deleted tuple numbers, length nMoves + * array of inserted tuple numbers, length nMoves + 1 or 1 + * list of leaf tuples, length nMoves + 1 or 1 (unaligned!) + * + * Note: if replaceDead is true then there is only one inserted tuple + * number and only one leaf tuple in the data, because we are not copying + * the dead tuple from the source + *---------- + */ + OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; +} spgxlogMoveLeafs; + +#define SizeOfSpgxlogMoveLeafs offsetof(spgxlogMoveLeafs, offsets) + +/* + * Backup Blk 0: original page + * Backup Blk 1: where new tuple goes, if not same place + * Backup Blk 2: where parent downlink is, if updated and different from + * the old and new + */ +typedef struct spgxlogAddNode +{ + /* + * Offset of the original inner tuple, in the original page (on backup + * block 0). + */ + OffsetNumber offnum; + + /* + * Offset of the new tuple, on the new page (on backup block 1). Invalid + * if we overwrote the old tuple in the original page. + */ + OffsetNumber offnumNew; + bool newPage; /* init new page? */ + + /*---- + * Where is the parent downlink? parentBlk indicates which page it's on, + * and offnumParent is the offset within the page.
The possible values for + * parentBlk are: + * + * 0: parent == original page + * 1: parent == new page + * 2: parent == different page (blk ref 2) + * -1: parent not updated + *---- + */ + int8 parentBlk; + OffsetNumber offnumParent; /* offset within the parent page */ + + uint16 nodeI; + + spgxlogState stateSrc; + + /* + * updated inner tuple follows (unaligned!) + */ +} spgxlogAddNode; + +/* + * Backup Blk 0: where the prefix tuple goes + * Backup Blk 1: where the postfix tuple goes (if different page) + */ +typedef struct spgxlogSplitTuple +{ + /* where the prefix tuple goes */ + OffsetNumber offnumPrefix; + + /* where the postfix tuple goes */ + OffsetNumber offnumPostfix; + bool newPage; /* need to init that page? */ + bool postfixBlkSame; /* was postfix tuple put on same page as + * prefix? */ + + /* + * new prefix inner tuple follows, then new postfix inner tuple (both are + * unaligned!) + */ +} spgxlogSplitTuple; + +/* + * Buffer references in the rdata array are: + * Backup Blk 0: Src page (only if not root) + * Backup Blk 1: Dest page (if used) + * Backup Blk 2: Inner page + * Backup Blk 3: Parent page (if any, and different from Inner) + */ +typedef struct spgxlogPickSplit +{ + bool isRootSplit; + + uint16 nDelete; /* n to delete from Src */ + uint16 nInsert; /* n to insert on Src and/or Dest */ + bool initSrc; /* re-init the Src page? */ + bool initDest; /* re-init the Dest page? */ + + /* where to put new inner tuple */ + OffsetNumber offnumInner; + bool initInner; /* re-init the Inner page? */ + + bool storesNulls; /* pages are in the nulls tree? */ + + /* where the parent downlink is, if any */ + bool innerIsParent; /* is parent the same as inner page? 
*/ + OffsetNumber offnumParent; + uint16 nodeI; + + spgxlogState stateSrc; + + /*---------- + * data follows: + * array of deleted tuple numbers, length nDelete + * array of inserted tuple numbers, length nInsert + * array of page selector bytes for inserted tuples, length nInsert + * new inner tuple (unaligned!) + * list of leaf tuples, length nInsert (unaligned!) + *---------- + */ + OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; +} spgxlogPickSplit; + +#define SizeOfSpgxlogPickSplit offsetof(spgxlogPickSplit, offsets) + +typedef struct spgxlogVacuumLeaf +{ + uint16 nDead; /* number of tuples to become DEAD */ + uint16 nPlaceholder; /* number of tuples to become PLACEHOLDER */ + uint16 nMove; /* number of tuples to move */ + uint16 nChain; /* number of tuples to re-chain */ + + spgxlogState stateSrc; + + /*---------- + * data follows: + * tuple numbers to become DEAD + * tuple numbers to become PLACEHOLDER + * tuple numbers to move from (and replace with PLACEHOLDER) + * tuple numbers to move to (replacing what is there) + * tuple numbers to update nextOffset links of + * tuple numbers to insert in nextOffset links + *---------- + */ + OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; +} spgxlogVacuumLeaf; + +#define SizeOfSpgxlogVacuumLeaf offsetof(spgxlogVacuumLeaf, offsets) + +typedef struct spgxlogVacuumRoot +{ + /* vacuum a root page when it is also a leaf */ + uint16 nDelete; /* number of tuples to delete */ + + spgxlogState stateSrc; + + /* offsets of tuples to delete follow */ + OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; +} spgxlogVacuumRoot; + +#define SizeOfSpgxlogVacuumRoot offsetof(spgxlogVacuumRoot, offsets) + +typedef struct spgxlogVacuumRedirect +{ + uint16 nToPlaceholder; /* number of redirects to make placeholders */ + OffsetNumber firstPlaceholder; /* first placeholder tuple to remove */ + TransactionId newestRedirectXid; /* newest XID of removed redirects */ + + /* offsets of redirect tuples to make placeholders follow */ + OffsetNumber 
offsets[FLEXIBLE_ARRAY_MEMBER]; +} spgxlogVacuumRedirect; + +#define SizeOfSpgxlogVacuumRedirect offsetof(spgxlogVacuumRedirect, offsets) + +extern void spg_redo(XLogReaderState *record); +extern void spg_desc(StringInfo buf, XLogReaderState *record); +extern const char *spg_identify(uint8 info); +extern void spg_xlog_startup(void); +extern void spg_xlog_cleanup(void); +extern void spg_mask(char *pagedata, BlockNumber blkno); + +#endif /* SPGXLOG_H */ -- 2.40.0