1 /*-------------------------------------------------------------------------
4 * private declarations for GiST -- declarations related to the
5 * internal implementation of GiST, not the public API
7 * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
10 * src/include/access/gist_private.h
12 *-------------------------------------------------------------------------
14 #ifndef GIST_PRIVATE_H
15 #define GIST_PRIVATE_H
17 #include "access/gist.h"
18 #include "access/itup.h"
19 #include "access/xlogreader.h"
21 #include "lib/pairingheap.h"
22 #include "storage/bufmgr.h"
23 #include "storage/buffile.h"
24 #include "utils/hsearch.h"
27 * Maximum number of "halves" a page can be split into in one operation.
28 * Typically a split produces 2 halves, but can be more if keys have very
29 * different lengths, or when inserting multiple keys in one operation (as
30 * when inserting downlinks to an internal node). There is no theoretical
 * limit on this, but in practice if you get more than a handful of page halves
32 * in one split, there's something wrong with the opclass implementation.
33 * GIST_MAX_SPLIT_PAGES is an arbitrary limit on that, used to size some
34 * local arrays used during split. Note that there is also a limit on the
35 * number of buffers that can be held locked at a time, MAX_SIMUL_LWLOCKS,
 * so if you raise this higher than that limit, you'll just get a different
 * error.
39 #define GIST_MAX_SPLIT_PAGES 75
41 /* Buffer lock modes */
42 #define GIST_SHARE BUFFER_LOCK_SHARE
43 #define GIST_EXCLUSIVE BUFFER_LOCK_EXCLUSIVE
44 #define GIST_UNLOCK BUFFER_LOCK_UNLOCK
50 char tupledata[FLEXIBLE_ARRAY_MEMBER];
/*
 * Helpers for inspecting a GISTNodeBufferPage.  The page-pointer argument is
 * parenthesized in every expansion, so callers may pass any pointer-valued
 * expression (for example "&page" or "base + i"), not only a plain
 * identifier.
 */

/* Offset of the tuple data area from the start of a node buffer page */
#define BUFFER_PAGE_DATA_OFFSET MAXALIGN(offsetof(GISTNodeBufferPage, tupledata))
/* Returns free space in node buffer page */
#define PAGE_FREE_SPACE(nbp) ((nbp)->freespace)
/* Checks if node buffer page is empty (all space past the header is free) */
#define PAGE_IS_EMPTY(nbp) \
	(PAGE_FREE_SPACE(nbp) == BLCKSZ - BUFFER_PAGE_DATA_OFFSET)
/* Checks if node buffer page doesn't have sufficient space for index tuple */
#define PAGE_NO_SPACE(nbp, itup) \
	(PAGE_FREE_SPACE(nbp) < MAXALIGN(IndexTupleSize(itup)))
63 * GISTSTATE: information needed for any GiST index operation
65 * This struct retains call info for the index's opclass-specific support
66 * functions (per index column), plus the index's tuple descriptor.
68 * scanCxt holds the GISTSTATE itself as well as any data that lives for the
69 * lifetime of the index operation. We pass this to the support functions
70 * via fn_mcxt, so that they can store scan-lifespan data in it. The
71 * functions are invoked in tempCxt, which is typically short-lifespan
72 * (that is, it's reset after each tuple). However, tempCxt can be the same
73 * as scanCxt if we're not bothering with per-tuple context resets.
75 typedef struct GISTSTATE
77 MemoryContext scanCxt; /* context for scan-lifespan data */
78 MemoryContext tempCxt; /* short-term context for calling functions */
80 TupleDesc tupdesc; /* index's tuple descriptor */
81 TupleDesc fetchTupdesc; /* tuple descriptor for tuples returned in an
84 FmgrInfo consistentFn[INDEX_MAX_KEYS];
85 FmgrInfo unionFn[INDEX_MAX_KEYS];
86 FmgrInfo compressFn[INDEX_MAX_KEYS];
87 FmgrInfo decompressFn[INDEX_MAX_KEYS];
88 FmgrInfo penaltyFn[INDEX_MAX_KEYS];
89 FmgrInfo picksplitFn[INDEX_MAX_KEYS];
90 FmgrInfo equalFn[INDEX_MAX_KEYS];
91 FmgrInfo distanceFn[INDEX_MAX_KEYS];
92 FmgrInfo fetchFn[INDEX_MAX_KEYS];
94 /* Collations to pass to the support functions */
95 Oid supportCollation[INDEX_MAX_KEYS];
100 * During a GiST index search, we must maintain a queue of unvisited items,
101 * which can be either individual heap tuples or whole index pages. If it
102 * is an ordered search, the unvisited items should be visited in distance
103 * order. Unvisited items at the same distance should be visited in
104 * depth-first order, that is heap items first, then lower index pages, then
105 * upper index pages; this rule avoids doing extra work during a search that
106 * ends early due to LIMIT.
 * To perform an ordered search, we use a pairing heap to manage the
 * distance-order queue.  Each unvisited item is a GISTSearchItem, linked
 * into the heap through its phNode field.
 *
 * In a non-ordered search (no order-by operators), the queue holds unvisited
 * index pages only.  In this case matched heap items from the current index
 * leaf page are remembered in GISTScanOpaqueData.pageData[] and returned
 * directly from there, instead of building a separate GISTSearchItem for
 * each one.
119 /* Individual heap tuple to be visited */
120 typedef struct GISTSearchHeapItem
122 ItemPointerData heapPtr;
123 bool recheck; /* T if quals must be rechecked */
124 bool recheckDistances; /* T if distances must be rechecked */
125 IndexTuple ftup; /* data fetched back from the index, used in
126 * index-only scans */
127 } GISTSearchHeapItem;
129 /* Unvisited item, either index page or heap tuple */
130 typedef struct GISTSearchItem
132 pairingheap_node phNode;
133 BlockNumber blkno; /* index page number, or InvalidBlockNumber */
136 GistNSN parentlsn; /* parent page's LSN, if index page */
137 /* we must store parentlsn to detect whether a split occurred */
138 GISTSearchHeapItem heap; /* heap info, if heap tuple */
140 double distances[FLEXIBLE_ARRAY_MEMBER]; /* numberOfOrderBys
/* True if the search item refers to a heap tuple rather than an index page */
#define GISTSearchItemIsHeap(item) \
	(InvalidBlockNumber == (item).blkno)

/* Bytes needed for a GISTSearchItem that carries n_distances distances */
#define SizeOfGISTSearchItem(n_distances) \
	(sizeof(double) * (n_distances) + offsetof(GISTSearchItem, distances))
149 * GISTScanOpaqueData: private state for a scan of a GiST index
151 typedef struct GISTScanOpaqueData
153 GISTSTATE *giststate; /* index information, see above */
154 Oid *orderByTypes; /* datatypes of ORDER BY expressions */
156 pairingheap *queue; /* queue of unvisited items */
157 MemoryContext queueCxt; /* context holding the queue */
158 bool qual_ok; /* false if qual can never be satisfied */
159 bool firstCall; /* true until first gistgettuple call */
161 /* pre-allocated workspace arrays */
162 double *distances; /* output area for gistindex_keytest */
164 /* In a non-ordered search, returnable heap items are stored here: */
165 GISTSearchHeapItem pageData[BLCKSZ / sizeof(IndexTupleData)];
166 OffsetNumber nPageData; /* number of valid items in array */
167 OffsetNumber curPageData; /* next item to return */
168 MemoryContext pageDataCxt; /* context holding the fetched tuples, for
169 * index-only scans */
170 } GISTScanOpaqueData;
172 typedef GISTScanOpaqueData *GISTScanOpaque;
177 #define XLOG_GIST_PAGE_UPDATE 0x00
178 /* #define XLOG_GIST_NEW_ROOT 0x20 */ /* not used anymore */
179 #define XLOG_GIST_PAGE_SPLIT 0x30
180 /* #define XLOG_GIST_INSERT_COMPLETE 0x40 */ /* not used anymore */
181 #define XLOG_GIST_CREATE_INDEX 0x50
182 /* #define XLOG_GIST_PAGE_DELETE 0x60 */ /* not used anymore */
185 * Backup Blk 0: updated page.
186 * Backup Blk 1: If this operation completes a page split, by inserting a
187 * downlink for the split page, the left half of the split
189 typedef struct gistxlogPageUpdate
191 /* number of deleted offsets */
196 * In payload of blk 0 : 1. todelete OffsetNumbers 2. tuples to insert
198 } gistxlogPageUpdate;
201 * Backup Blk 0: If this operation completes a page split, by inserting a
202 * downlink for the split page, the left half of the split
203 * Backup Blk 1 - npage: split pages (1 is the original page)
205 typedef struct gistxlogPageSplit
207 BlockNumber origrlink; /* rightlink of the page before split */
208 GistNSN orignsn; /* NSN of the page before split */
209 bool origleaf; /* was splitted page a leaf page? */
211 uint16 npage; /* # of pages in the split */
212 bool markfollowright; /* set F_FOLLOW_RIGHT flags */
215 * follow: 1. gistxlogPage and array of IndexTupleData per page
219 typedef struct gistxlogPage
222 int num; /* number of index tuples following */
225 /* SplitedPageLayout - gistSplit function result */
226 typedef struct SplitedPageLayout
229 IndexTupleData *list;
231 IndexTuple itup; /* union key for page */
232 Page page; /* to operate */
233 Buffer buffer; /* to write after all proceed */
235 struct SplitedPageLayout *next;
 * GISTInsertStack is used for locking buffers and transferring arguments
 * during insertion.
242 typedef struct GISTInsertStack
250 * log sequence number from page->lsn to recognize page update and compare
251 * it with page's nsn to recognize page split
255 /* offset of the downlink in the parent page, that points to this page */
256 OffsetNumber downlinkoffnum;
258 /* pointer to parent */
259 struct GISTInsertStack *parent;
262 /* Working state and results for multi-column split logic in gistsplit.c */
263 typedef struct GistSplitVector
265 GIST_SPLITVEC splitVector; /* passed to/from user PickSplit method */
267 Datum spl_lattr[INDEX_MAX_KEYS]; /* Union of subkeys in
268 * splitVector.spl_left */
269 bool spl_lisnull[INDEX_MAX_KEYS];
271 Datum spl_rattr[INDEX_MAX_KEYS]; /* Union of subkeys in
272 * splitVector.spl_right */
273 bool spl_risnull[INDEX_MAX_KEYS];
275 bool *spl_dontcare; /* flags tuples which could go to either side
276 * of the split for zero penalty */
282 Size freespace; /* free space to be left */
284 GISTInsertStack *stack;
287 /* root page of a gist index */
288 #define GIST_ROOT_BLKNO 0
 * Before PostgreSQL 9.1, we used to rely on so-called "invalid tuples" on inner
 * pages to finish crash recovery of incomplete page splits. If a crash
 * happened in the middle of a page split, so that the downlink pointers were
 * not yet inserted, crash recovery inserted a special downlink pointer. The
 * semantics of an invalid tuple was that if you encounter one in a scan,
 * it must always be followed, because we don't know if the tuples on the
 * child page match or not.
299 * We no longer create such invalid tuples, we now mark the left-half of such
300 * an incomplete split with the F_FOLLOW_RIGHT flag instead, and finish the
301 * split properly the next time we need to insert on that page. To retain
302 * on-disk compatibility for the sake of pg_upgrade, we still store 0xffff as
303 * the offset number of all inner tuples. If we encounter any invalid tuples
304 * with 0xfffe during insertion, we throw an error, though scans still handle
305 * them. You should only encounter invalid tuples if you pg_upgrade a pre-9.1
306 * gist index which already has invalid tuples in it because of a crash. That
307 * should be rare, and you are recommended to REINDEX anyway if you have any
 * invalid tuples in an index, so throwing an error is as far as we go with
 * supporting that.
/* Offset-number markers kept in the t_tid of inner tuples */
#define TUPLE_IS_VALID		0xffff
#define TUPLE_IS_INVALID	0xfffe

/* Does the tuple's item pointer offset carry the "invalid" marker? */
#define GistTupleIsInvalid(itup) \
	( TUPLE_IS_INVALID == ItemPointerGetOffsetNumber( &((itup)->t_tid) ) )
/* Stamp the tuple's item pointer offset with the "valid" marker */
#define GistTupleSetValid(itup) \
	ItemPointerSetOffsetNumber( &((itup)->t_tid), TUPLE_IS_VALID )
 * A buffer attached to an internal node, used when building an index in
 * buffering mode.
326 BlockNumber nodeBlocknum; /* index block # this buffer is for */
327 int32 blocksCount; /* current # of blocks occupied by buffer */
329 BlockNumber pageBlocknum; /* temporary file block # */
330 GISTNodeBufferPage *pageBuffer; /* in-memory buffer page */
332 /* is this buffer queued for emptying? */
333 bool queuedForEmptying;
335 /* is this a temporary copy, not in the hash table? */
338 int level; /* 0 == leaf */
 * Does specified level have buffers? (Beware of multiple evaluation of
 * arguments.)
/*
 * A level has buffers unless it is level 0, is not a multiple of levelStep,
 * or is the root level.  nlevel may be evaluated up to three times and gfbb
 * up to twice.
 */
#define LEVEL_HAS_BUFFERS(nlevel, gfbb) \
	(!((nlevel) == 0 || \
	   (nlevel) % (gfbb)->levelStep != 0 || \
	   (nlevel) == (gfbb)->rootlevel))

/* Is specified buffer more than half-filled (should be queued for emptying)? */
#define BUFFER_HALF_FILLED(nodeBuffer, gfbb) \
	((gfbb)->pagesPerBuffer / 2 < (nodeBuffer)->blocksCount)
354 * Is specified buffer full? Our buffers can actually grow indefinitely,
355 * beyond the "maximum" size, so this just means whether the buffer has grown
356 * beyond the nominal maximum size.
/* True once blocksCount exceeds the nominal pagesPerBuffer limit */
#define BUFFER_OVERFLOWED(nodeBuffer, gfbb) \
	((gfbb)->pagesPerBuffer < (nodeBuffer)->blocksCount)
362 * Data structure with general information about build buffers.
364 typedef struct GISTBuildBuffers
366 /* Persistent memory context for the buffers and metadata. */
367 MemoryContext context;
369 BufFile *pfile; /* Temporary file to store buffers in */
370 long nFileBlocks; /* Current size of the temporary file */
373 * resizable array of free blocks.
376 int nFreeBlocks; /* # of currently free blocks in the array */
377 int freeBlocksLen; /* current allocated length of the array */
379 /* Hash for buffers by block number */
380 HTAB *nodeBuffersTab;
382 /* List of buffers scheduled for emptying */
383 List *bufferEmptyingQueue;
386 * Parameters to the buffering build algorithm. levelStep determines which
387 * levels in the tree have buffers, and pagesPerBuffer determines how
388 * large each buffer is.
393 /* Array of lists of buffers on each level, for final emptying */
394 List **buffersOnLevels;
395 int buffersOnLevelsLen;
398 * Dynamically-sized array of buffers that currently have their last page
399 * loaded in main memory.
401 GISTNodeBuffer **loadedBuffers;
402 int loadedBuffersCount; /* # of entries in loadedBuffers */
403 int loadedBuffersLen; /* allocated size of loadedBuffers */
405 /* Level of the current root node (= height of the index tree - 1) */
410 * Storage type for GiST's reloptions
412 typedef struct GiSTOptions
414 int32 vl_len_; /* varlena header (do not touch directly!) */
415 int fillfactor; /* page fill factor in percent (0..100) */
416 int bufferingModeOffset; /* use buffering build? */
420 extern Datum gistbuildempty(PG_FUNCTION_ARGS);
421 extern Datum gistinsert(PG_FUNCTION_ARGS);
422 extern Datum gistcanreturn(PG_FUNCTION_ARGS);
423 extern MemoryContext createTempGistContext(void);
424 extern GISTSTATE *initGISTstate(Relation index);
425 extern void freeGISTstate(GISTSTATE *giststate);
426 extern void gistdoinsert(Relation r,
429 GISTSTATE *GISTstate);
431 /* A List of these is returned from gistplacetopage() in *splitinfo */
434 Buffer buf; /* the split page "half" */
435 IndexTuple downlink; /* downlink for this half. */
438 extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
440 IndexTuple *itup, int ntup,
441 OffsetNumber oldoffnum, BlockNumber *newblkno,
446 extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
447 int len, GISTSTATE *giststate);
450 extern void gist_redo(XLogReaderState *record);
451 extern void gist_desc(StringInfo buf, XLogReaderState *record);
452 extern const char *gist_identify(uint8 info);
453 extern void gist_xlog_startup(void);
454 extern void gist_xlog_cleanup(void);
456 extern XLogRecPtr gistXLogUpdate(RelFileNode node, Buffer buffer,
457 OffsetNumber *todelete, int ntodelete,
458 IndexTuple *itup, int ntup,
461 extern XLogRecPtr gistXLogSplit(RelFileNode node,
462 BlockNumber blkno, bool page_is_leaf,
463 SplitedPageLayout *dist,
464 BlockNumber origrlink, GistNSN oldnsn,
465 Buffer leftchild, bool markfollowright);
468 extern Datum gistgettuple(PG_FUNCTION_ARGS);
469 extern Datum gistgetbitmap(PG_FUNCTION_ARGS);
/* Block size minus the page header and the MAXALIGN'd GiST opaque area */
#define GiSTPageSize \
	( BLCKSZ - (SizeOfPageHeaderData + MAXALIGN(sizeof(GISTPageOpaqueData))) )
476 #define GIST_MIN_FILLFACTOR 10
477 #define GIST_DEFAULT_FILLFACTOR 90
479 extern Datum gistoptions(PG_FUNCTION_ARGS);
480 extern bool gistfitpage(IndexTuple *itvec, int len);
481 extern bool gistnospace(Page page, IndexTuple *itvec, int len, OffsetNumber todelete, Size freespace);
482 extern void gistcheckpage(Relation rel, Buffer buf);
483 extern Buffer gistNewBuffer(Relation r);
484 extern void gistfillbuffer(Page page, IndexTuple *itup, int len,
486 extern IndexTuple *gistextractpage(Page page, int *len /* out */ );
487 extern IndexTuple *gistjoinvector(
488 IndexTuple *itvec, int *len,
489 IndexTuple *additvec, int addlen);
490 extern IndexTupleData *gistfillitupvec(IndexTuple *vec, int veclen, int *memlen);
492 extern IndexTuple gistunion(Relation r, IndexTuple *itvec,
493 int len, GISTSTATE *giststate);
494 extern IndexTuple gistgetadjusted(Relation r,
497 GISTSTATE *giststate);
498 extern IndexTuple gistFormTuple(GISTSTATE *giststate,
499 Relation r, Datum *attdata, bool *isnull, bool isleaf);
501 extern OffsetNumber gistchoose(Relation r, Page p,
503 GISTSTATE *giststate);
505 extern void GISTInitBuffer(Buffer b, uint32 f);
506 extern void gistdentryinit(GISTSTATE *giststate, int nkey, GISTENTRY *e,
507 Datum k, Relation r, Page pg, OffsetNumber o,
508 bool l, bool isNull);
510 extern float gistpenalty(GISTSTATE *giststate, int attno,
511 GISTENTRY *key1, bool isNull1,
512 GISTENTRY *key2, bool isNull2);
513 extern void gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len,
514 Datum *attr, bool *isnull);
515 extern bool gistKeyIsEQ(GISTSTATE *giststate, int attno, Datum a, Datum b);
516 extern void gistDeCompressAtt(GISTSTATE *giststate, Relation r, IndexTuple tuple, Page p,
517 OffsetNumber o, GISTENTRY *attdata, bool *isnull);
518 extern IndexTuple gistFetchTuple(GISTSTATE *giststate, Relation r,
520 extern void gistMakeUnionKey(GISTSTATE *giststate, int attno,
521 GISTENTRY *entry1, bool isnull1,
522 GISTENTRY *entry2, bool isnull2,
523 Datum *dst, bool *dstisnull);
525 extern XLogRecPtr gistGetFakeLSN(Relation rel);
528 extern Datum gistbulkdelete(PG_FUNCTION_ARGS);
529 extern Datum gistvacuumcleanup(PG_FUNCTION_ARGS);
532 extern void gistSplitByKey(Relation r, Page page, IndexTuple *itup,
533 int len, GISTSTATE *giststate,
538 extern Datum gistbuild(PG_FUNCTION_ARGS);
539 extern void gistValidateBufferingOption(char *value);
541 /* gistbuildbuffers.c */
542 extern GISTBuildBuffers *gistInitBuildBuffers(int pagesPerBuffer, int levelStep,
544 extern GISTNodeBuffer *gistGetNodeBuffer(GISTBuildBuffers *gfbb,
545 GISTSTATE *giststate,
546 BlockNumber blkno, int level);
547 extern void gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb,
548 GISTNodeBuffer *nodeBuffer, IndexTuple item);
549 extern bool gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb,
550 GISTNodeBuffer *nodeBuffer, IndexTuple *item);
551 extern void gistFreeBuildBuffers(GISTBuildBuffers *gfbb);
552 extern void gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb,
553 GISTSTATE *giststate, Relation r,
554 int level, Buffer buffer,
556 extern void gistUnloadNodeBuffers(GISTBuildBuffers *gfbb);
558 #endif /* GIST_PRIVATE_H */