3 * Implementation of BRIN indexes for Postgres
5 * See src/backend/access/brin/README for details.
7 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
11 * src/backend/access/brin/brin.c
14 * * ScalarArrayOpExpr (amsearcharray -> SK_SEARCHARRAY)
18 #include "access/brin.h"
19 #include "access/brin_page.h"
20 #include "access/brin_pageops.h"
21 #include "access/brin_xlog.h"
22 #include "access/reloptions.h"
23 #include "access/relscan.h"
24 #include "access/xloginsert.h"
25 #include "catalog/index.h"
26 #include "catalog/pg_am.h"
27 #include "miscadmin.h"
29 #include "postmaster/autovacuum.h"
30 #include "storage/bufmgr.h"
31 #include "storage/freespace.h"
32 #include "utils/builtins.h"
33 #include "utils/index_selfuncs.h"
34 #include "utils/memutils.h"
35 #include "utils/rel.h"
39 * We use a BrinBuildState during initial construction of a BRIN index.
40 * The running state is kept in a BrinMemTuple.
/* NOTE(review): this extract appears elided — struct braces and some fields
 * are not visible here; do not treat the member list as complete. */
42 typedef struct BrinBuildState
/* Buffer most recently used for inserting index tuples; released (and its
 * free space recorded) by terminate_brin_buildstate(). */
46 Buffer bs_currentInsertBuf;
/* Number of heap pages covered by each summary tuple. */
47 BlockNumber bs_pagesPerRange;
/* First heap block of the range currently being accumulated. */
48 BlockNumber bs_currRangeStart;
/* Revmap access object used to place tuples; see brinRevmapInitialize(). */
49 BrinRevmap *bs_rmAccess;
/* Deformed in-memory tuple holding the running per-range summary. */
51 BrinMemTuple *bs_dtuple;
55 * Struct used as "opaque" during index scans
57 typedef struct BrinOpaque
59 BlockNumber bo_pagesPerRange;
60 BrinRevmap *bo_rmAccess;
/* Sentinel block number meaning "operate on all block ranges". */
64 #define BRIN_ALL_BLOCKRANGES InvalidBlockNumber
66 static BrinBuildState *initialize_brin_buildstate(Relation idxRel,
67 BrinRevmap *revmap, BlockNumber pagesPerRange);
68 static void terminate_brin_buildstate(BrinBuildState *state);
69 static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
70 bool include_partial, double *numSummarized, double *numExisting);
71 static void form_and_insert_tuple(BrinBuildState *state);
72 static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a,
74 static void brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy);
78 * BRIN handler function: return IndexAmRoutine with access method parameters
/*
 * BRIN handler: build and return the IndexAmRoutine node describing the
 * capabilities and entry points of the BRIN access method.
 * NOTE(review): the return-type line and braces are missing from this
 * extract; the visible field assignments are the full AM description.
 */
82 brinhandler(PG_FUNCTION_ARGS)
84 IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
/* Capability flags: BRIN supports multi-column keys, NULL searches and
 * opclass-defined storage, but no ordering, uniqueness or backward scan. */
86 amroutine->amstrategies = 0;
87 amroutine->amsupport = BRIN_LAST_OPTIONAL_PROCNUM;
88 amroutine->amcanorder = false;
89 amroutine->amcanorderbyop = false;
90 amroutine->amcanbackward = false;
91 amroutine->amcanunique = false;
92 amroutine->amcanmulticol = true;
93 amroutine->amoptionalkey = true;
94 amroutine->amsearcharray = false;
95 amroutine->amsearchnulls = true;
96 amroutine->amstorage = true;
97 amroutine->amclusterable = false;
98 amroutine->ampredlocks = false;
99 amroutine->amcanparallel = false;
100 amroutine->amkeytype = InvalidOid;
/* AM entry points; NULL entries are operations BRIN does not support
 * (e.g. amgettuple: BRIN is bitmap-scan only, see amgetbitmap). */
102 amroutine->ambuild = brinbuild;
103 amroutine->ambuildempty = brinbuildempty;
104 amroutine->aminsert = brininsert;
105 amroutine->ambulkdelete = brinbulkdelete;
106 amroutine->amvacuumcleanup = brinvacuumcleanup;
107 amroutine->amcanreturn = NULL;
108 amroutine->amcostestimate = brincostestimate;
109 amroutine->amoptions = brinoptions;
110 amroutine->amproperty = NULL;
111 amroutine->amvalidate = brinvalidate;
112 amroutine->ambeginscan = brinbeginscan;
113 amroutine->amrescan = brinrescan;
114 amroutine->amgettuple = NULL;
115 amroutine->amgetbitmap = bringetbitmap;
116 amroutine->amendscan = brinendscan;
117 amroutine->ammarkpos = NULL;
118 amroutine->amrestrpos = NULL;
119 amroutine->amestimateparallelscan = NULL;
120 amroutine->aminitparallelscan = NULL;
121 amroutine->amparallelrescan = NULL;
123 PG_RETURN_POINTER(amroutine);
127 * A tuple in the heap is being inserted. To keep a brin index up to date,
128 * we need to obtain the relevant index tuple and compare its stored values
129 * with those of the new tuple. If the tuple values are not consistent with
130 * the summary tuple, we need to update the index tuple.
132 * If autosummarization is enabled, check if we need to summarize the previous
135 * If the range is not currently summarized (i.e. the revmap returns NULL for
136 * it), there's nothing to do for this tuple.
/*
 * brininsert: aminsert callback.  Keeps the BRIN summary for the range
 * containing the newly inserted heap tuple up to date, widening the stored
 * summary via the opclass ADDVALUE support proc when necessary.
 * NOTE(review): braces, some declarations and loop structure are elided in
 * this extract; the retry loop around the update is only partially visible.
 */
139 brininsert(Relation idxRel, Datum *values, bool *nulls,
140 ItemPointer heaptid, Relation heapRel,
141 IndexUniqueCheck checkUnique,
142 IndexInfo *indexInfo)
144 BlockNumber pagesPerRange;
145 BlockNumber origHeapBlk;
/* BrinDesc is cached across calls in the IndexInfo's ii_AmCache. */
147 BrinDesc *bdesc = (BrinDesc *) indexInfo->ii_AmCache;
149 Buffer buf = InvalidBuffer;
150 MemoryContext tupcxt = NULL;
151 MemoryContext oldcxt = CurrentMemoryContext;
152 bool autosummarize = BrinGetAutoSummarize(idxRel);
154 revmap = brinRevmapInitialize(idxRel, &pagesPerRange, NULL);
157 * origHeapBlk is the block number where the insertion occurred. heapBlk
158 * is the first block in the corresponding page range.
160 origHeapBlk = ItemPointerGetBlockNumber(heaptid);
/* Round down to the start of the page range covering the insertion. */
161 heapBlk = (origHeapBlk / pagesPerRange) * pagesPerRange;
165 bool need_insert = false;
171 CHECK_FOR_INTERRUPTS();
174 * If auto-summarization is enabled and we just inserted the first
175 * tuple into the first block of a new non-first page range, request a
176 * summarization run of the previous range.
180 heapBlk == origHeapBlk &&
181 ItemPointerGetOffsetNumber(heaptid) == FirstOffsetNumber)
183 BlockNumber lastPageRange = heapBlk - 1;
184 BrinTuple *lastPageTuple;
/* Probe the revmap for the previous range's tuple; if it is missing,
 * ask autovacuum to summarize that range in the background. */
187 brinGetTupleForHeapBlock(revmap, lastPageRange, &buf, &off,
188 NULL, BUFFER_LOCK_SHARE, NULL);
190 AutoVacuumRequestWork(AVW_BRINSummarizeRange,
191 RelationGetRelid(idxRel),
194 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
197 brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off,
198 NULL, BUFFER_LOCK_SHARE, NULL);
200 /* if range is unsummarized, there's nothing to do */
204 /* First time through in this statement? */
/* Build the BrinDesc in the long-lived ii_Context so it survives for
 * the duration of the statement, then restore the caller's context. */
207 MemoryContextSwitchTo(indexInfo->ii_Context);
208 bdesc = brin_build_desc(idxRel);
209 indexInfo->ii_AmCache = (void *) bdesc;
210 MemoryContextSwitchTo(oldcxt);
212 /* First time through in this brininsert call? */
/* Per-call scratch context: reset on retry, deleted before return. */
215 tupcxt = AllocSetContextCreate(CurrentMemoryContext,
217 ALLOCSET_DEFAULT_SIZES);
218 MemoryContextSwitchTo(tupcxt);
221 dtup = brin_deform_tuple(bdesc, brtup, NULL);
224 * Compare the key values of the new tuple to the stored index values;
225 * our deformed tuple will get updated if the new tuple doesn't fit
226 * the original range (note this means we can't break out of the loop
227 * early). Make a note of whether this happens, so that we know to
228 * insert the modified tuple later.
230 for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
236 bval = &dtup->bt_columns[keyno];
237 addValue = index_getprocinfo(idxRel, keyno + 1,
238 BRIN_PROCNUM_ADDVALUE);
239 result = FunctionCall4Coll(addValue,
240 idxRel->rd_indcollation[keyno],
241 PointerGetDatum(bdesc),
242 PointerGetDatum(bval),
245 /* if that returned true, we need to insert the updated tuple */
246 need_insert |= DatumGetBool(result);
252 * The tuple is consistent with the new values, so there's nothing
255 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
259 Page page = BufferGetPage(buf);
260 ItemId lp = PageGetItemId(page, off);
268 * Make a copy of the old tuple, so that we can compare it after
269 * re-acquiring the lock.
271 origsz = ItemIdGetLength(lp);
272 origtup = brin_copy_tuple(brtup, origsz, NULL, NULL);
275 * Before releasing the lock, check if we can attempt a same-page
276 * update. Another process could insert a tuple concurrently in
277 * the same page though, so downstream we must be prepared to cope
278 * if this turns out to not be possible after all.
280 newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz);
281 samepage = brin_can_do_samepage_update(buf, origsz, newsz);
282 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
285 * Try to update the tuple. If this doesn't work for whatever
286 * reason, we need to restart from the top; the revmap might be
287 * pointing at a different tuple for this block now, so we need to
288 * recompute to ensure both our new heap tuple and the other
289 * inserter's are covered by the combined tuple. It might be that
290 * we don't need to update at all.
292 if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk,
293 buf, off, origtup, origsz, newtup, newsz,
296 /* no luck; start over */
297 MemoryContextResetAndDeleteChildren(tupcxt);
/* Common exit: release revmap, buffer pin and scratch context. */
306 brinRevmapTerminate(revmap);
307 if (BufferIsValid(buf))
309 MemoryContextSwitchTo(oldcxt);
311 MemoryContextDelete(tupcxt);
317 * Initialize state for a BRIN index scan.
319 * We read the metapage here to determine the pages-per-range number that this
320 * index was built with. Note that since this cannot be changed while we're
321 * holding lock on index, it's not necessary to recompute it during brinrescan.
/*
 * ambeginscan callback: set up scan state.  The BrinOpaque carries the
 * revmap access object, pages-per-range (read from the metapage by
 * brinRevmapInitialize) and a BrinDesc for deforming index tuples.
 * NOTE(review): the return-type line, braces and return statement are
 * elided in this extract.
 */
324 brinbeginscan(Relation r, int nkeys, int norderbys)
329 scan = RelationGetIndexScan(r, nkeys, norderbys);
331 opaque = (BrinOpaque *) palloc(sizeof(BrinOpaque));
332 opaque->bo_rmAccess = brinRevmapInitialize(r, &opaque->bo_pagesPerRange,
334 opaque->bo_bdesc = brin_build_desc(r);
335 scan->opaque = opaque;
341 * Execute the index scan.
343 * This works by reading index TIDs from the revmap, and obtaining the index
344 * tuples pointed to by them; the summary values in the index tuples are
345 * compared to the scan keys. We return into the TID bitmap all the pages in
346 * ranges corresponding to index tuples that match the scan keys.
348 * If a TID from the revmap is read as InvalidTID, we know that range is
349 * unsummarized. Pages in those ranges need to be returned regardless of scan
/*
 * amgetbitmap callback: walk the revmap range by range, compare each
 * range's summary tuple against the scan keys via the opclass CONSISTENT
 * proc, and add all pages of matching (or unsummarized/placeholder)
 * ranges to the TID bitmap.  Returns a lossy page-count estimate *10.
 * NOTE(review): braces and several declarations are elided in this extract.
 */
353 bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
355 Relation idxRel = scan->indexRelation;
356 Buffer buf = InvalidBuffer;
364 FmgrInfo *consistentFn;
365 MemoryContext oldcxt;
366 MemoryContext perRangeCxt;
368 BrinTuple *btup = NULL;
371 opaque = (BrinOpaque *) scan->opaque;
372 bdesc = opaque->bo_bdesc;
373 pgstat_count_index_scan(idxRel);
376 * We need to know the size of the table so that we know how long to
377 * iterate on the revmap.
379 heapOid = IndexGetRelation(RelationGetRelid(idxRel), false);
380 heapRel = heap_open(heapOid, AccessShareLock);
381 nblocks = RelationGetNumberOfBlocks(heapRel);
382 heap_close(heapRel, AccessShareLock);
385 * Make room for the consistent support procedures of indexed columns. We
386 * don't look them up here; we do that lazily the first time we see a scan
387 * key reference each of them. We rely on zeroing fn_oid to InvalidOid.
389 consistentFn = palloc0(sizeof(FmgrInfo) * bdesc->bd_tupdesc->natts);
391 /* allocate an initial in-memory tuple, out of the per-range memcxt */
392 dtup = brin_new_memtuple(bdesc);
395 * Setup and use a per-range memory context, which is reset every time we
396 * loop below. This avoids having to free the tuples within the loop.
398 perRangeCxt = AllocSetContextCreate(CurrentMemoryContext,
400 ALLOCSET_DEFAULT_SIZES);
401 oldcxt = MemoryContextSwitchTo(perRangeCxt);
404 * Now scan the revmap. We start by querying for heap page 0,
405 * incrementing by the number of pages per range; this gives us a full
408 for (heapBlk = 0; heapBlk < nblocks; heapBlk += opaque->bo_pagesPerRange)
411 bool gottuple = false;
416 CHECK_FOR_INTERRUPTS();
418 MemoryContextResetAndDeleteChildren(perRangeCxt);
420 tup = brinGetTupleForHeapBlock(opaque->bo_rmAccess, heapBlk, &buf,
421 &off, &size, BUFFER_LOCK_SHARE,
/* Copy the tuple before dropping the buffer lock so we can examine it
 * without holding the lock. */
426 btup = brin_copy_tuple(tup, size, btup, &btupsz);
427 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
431 * For page ranges with no indexed tuple, we must return the whole
432 * range; otherwise, compare it to the scan keys.
440 dtup = brin_deform_tuple(bdesc, btup, dtup);
441 if (dtup->bt_placeholder)
444 * Placeholder tuples are always returned, regardless of the
445 * values stored in them.
454 * Compare scan keys with summary values stored for the range.
455 * If scan keys are matched, the page range must be added to
456 * the bitmap. We initially assume the range needs to be
457 * added; in particular this serves the case where there are
461 for (keyno = 0; keyno < scan->numberOfKeys; keyno++)
463 ScanKey key = &scan->keyData[keyno];
464 AttrNumber keyattno = key->sk_attno;
465 BrinValues *bval = &dtup->bt_columns[keyattno - 1];
469 * The collation of the scan key must match the collation
470 * used in the index column (but only if the search is not
471 * IS NULL/ IS NOT NULL). Otherwise we shouldn't be using
474 Assert((key->sk_flags & SK_ISNULL) ||
475 (key->sk_collation ==
476 TupleDescAttr(bdesc->bd_tupdesc,
477 keyattno - 1)->attcollation));
479 /* First time this column? look up consistent function */
480 if (consistentFn[keyattno - 1].fn_oid == InvalidOid)
484 tmp = index_getprocinfo(idxRel, keyattno,
485 BRIN_PROCNUM_CONSISTENT);
486 fmgr_info_copy(&consistentFn[keyattno - 1], tmp,
487 CurrentMemoryContext);
491 * Check whether the scan key is consistent with the page
492 * range values; if so, have the pages in the range added
493 * to the output bitmap.
495 * When there are multiple scan keys, failure to meet the
496 * criteria for a single one of them is enough to discard
497 * the range as a whole, so break out of the loop as soon
498 * as a false return value is obtained.
500 add = FunctionCall3Coll(&consistentFn[keyattno - 1],
502 PointerGetDatum(bdesc),
503 PointerGetDatum(bval),
504 PointerGetDatum(key));
505 addrange = DatumGetBool(add);
512 /* add the pages in the range to the output bitmap, if needed */
517 for (pageno = heapBlk;
518 pageno <= heapBlk + opaque->bo_pagesPerRange - 1;
/* tbm_add_page must allocate in the caller's context, not the
 * per-range context that gets reset each iteration. */
521 MemoryContextSwitchTo(oldcxt);
522 tbm_add_page(tbm, pageno);
524 MemoryContextSwitchTo(perRangeCxt);
529 MemoryContextSwitchTo(oldcxt);
530 MemoryContextDelete(perRangeCxt);
532 if (buf != InvalidBuffer)
536 * XXX We have an approximation of the number of *pages* that our scan
537 * returns, but we don't have a precise idea of the number of heap tuples
540 return totalpages * 10;
544 * Re-initialize state for a BRIN index scan
/*
 * amrescan callback: BRIN does no scan-key preprocessing; it just copies
 * the caller's keys into the scan descriptor.
 */
547 brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
548 ScanKey orderbys, int norderbys)
551 * Other index AMs preprocess the scan keys at this point, or sometime
552 * early during the scan; this lets them optimize by removing redundant
553 * keys, or doing early returns when they are impossible to satisfy; see
554 * _bt_preprocess_keys for an example. Something like that could be added
558 if (scankey && scan->numberOfKeys > 0)
559 memmove(scan->keyData, scankey,
560 scan->numberOfKeys * sizeof(ScanKeyData));
564 * Close down a BRIN index scan
/*
 * amendscan callback: release the revmap and BrinDesc allocated by
 * brinbeginscan.  NOTE(review): the pfree of the opaque struct, if any,
 * is elided from this extract.
 */
567 brinendscan(IndexScanDesc scan)
569 BrinOpaque *opaque = (BrinOpaque *) scan->opaque;
571 brinRevmapTerminate(opaque->bo_rmAccess);
572 brin_free_desc(opaque->bo_bdesc);
577 * Per-heap-tuple callback for IndexBuildHeapScan.
579 * Note we don't worry about the page range at the end of the table here; it is
580 * present in the build state struct after we're called the last time, but not
581 * inserted into the index. Caller must ensure to do so, if appropriate.
/*
 * Per-heap-tuple callback for IndexBuildHeapScan: accumulates each heap
 * tuple's values into the running range summary, flushing (via
 * form_and_insert_tuple) whenever the scan crosses into a new page range.
 * NOTE(review): parameter list and braces are partially elided here.
 */
584 brinbuildCallback(Relation index,
591 BrinBuildState *state = (BrinBuildState *) brstate;
592 BlockNumber thisblock;
595 thisblock = ItemPointerGetBlockNumber(&htup->t_self);
598 * If we're in a block that belongs to a future range, summarize what
599 * we've got and start afresh. Note the scan might have skipped many
600 * pages, if they were devoid of live tuples; make sure to insert index
601 * tuples for those too.
/* Loop (not if): may emit several empty-range tuples when the scan
 * skipped multiple whole ranges. */
603 while (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1)
607 "brinbuildCallback: completed a range: %u--%u",
608 state->bs_currRangeStart,
609 state->bs_currRangeStart + state->bs_pagesPerRange));
611 /* create the index tuple and insert it */
612 form_and_insert_tuple(state);
614 /* set state to correspond to the next range */
615 state->bs_currRangeStart += state->bs_pagesPerRange;
617 /* re-initialize state for it */
618 brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
621 /* Accumulate the current tuple into the running state */
622 for (i = 0; i < state->bs_bdesc->bd_tupdesc->natts; i++)
626 Form_pg_attribute attr = TupleDescAttr(state->bs_bdesc->bd_tupdesc, i);
628 col = &state->bs_dtuple->bt_columns[i];
629 addValue = index_getprocinfo(index, i + 1,
630 BRIN_PROCNUM_ADDVALUE);
633 * Update dtuple state, if and as necessary.
635 FunctionCall4Coll(addValue,
637 PointerGetDatum(state->bs_bdesc),
638 PointerGetDatum(col),
639 values[i], isnull[i]);
644 * brinbuild() -- build a new BRIN index.
/*
 * ambuild callback: create a new BRIN index.  Initializes and WAL-logs the
 * metapage, then scans the heap in physical order, summarizing each page
 * range via brinbuildCallback.
 * NOTE(review): braces, START/END_CRIT_SECTION and the final return are
 * elided from this extract.
 */
647 brinbuild(Relation heap, Relation index, IndexInfo *indexInfo)
649 IndexBuildResult *result;
653 BrinBuildState *state;
655 BlockNumber pagesPerRange;
658 * We expect to be called exactly once for any index relation.
660 if (RelationGetNumberOfBlocks(index) != 0)
661 elog(ERROR, "index \"%s\" already contains data",
662 RelationGetRelationName(index));
665 * Critical section not required, because on error the creation of the
666 * whole relation will be rolled back.
669 meta = ReadBuffer(index, P_NEW);
670 Assert(BufferGetBlockNumber(meta) == BRIN_METAPAGE_BLKNO);
671 LockBuffer(meta, BUFFER_LOCK_EXCLUSIVE);
673 brin_metapage_init(BufferGetPage(meta), BrinGetPagesPerRange(index),
674 BRIN_CURRENT_VERSION);
675 MarkBufferDirty(meta);
/* WAL-log metapage creation, unless the index is unlogged/temp. */
677 if (RelationNeedsWAL(index))
679 xl_brin_createidx xlrec;
683 xlrec.version = BRIN_CURRENT_VERSION;
684 xlrec.pagesPerRange = BrinGetPagesPerRange(index);
687 XLogRegisterData((char *) &xlrec, SizeOfBrinCreateIdx);
688 XLogRegisterBuffer(0, meta, REGBUF_WILL_INIT | REGBUF_STANDARD);
690 recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_CREATE_INDEX);
692 page = BufferGetPage(meta);
693 PageSetLSN(page, recptr);
696 UnlockReleaseBuffer(meta);
699 * Initialize our state, including the deformed tuple state.
701 revmap = brinRevmapInitialize(index, &pagesPerRange, NULL);
702 state = initialize_brin_buildstate(index, revmap, pagesPerRange);
705 * Now scan the relation. No syncscan allowed here because we want the
706 * heap blocks in physical order.
708 reltuples = IndexBuildHeapScan(heap, index, indexInfo, false,
709 brinbuildCallback, (void *) state, NULL);
711 /* process the final batch */
712 form_and_insert_tuple(state);
714 /* release resources */
715 idxtuples = state->bs_numtuples;
716 brinRevmapTerminate(state->bs_rmAccess);
717 terminate_brin_buildstate(state);
722 result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
724 result->heap_tuples = reltuples;
725 result->index_tuples = idxtuples;
/*
 * ambuildempty callback: initialize the INIT_FORK of an unlogged BRIN
 * index.  Only the metapage is needed; it is initialized inside a critical
 * section and logged with log_newpage_buffer.
 */
731 brinbuildempty(Relation index)
735 /* An empty BRIN index has a metapage only. */
737 ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
738 LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
740 /* Initialize and xlog metabuffer. */
741 START_CRIT_SECTION();
742 brin_metapage_init(BufferGetPage(metabuf), BrinGetPagesPerRange(index),
743 BRIN_CURRENT_VERSION);
744 MarkBufferDirty(metabuf);
745 log_newpage_buffer(metabuf, true);
748 UnlockReleaseBuffer(metabuf);
753 * Since there are no per-heap-tuple index tuples in BRIN indexes,
754 * there's not a lot we can do here.
756 * XXX we could mark item tuples as "dirty" (when a minimum or maximum heap
757 * tuple is deleted), meaning the need to re-run summarization on the affected
758 * range. Would need to add an extra flag in brintuples for that.
/*
 * ambulkdelete callback: essentially a no-op for BRIN, since there are no
 * per-heap-tuple index entries to delete.  Just ensures a stats struct
 * exists for the caller.  NOTE(review): the surrounding conditional and
 * return statement are elided from this extract.
 */
760 IndexBulkDeleteResult *
761 brinbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
762 IndexBulkDeleteCallback callback, void *callback_state)
764 /* allocate stats if first time through, else re-use existing struct */
766 stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
772 * This routine is in charge of "vacuuming" a BRIN index: we just summarize
773 * ranges that are currently unsummarized.
/*
 * amvacuumcleanup callback: scan the index for recyclable pages
 * (brin_vacuum_scan) and summarize any currently unsummarized ranges.
 */
775 IndexBulkDeleteResult *
776 brinvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
780 /* No-op in ANALYZE ONLY mode */
781 if (info->analyze_only)
785 stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
786 stats->num_pages = RelationGetNumberOfBlocks(info->index);
787 /* rest of stats is initialized by zeroing */
789 heapRel = heap_open(IndexGetRelation(RelationGetRelid(info->index), false),
792 brin_vacuum_scan(info->index, info->strategy);
/* Both counters point at num_index_tuples: summarized + pre-existing
 * ranges together yield the total index tuple count reported. */
794 brinsummarize(info->index, heapRel, BRIN_ALL_BLOCKRANGES, false,
795 &stats->num_index_tuples, &stats->num_index_tuples);
797 heap_close(heapRel, AccessShareLock);
803 * reloptions processor for BRIN indexes
/*
 * amoptions callback: parse BRIN reloptions (pages_per_range,
 * autosummarize) into a BrinOptions struct.
 * NOTE(review): the return-type line and the "if none set" early return
 * body are elided from this extract.
 */
806 brinoptions(Datum reloptions, bool validate)
808 relopt_value *options;
811 static const relopt_parse_elt tab[] = {
812 {"pages_per_range", RELOPT_TYPE_INT, offsetof(BrinOptions, pagesPerRange)},
813 {"autosummarize", RELOPT_TYPE_BOOL, offsetof(BrinOptions, autosummarize)}
816 options = parseRelOptions(reloptions, validate, RELOPT_KIND_BRIN,
819 /* if none set, we're done */
823 rdopts = allocateReloptStruct(sizeof(BrinOptions), options, numoptions);
825 fillRelOptions((void *) rdopts, sizeof(BrinOptions), options, numoptions,
826 validate, tab, lengthof(tab));
830 return (bytea *) rdopts;
834 * SQL-callable function to scan through an index and summarize all ranges
835 * that are not currently summarized.
/*
 * SQL-callable brin_summarize_new_values(regclass): summarize every
 * unsummarized range by delegating to brin_summarize_range with
 * BRIN_ALL_BLOCKRANGES as the block argument.
 */
838 brin_summarize_new_values(PG_FUNCTION_ARGS)
840 Datum relation = PG_GETARG_DATUM(0);
842 return DirectFunctionCall2(brin_summarize_range,
844 Int64GetDatum((int64) BRIN_ALL_BLOCKRANGES));
848 * SQL-callable function to summarize the indicated page range, if not already
849 * summarized. If the second argument is BRIN_ALL_BLOCKRANGES, all
850 * unsummarized ranges are summarized.
/*
 * SQL-callable brin_summarize_range(regclass, bigint): summarize the page
 * range containing the given heap block (or all unsummarized ranges when
 * given BRIN_ALL_BLOCKRANGES).  Validates the argument, locks heap before
 * index to avoid deadlock, and checks ownership and index kind.
 */
853 brin_summarize_range(PG_FUNCTION_ARGS)
855 Oid indexoid = PG_GETARG_OID(0);
856 int64 heapBlk64 = PG_GETARG_INT64(1);
861 double numSummarized = 0;
/* Reject block numbers outside [0, BRIN_ALL_BLOCKRANGES]; the int64
 * argument can hold values no BlockNumber can. */
863 if (heapBlk64 > BRIN_ALL_BLOCKRANGES || heapBlk64 < 0)
865 char *blk = psprintf(INT64_FORMAT, heapBlk64);
868 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
869 errmsg("block number out of range: %s", blk)));
871 heapBlk = (BlockNumber) heapBlk64;
874 * We must lock table before index to avoid deadlocks. However, if the
875 * passed indexoid isn't an index then IndexGetRelation() will fail.
876 * Rather than emitting a not-very-helpful error message, postpone
877 * complaining, expecting that the is-it-an-index test below will fail.
879 heapoid = IndexGetRelation(indexoid, true);
880 if (OidIsValid(heapoid))
881 heapRel = heap_open(heapoid, ShareUpdateExclusiveLock);
885 indexRel = index_open(indexoid, ShareUpdateExclusiveLock);
887 /* Must be a BRIN index */
888 if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
889 indexRel->rd_rel->relam != BRIN_AM_OID)
891 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
892 errmsg("\"%s\" is not a BRIN index",
893 RelationGetRelationName(indexRel))));
895 /* User must own the index (comparable to privileges needed for VACUUM) */
896 if (!pg_class_ownercheck(indexoid, GetUserId()))
897 aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX,
898 RelationGetRelationName(indexRel));
901 * Since we did the IndexGetRelation call above without any lock, it's
902 * barely possible that a race against an index drop/recreation could have
903 * netted us the wrong table. Recheck.
905 if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false))
907 (errcode(ERRCODE_UNDEFINED_TABLE),
908 errmsg("could not open parent table of index %s",
909 RelationGetRelationName(indexRel))));
/* include_partial = true: the final (partial) range is summarized too. */
912 brinsummarize(indexRel, heapRel, heapBlk, true, &numSummarized, NULL);
914 relation_close(indexRel, ShareUpdateExclusiveLock);
915 relation_close(heapRel, ShareUpdateExclusiveLock);
917 PG_RETURN_INT32((int32) numSummarized);
921 * SQL-callable interface to mark a range as no longer summarized
/*
 * SQL-callable brin_desummarize_range(regclass, bigint): mark the range
 * containing the given heap block as no longer summarized.  Mirrors the
 * validation/locking dance of brin_summarize_range; the actual work is
 * done by brinRevmapDesummarizeRange.
 */
924 brin_desummarize_range(PG_FUNCTION_ARGS)
926 Oid indexoid = PG_GETARG_OID(0);
927 int64 heapBlk64 = PG_GETARG_INT64(1);
/* Here the upper bound is MaxBlockNumber (a concrete block is required),
 * unlike brin_summarize_range which also accepts BRIN_ALL_BLOCKRANGES. */
934 if (heapBlk64 > MaxBlockNumber || heapBlk64 < 0)
936 char *blk = psprintf(INT64_FORMAT, heapBlk64);
939 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
940 errmsg("block number out of range: %s", blk)));
942 heapBlk = (BlockNumber) heapBlk64;
945 * We must lock table before index to avoid deadlocks. However, if the
946 * passed indexoid isn't an index then IndexGetRelation() will fail.
947 * Rather than emitting a not-very-helpful error message, postpone
948 * complaining, expecting that the is-it-an-index test below will fail.
950 heapoid = IndexGetRelation(indexoid, true);
951 if (OidIsValid(heapoid))
952 heapRel = heap_open(heapoid, ShareUpdateExclusiveLock);
956 indexRel = index_open(indexoid, ShareUpdateExclusiveLock);
958 /* Must be a BRIN index */
959 if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
960 indexRel->rd_rel->relam != BRIN_AM_OID)
962 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
963 errmsg("\"%s\" is not a BRIN index",
964 RelationGetRelationName(indexRel))));
966 /* User must own the index (comparable to privileges needed for VACUUM) */
967 if (!pg_class_ownercheck(indexoid, GetUserId()))
968 aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX,
969 RelationGetRelationName(indexRel));
972 * Since we did the IndexGetRelation call above without any lock, it's
973 * barely possible that a race against an index drop/recreation could have
974 * netted us the wrong table. Recheck.
976 if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false))
978 (errcode(ERRCODE_UNDEFINED_TABLE),
979 errmsg("could not open parent table of index %s",
980 RelationGetRelationName(indexRel))));
982 /* the revmap does the hard work */
985 done = brinRevmapDesummarizeRange(indexRel, heapBlk)
989 relation_close(indexRel, ShareUpdateExclusiveLock);
990 relation_close(heapRel, ShareUpdateExclusiveLock);
996 * Build a BrinDesc used to create or scan a BRIN index
/*
 * Build a BrinDesc for creating or scanning a BRIN index: looks up each
 * column's opclass OPCINFO support proc, accumulates the total number of
 * stored datums, and assembles the descriptor in its own small memory
 * context (so brin_free_desc can release it with one context delete).
 * NOTE(review): braces and the final return are elided in this extract.
 */
999 brin_build_desc(Relation rel)
1001 BrinOpcInfo **opcinfo;
1004 int totalstored = 0;
1008 MemoryContext oldcxt;
1010 cxt = AllocSetContextCreate(CurrentMemoryContext,
1012 ALLOCSET_SMALL_SIZES);
1013 oldcxt = MemoryContextSwitchTo(cxt);
1014 tupdesc = RelationGetDescr(rel);
1017 * Obtain BrinOpcInfo for each indexed column. While at it, accumulate
1018 * the number of columns stored, since the number is opclass-defined.
1020 opcinfo = (BrinOpcInfo **) palloc(sizeof(BrinOpcInfo *) * tupdesc->natts);
1021 for (keyno = 0; keyno < tupdesc->natts; keyno++)
1023 FmgrInfo *opcInfoFn;
1024 Form_pg_attribute attr = TupleDescAttr(tupdesc, keyno);
1026 opcInfoFn = index_getprocinfo(rel, keyno + 1, BRIN_PROCNUM_OPCINFO);
1028 opcinfo[keyno] = (BrinOpcInfo *)
1029 DatumGetPointer(FunctionCall1(opcInfoFn, attr->atttypid));
1030 totalstored += opcinfo[keyno]->oi_nstored;
1033 /* Allocate our result struct and fill it in */
/* bd_info is a flexible-style trailing array; size for natts entries. */
1034 totalsize = offsetof(BrinDesc, bd_info) +
1035 sizeof(BrinOpcInfo *) * tupdesc->natts;
1037 bdesc = palloc(totalsize);
1038 bdesc->bd_context = cxt;
1039 bdesc->bd_index = rel;
1040 bdesc->bd_tupdesc = tupdesc;
1041 bdesc->bd_disktdesc = NULL; /* generated lazily */
1042 bdesc->bd_totalstored = totalstored;
1044 for (keyno = 0; keyno < tupdesc->natts; keyno++)
1045 bdesc->bd_info[keyno] = opcinfo[keyno];
1048 MemoryContextSwitchTo(oldcxt);
/*
 * Release a BrinDesc: everything it owns lives in bd_context, so one
 * context delete frees it all.
 */
1054 brin_free_desc(BrinDesc *bdesc)
1056 /* make sure the tupdesc is still valid */
1057 Assert(bdesc->bd_tupdesc->tdrefcount >= 1);
1058 /* no need for retail pfree */
1059 MemoryContextDelete(bdesc->bd_context);
1063 * Fetch index's statistical data into *stats
/*
 * Read pagesPerRange and revmap page count for an index from its metapage
 * into *stats, holding only a share lock on the metapage buffer.
 */
1066 brinGetStats(Relation index, BrinStatsData *stats)
1070 BrinMetaPageData *metadata;
1072 metabuffer = ReadBuffer(index, BRIN_METAPAGE_BLKNO);
1073 LockBuffer(metabuffer, BUFFER_LOCK_SHARE);
1074 metapage = BufferGetPage(metabuffer);
1075 metadata = (BrinMetaPageData *) PageGetContents(metapage);
1077 stats->pagesPerRange = metadata->pagesPerRange;
/* "- 1": block 0 is the metapage, so the count of revmap pages is the
 * last revmap block number minus the metapage. */
1078 stats->revmapNumPages = metadata->lastRevmapPage - 1;
1080 UnlockReleaseBuffer(metabuffer);
1084 * Initialize a BrinBuildState appropriate to create tuples on the given index.
/*
 * Allocate and initialize a BrinBuildState for creating tuples on idxRel.
 * The state takes ownership of a fresh BrinDesc and BrinMemTuple; the
 * revmap is supplied by the caller.  NOTE(review): braces and the return
 * statement are elided from this extract.
 */
1086 static BrinBuildState *
1087 initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap,
1088 BlockNumber pagesPerRange)
1090 BrinBuildState *state;
1092 state = palloc(sizeof(BrinBuildState));
1094 state->bs_irel = idxRel;
1095 state->bs_numtuples = 0;
1096 state->bs_currentInsertBuf = InvalidBuffer;
1097 state->bs_pagesPerRange = pagesPerRange;
1098 state->bs_currRangeStart = 0;
1099 state->bs_rmAccess = revmap;
1100 state->bs_bdesc = brin_build_desc(idxRel);
1101 state->bs_dtuple = brin_new_memtuple(state->bs_bdesc);
1103 brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
1109 * Release resources associated with a BrinBuildState.
/*
 * Release resources associated with a BrinBuildState: record the last
 * insert page's free space in the FSM, release its buffer pin, and free
 * the BrinDesc and deformed tuple.  Does not free the revmap (the caller
 * terminates that separately; see brinbuild).
 */
1112 terminate_brin_buildstate(BrinBuildState *state)
1114 /* release the last index buffer used */
1115 if (!BufferIsInvalid(state->bs_currentInsertBuf))
1119 page = BufferGetPage(state->bs_currentInsertBuf);
1120 RecordPageWithFreeSpace(state->bs_irel,
1121 BufferGetBlockNumber(state->bs_currentInsertBuf),
1122 PageGetFreeSpace(page));
1123 ReleaseBuffer(state->bs_currentInsertBuf);
1126 brin_free_desc(state->bs_bdesc);
1127 pfree(state->bs_dtuple);
1132 * On the given BRIN index, summarize the heap page range that corresponds
1133 * to the heap block number given.
1135 * This routine can run in parallel with insertions into the heap. To avoid
1136 * missing those values from the summary tuple, we first insert a placeholder
1137 * index tuple into the index, then execute the heap scan; transactions
1138 * concurrent with the scan update the placeholder tuple. After the scan, we
1139 * union the placeholder tuple with the one computed by this routine. The
1140 * update of the index value happens in a loop, so that if somebody updates
1141 * the placeholder tuple after we read it, we detect the case and try again.
1142 * This ensures that the concurrently inserted tuples are not lost.
1144 * A further corner case is this routine being asked to summarize the partial
1145 * range at the end of the table. heapNumBlocks is the (possibly outdated)
1146 * table size; if we notice that the requested range lies beyond that size,
1147 * we re-compute the table size after inserting the placeholder tuple, to
1148 * avoid missing pages that were appended recently.
/*
 * Summarize one heap page range: insert a placeholder index tuple, scan the
 * range's heap pages (any-visible mode), then replace the placeholder with
 * the computed summary, retrying with union_tuples if a concurrent inserter
 * updated the placeholder in the meantime.
 * NOTE(review): braces, the retry-loop header and some declarations are
 * elided from this extract.
 */
1151 summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel,
1152 BlockNumber heapBlk, BlockNumber heapNumBlks)
1157 OffsetNumber offset;
1158 BlockNumber scanNumBlks;
1161 * Insert the placeholder tuple
1163 phbuf = InvalidBuffer;
1164 phtup = brin_form_placeholder_tuple(state->bs_bdesc, heapBlk, &phsz);
1165 offset = brin_doinsert(state->bs_irel, state->bs_pagesPerRange,
1166 state->bs_rmAccess, &phbuf,
1167 heapBlk, phtup, phsz);
1170 * Compute range end. We hold ShareUpdateExclusive lock on table, so it
1171 * cannot shrink concurrently (but it can grow).
1173 Assert(heapBlk % state->bs_pagesPerRange == 0);
1174 if (heapBlk + state->bs_pagesPerRange > heapNumBlks)
1177 * If we're asked to scan what we believe to be the final range on the
1178 * table (i.e. a range that might be partial) we need to recompute our
1179 * idea of what the latest page is after inserting the placeholder
1180 * tuple. Anyone that grows the table later will update the
1181 * placeholder tuple, so it doesn't matter that we won't scan these
1182 * pages ourselves. Careful: the table might have been extended
1183 * beyond the current range, so clamp our result.
1185 * Fortunately, this should occur infrequently.
1187 scanNumBlks = Min(RelationGetNumberOfBlocks(heapRel) - heapBlk,
1188 state->bs_pagesPerRange);
1192 /* Easy case: range is known to be complete */
1193 scanNumBlks = state->bs_pagesPerRange;
1197 * Execute the partial heap scan covering the heap blocks in the specified
1198 * page range, summarizing the heap tuples in it. This scan stops just
1199 * short of brinbuildCallback creating the new index entry.
1201 * Note that it is critical we use the "any visible" mode of
1202 * IndexBuildHeapRangeScan here: otherwise, we would miss tuples inserted
1203 * by transactions that are still in progress, among other corner cases.
1205 state->bs_currRangeStart = heapBlk;
1206 IndexBuildHeapRangeScan(heapRel, state->bs_irel, indexInfo, false, true,
1207 heapBlk, scanNumBlks,
1208 brinbuildCallback, (void *) state, NULL);
1211 * Now we update the values obtained by the scan with the placeholder
1212 * tuple. We do this in a loop which only terminates if we're able to
1213 * update the placeholder tuple successfully; if we are not, this means
1214 * somebody else modified the placeholder tuple after we read it.
1223 CHECK_FOR_INTERRUPTS();
1226 * Update the summary tuple and try to update.
1228 newtup = brin_form_tuple(state->bs_bdesc,
1229 heapBlk, state->bs_dtuple, &newsize);
1230 samepage = brin_can_do_samepage_update(phbuf, phsz, newsize);
1232 brin_doupdate(state->bs_irel, state->bs_pagesPerRange,
1233 state->bs_rmAccess, heapBlk, phbuf, offset,
1234 phtup, phsz, newtup, newsize, samepage);
1235 brin_free_tuple(phtup);
1236 brin_free_tuple(newtup);
1238 /* If the update succeeded, we're done. */
1243 * If the update didn't work, it might be because somebody updated the
1244 * placeholder tuple concurrently. Extract the new version, union it
1245 * with the values we have from the scan, and start over. (There are
1246 * other reasons for the update to fail, but it's simple to treat them
1249 phtup = brinGetTupleForHeapBlock(state->bs_rmAccess, heapBlk, &phbuf,
1250 &offset, &phsz, BUFFER_LOCK_SHARE,
1252 /* the placeholder tuple must exist */
1254 elog(ERROR, "missing placeholder tuple");
/* Copy before unlocking so we can union without holding the lock. */
1255 phtup = brin_copy_tuple(phtup, phsz, NULL, NULL);
1256 LockBuffer(phbuf, BUFFER_LOCK_UNLOCK);
1258 /* merge it into the tuple from the heap scan */
1259 union_tuples(state->bs_bdesc, state->bs_dtuple, phtup);
1262 ReleaseBuffer(phbuf);
/*
 * Summarize page ranges that are not already summarized.  If pageRange is
 * BRIN_ALL_BLOCKRANGES then the whole table is scanned; otherwise, only the
 * page range containing the given heap page number is scanned.
 * If include_partial is true, then the partial range at the end of the table
 * is summarized, otherwise not.
 *
 * For each new index tuple inserted, *numSummarized (if not NULL) is
 * incremented; for each existing tuple, *numExisting (if not NULL) is
 * incremented.
 */
static void
brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
			  bool include_partial, double *numSummarized, double *numExisting)
{
	BrinRevmap *revmap;
	BrinBuildState *state = NULL;
	IndexInfo  *indexInfo = NULL;
	BlockNumber heapNumBlocks;
	BlockNumber pagesPerRange;
	Buffer		buf;
	BlockNumber startBlk;

	revmap = brinRevmapInitialize(index, &pagesPerRange, NULL);

	/* determine range of pages to process */
	heapNumBlocks = RelationGetNumberOfBlocks(heapRel);
	if (pageRange == BRIN_ALL_BLOCKRANGES)
		startBlk = 0;
	else
	{
		/* round down to the start of the range containing pageRange */
		startBlk = (pageRange / pagesPerRange) * pagesPerRange;
		heapNumBlocks = Min(heapNumBlocks, startBlk + pagesPerRange);
	}
	if (startBlk > heapNumBlocks)
	{
		/* Nothing to do if start point is beyond end of table */
		brinRevmapTerminate(revmap);
		return;
	}

	/*
	 * Scan the revmap to find unsummarized items.
	 */
	buf = InvalidBuffer;
	for (; startBlk < heapNumBlocks; startBlk += pagesPerRange)
	{
		BrinTuple  *tup;
		OffsetNumber off;

		/*
		 * Unless requested to summarize even a partial range, go away now if
		 * we think the next range is partial.  Caller would pass true when it
		 * is typically run once bulk data loading is done
		 * (brin_summarize_new_values), and false when it is typically the
		 * result of arbitrarily-scheduled maintenance command (vacuuming).
		 */
		if (!include_partial &&
			(startBlk + pagesPerRange > heapNumBlocks))
			break;

		CHECK_FOR_INTERRUPTS();

		/* returns with buf share-locked when a tuple is found */
		tup = brinGetTupleForHeapBlock(revmap, startBlk, &buf, &off, NULL,
									   BUFFER_LOCK_SHARE, NULL);
		if (tup == NULL)
		{
			/* no revmap entry for this heap range. Summarize it. */
			if (state == NULL)
			{
				/* first time through: set up the build state lazily */
				state = initialize_brin_buildstate(index, revmap,
												   pagesPerRange);
				indexInfo = BuildIndexInfo(index);
			}
			summarize_range(indexInfo, state, heapRel, startBlk, heapNumBlocks);

			/* and re-initialize state for the next range */
			brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);

			if (numSummarized)
				*numSummarized += 1.0;
		}
		else
		{
			/* range is already summarized; just release the tuple's lock */
			if (numExisting)
				*numExisting += 1.0;
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		}
	}

	if (BufferIsValid(buf))
		ReleaseBuffer(buf);

	/* free resources */
	brinRevmapTerminate(revmap);
	if (state)
	{
		terminate_brin_buildstate(state);
		pfree(indexInfo);
	}
}
1370 * Given a deformed tuple in the build state, convert it into the on-disk
1371 * format and insert it into the index, making the revmap point to it.
1374 form_and_insert_tuple(BrinBuildState *state)
1379 tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart,
1380 state->bs_dtuple, &size);
1381 brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
1382 &state->bs_currentInsertBuf, state->bs_currRangeStart,
1384 state->bs_numtuples++;
/*
 * Given two deformed tuples, adjust the first one so that it's consistent
 * with the summary values in both.
 */
static void
union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b)
{
	int			keyno;
	BrinMemTuple *db;
	MemoryContext cxt;
	MemoryContext oldcxt;

	/* Use our own memory context to avoid retail pfree */
	cxt = AllocSetContextCreate(CurrentMemoryContext,
								"brin union",
								ALLOCSET_DEFAULT_SIZES);
	oldcxt = MemoryContextSwitchTo(cxt);
	/* deform b's on-disk tuple into the temporary context */
	db = brin_deform_tuple(bdesc, b, NULL);
	MemoryContextSwitchTo(oldcxt);

	/* invoke the opclass-provided union support proc for each column */
	for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
	{
		FmgrInfo   *unionFn;
		BrinValues *col_a = &a->bt_columns[keyno];
		BrinValues *col_b = &db->bt_columns[keyno];

		unionFn = index_getprocinfo(bdesc->bd_index, keyno + 1,
									BRIN_PROCNUM_UNION);
		/* the union proc modifies col_a in place to absorb col_b */
		FunctionCall3Coll(unionFn,
						  bdesc->bd_index->rd_indcollation[keyno],
						  PointerGetDatum(bdesc),
						  PointerGetDatum(col_a),
						  PointerGetDatum(col_b));
	}

	/* releases everything allocated while deforming b */
	MemoryContextDelete(cxt);
}
1429 * Do a complete scan of the index during VACUUM.
1431 * This routine scans the complete index looking for uncatalogued index pages,
1432 * i.e. those that might have been lost due to a crash after index extension
1436 brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy)
1438 bool vacuum_fsm = false;
1442 * Scan the index in physical order, and clean up any possible mess in
1445 for (blkno = 0; blkno < RelationGetNumberOfBlocks(idxrel); blkno++)
1449 CHECK_FOR_INTERRUPTS();
1451 buf = ReadBufferExtended(idxrel, MAIN_FORKNUM, blkno,
1452 RBM_NORMAL, strategy);
1454 vacuum_fsm |= brin_page_cleanup(idxrel, buf);
1460 * If we made any change to the FSM, make sure the new info is visible all
1461 * the way to the top.
1464 FreeSpaceMapVacuum(idxrel);