/*-------------------------------------------------------------------------
 *
 * nbtree.c
 *	  Implementation of Lehman and Yao's btree management algorithm for
 *	  Postgres.
 *
 * NOTES
 *	  This file contains only the public interface routines.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtree.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/relscan.h"
#include "access/xlog.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "nodes/execnodes.h"
#include "pgstat.h"
#include "postmaster/autovacuum.h"
#include "storage/condition_variable.h"
#include "storage/indexfsm.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/smgr.h"
#include "utils/builtins.h"
#include "utils/index_selfuncs.h"
#include "utils/memutils.h"

/* Working state needed by btvacuumpage */
typedef struct
{
	IndexVacuumInfo *info;
	IndexBulkDeleteResult *stats;
	IndexBulkDeleteCallback callback;
	void	   *callback_state;
	BTCycleId	cycleid;
	BlockNumber lastBlockVacuumed;	/* highest blkno actually vacuumed */
	BlockNumber lastBlockLocked;	/* highest blkno we've cleanup-locked */
	BlockNumber totFreePages;		/* true total # of free pages */
	TransactionId oldestBtpoXact;
	MemoryContext pagedelcontext;
} BTVacState;

/*
 * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
 *
 * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
 * a new page; others must wait.
 *
 * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
 * to a new page; some process can start doing that.
 *
 * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
 * We reach this state once for every distinct combination of array keys.
 */
typedef enum
{
	BTPARALLEL_NOT_INITIALIZED,
	BTPARALLEL_ADVANCING,
	BTPARALLEL_IDLE,
	BTPARALLEL_DONE
} BTPS_State;
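
/*
 * Added summary (not in the original comments): over the lifetime of one set
 * of array keys the status normally moves
 *
 *		NOT_INITIALIZED -> ADVANCING -> IDLE -> ADVANCING -> ... -> DONE
 *
 * _bt_parallel_seize() performs the transitions into ADVANCING,
 * _bt_parallel_release() the transitions back to IDLE, _bt_parallel_done()
 * the transition to DONE, and _bt_parallel_advance_array_keys() resets DONE
 * to NOT_INITIALIZED for the next combination of array keys.
 */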

/*
 * BTParallelScanDescData contains btree specific shared information required
 * for parallel scan.
 */
typedef struct BTParallelScanDescData
{
	BlockNumber btps_scanPage;	/* latest or next page to be scanned */
	BTPS_State	btps_pageStatus;	/* indicates whether next page is
									 * available for scan. see above for
									 * possible states of parallel scan. */
	int			btps_arrayKeyCount; /* count indicating number of array scan
									 * keys processed by parallel scan */
	slock_t		btps_mutex;		/* protects above variables */
	ConditionVariable btps_cv;	/* used to synchronize parallel scan */
} BTParallelScanDescData;

typedef struct BTParallelScanDescData *BTParallelScanDesc;

static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
						 IndexBulkDeleteCallback callback, void *callback_state,
						 BTCycleId cycleid, TransactionId *oldestBtpoXact);
static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
						 BlockNumber orig_blkno);

/*
 * Btree handler function: return IndexAmRoutine with access method parameters
 * and callbacks.
 */
Datum
bthandler(PG_FUNCTION_ARGS)
{
	IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

	amroutine->amstrategies = BTMaxStrategyNumber;
	amroutine->amsupport = BTNProcs;
	amroutine->amcanorder = true;
	amroutine->amcanorderbyop = false;
	amroutine->amcanbackward = true;
	amroutine->amcanunique = true;
	amroutine->amcanmulticol = true;
	amroutine->amoptionalkey = true;
	amroutine->amsearcharray = true;
	amroutine->amsearchnulls = true;
	amroutine->amstorage = false;
	amroutine->amclusterable = true;
	amroutine->ampredlocks = true;
	amroutine->amcanparallel = true;
	amroutine->amcaninclude = true;
	amroutine->amkeytype = InvalidOid;

	amroutine->ambuild = btbuild;
	amroutine->ambuildempty = btbuildempty;
	amroutine->aminsert = btinsert;
	amroutine->ambulkdelete = btbulkdelete;
	amroutine->amvacuumcleanup = btvacuumcleanup;
	amroutine->amcanreturn = btcanreturn;
	amroutine->amcostestimate = btcostestimate;
	amroutine->amoptions = btoptions;
	amroutine->amproperty = btproperty;
	amroutine->amvalidate = btvalidate;
	amroutine->ambeginscan = btbeginscan;
	amroutine->amrescan = btrescan;
	amroutine->amgettuple = btgettuple;
	amroutine->amgetbitmap = btgetbitmap;
	amroutine->amendscan = btendscan;
	amroutine->ammarkpos = btmarkpos;
	amroutine->amrestrpos = btrestrpos;
	amroutine->amestimateparallelscan = btestimateparallelscan;
	amroutine->aminitparallelscan = btinitparallelscan;
	amroutine->amparallelrescan = btparallelrescan;

	PG_RETURN_POINTER(amroutine);
}

/*
 *	btbuildempty() -- build an empty btree index in the initialization fork
 */
void
btbuildempty(Relation index)
{
	Page		metapage;

	/* Construct metapage. */
	metapage = (Page) palloc(BLCKSZ);
	_bt_initmetapage(metapage, P_NONE, 0);

	/*
	 * Write the page and log it.  It might seem that an immediate sync would
	 * be sufficient to guarantee that the file exists on disk, but recovery
	 * itself might remove it while replaying, for example, an
	 * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record.  Therefore, we need
	 * this even when wal_level=minimal.
	 */
	PageSetChecksumInplace(metapage, BTREE_METAPAGE);
	smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
			  (char *) metapage, true);
	log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
				BTREE_METAPAGE, metapage, true);

	/*
	 * An immediate sync is required even if we xlog'd the page, because the
	 * write did not go through shared_buffers and therefore a concurrent
	 * checkpoint may have moved the redo pointer past our xlog record.
	 */
	smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
}

/*
 *	btinsert() -- insert an index tuple into a btree.
 *
 *		Descend the tree recursively, find the appropriate location for our
 *		new tuple, and put it there.
 */
bool
btinsert(Relation rel, Datum *values, bool *isnull,
		 ItemPointer ht_ctid, Relation heapRel,
		 IndexUniqueCheck checkUnique,
		 IndexInfo *indexInfo)
{
	bool		result;
	IndexTuple	itup;

	/* generate an index tuple */
	itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
	itup->t_tid = *ht_ctid;

	result = _bt_doinsert(rel, itup, checkUnique, heapRel);

	pfree(itup);
	return result;
}

/*
 *	btgettuple() -- Get the next tuple in the scan.
 */
bool
btgettuple(IndexScanDesc scan, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	bool		res;

	/* btree indexes are never lossy */
	scan->xs_recheck = false;

	/*
	 * If we have any array keys, initialize them during first call for a
	 * scan.  We can't do this in btrescan because we don't know the scan
	 * direction at that time.
	 */
	if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
	{
		/* punt if we have any unsatisfiable array keys */
		if (so->numArrayKeys < 0)
			return false;

		_bt_start_array_keys(scan, dir);
	}

	/* This loop handles advancing to the next array elements, if any */
	do
	{
		/*
		 * If we've already initialized this scan, we can just advance it in
		 * the appropriate direction.  If we haven't done so yet, we call
		 * _bt_first() to get the first item in the scan.
		 */
		if (!BTScanPosIsValid(so->currPos))
			res = _bt_first(scan, dir);
		else
		{
			/*
			 * Check to see if we should kill the previously-fetched tuple.
			 */
			if (scan->kill_prior_tuple)
			{
				/*
				 * Yes, remember it for later. (We'll deal with all such
				 * tuples at once right before leaving the index page.)  The
				 * test for numKilled overrun is not just paranoia: if the
				 * caller reverses direction in the indexscan then the same
				 * item might get entered multiple times.  It's not worth
				 * trying to optimize that, so we don't detect it, but instead
				 * just forget any excess entries.
				 */
				if (so->killedItems == NULL)
					so->killedItems = (int *)
						palloc(MaxIndexTuplesPerPage * sizeof(int));
				if (so->numKilled < MaxIndexTuplesPerPage)
					so->killedItems[so->numKilled++] = so->currPos.itemIndex;
			}

			/*
			 * Now continue the scan.
			 */
			res = _bt_next(scan, dir);
		}

		/* If we have a tuple, return it ... */
		if (res)
			break;
		/* ... otherwise see if we have more array keys to deal with */
	} while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));

	return res;
}
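
#ifdef NOT_USED
/*
 * Illustrative sketch (added; not part of the original file): how the
 * executor-level index AM API ultimately reaches btgettuple() through the
 * amgettuple callback.  The equality key on attribute 1 using F_INT4EQ is an
 * arbitrary assumption, and compiling this would additionally require
 * access/genam.h, utils/fmgroids.h and utils/snapmgr.h.
 */
static void
bt_scan_usage_sketch(Relation heapRel, Relation indexRel)
{
	IndexScanDesc scan;
	ScanKeyData skey;
	ItemPointer tid;

	/* WHERE indexed_col = 42, assuming an int4 key column */
	ScanKeyInit(&skey, 1, BTEqualStrategyNumber, F_INT4EQ, Int32GetDatum(42));

	scan = index_beginscan(heapRel, indexRel, GetTransactionSnapshot(), 1, 0);
	index_rescan(scan, &skey, 1, NULL, 0);

	/* each call here ends up in btgettuple() via amroutine->amgettuple */
	while ((tid = index_getnext_tid(scan, ForwardScanDirection)) != NULL)
	{
		/* consume *tid */
	}

	index_endscan(scan);
}
#endif							/* NOT_USED */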

/*
 * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
 */
int64
btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	int64		ntids = 0;
	ItemPointer heapTid;

	/*
	 * If we have any array keys, initialize them.
	 */
	if (so->numArrayKeys)
	{
		/* punt if we have any unsatisfiable array keys */
		if (so->numArrayKeys < 0)
			return ntids;

		_bt_start_array_keys(scan, ForwardScanDirection);
	}

	/* This loop handles advancing to the next array elements, if any */
	do
	{
		/* Fetch the first page & tuple */
		if (_bt_first(scan, ForwardScanDirection))
		{
			/* Save tuple ID, and continue scanning */
			heapTid = &scan->xs_heaptid;
			tbm_add_tuples(tbm, heapTid, 1, false);
			ntids++;

			for (;;)
			{
				/*
				 * Advance to next tuple within page.  This is the same as the
				 * easy case in _bt_next().
				 */
				if (++so->currPos.itemIndex > so->currPos.lastItem)
				{
					/* let _bt_next do the heavy lifting */
					if (!_bt_next(scan, ForwardScanDirection))
						break;
				}

				/* Save tuple ID, and continue scanning */
				heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
				tbm_add_tuples(tbm, heapTid, 1, false);
				ntids++;
			}
		}
		/* Now see if we have more array keys to deal with */
	} while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));

	return ntids;
}

/*
 *	btbeginscan() -- start a scan on a btree index
 */
IndexScanDesc
btbeginscan(Relation rel, int nkeys, int norderbys)
{
	IndexScanDesc scan;
	BTScanOpaque so;

	/* no order by operators allowed */
	Assert(norderbys == 0);

	/* get the scan */
	scan = RelationGetIndexScan(rel, nkeys, norderbys);

	/* allocate private workspace */
	so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
	BTScanPosInvalidate(so->currPos);
	BTScanPosInvalidate(so->markPos);
	if (scan->numberOfKeys > 0)
		so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
	else
		so->keyData = NULL;

	so->arrayKeyData = NULL;	/* assume no array keys for now */
	so->numArrayKeys = 0;
	so->arrayKeys = NULL;
	so->arrayContext = NULL;

	so->killedItems = NULL;		/* until needed */
	so->numKilled = 0;

	/*
	 * We don't know yet whether the scan will be index-only, so we do not
	 * allocate the tuple workspace arrays until btrescan.  However, we set up
	 * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
	 */
	so->currTuples = so->markTuples = NULL;

	scan->xs_itupdesc = RelationGetDescr(rel);

	scan->opaque = so;

	return scan;
}

/*
 *	btrescan() -- rescan an index relation
 */
void
btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
		 ScanKey orderbys, int norderbys)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* we aren't holding any read locks, but gotta drop the pins */
	if (BTScanPosIsValid(so->currPos))
	{
		/* Before leaving current page, deal with any killed items */
		if (so->numKilled > 0)
			_bt_killitems(scan);
		BTScanPosUnpinIfPinned(so->currPos);
		BTScanPosInvalidate(so->currPos);
	}

	so->markItemIndex = -1;
	so->arrayKeyCount = 0;
	BTScanPosUnpinIfPinned(so->markPos);
	BTScanPosInvalidate(so->markPos);

	/*
	 * Allocate tuple workspace arrays, if needed for an index-only scan and
	 * not already done in a previous rescan call.  To save on palloc
	 * overhead, both workspaces are allocated as one palloc block; only this
	 * function and btendscan know that.
	 *
	 * NOTE: this data structure also makes it safe to return data from a
	 * "name" column, even though btree name_ops uses an underlying storage
	 * datatype of cstring.  The risk there is that "name" is supposed to be
	 * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
	 * However, since we only return data out of tuples sitting in the
	 * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
	 * data out of the markTuples array --- running off the end of memory for
	 * a SIGSEGV is not possible.  Yeah, this is ugly as sin, but it beats
	 * adding special-case treatment for name_ops elsewhere.
	 */
	if (scan->xs_want_itup && so->currTuples == NULL)
	{
		so->currTuples = (char *) palloc(BLCKSZ * 2);
		so->markTuples = so->currTuples + BLCKSZ;
	}

	/*
	 * Reset the scan keys.  Note that keys ordering stuff moved to _bt_first.
	 * - vadim 05/05/97
	 */
	if (scankey && scan->numberOfKeys > 0)
		memmove(scan->keyData,
				scankey,
				scan->numberOfKeys * sizeof(ScanKeyData));
	so->numberOfKeys = 0;		/* until _bt_preprocess_keys sets it */

	/* If any keys are SK_SEARCHARRAY type, set up array-key info */
	_bt_preprocess_array_keys(scan);
}

/*
 *	btendscan() -- close down a scan
 */
void
btendscan(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* we aren't holding any read locks, but gotta drop the pins */
	if (BTScanPosIsValid(so->currPos))
	{
		/* Before leaving current page, deal with any killed items */
		if (so->numKilled > 0)
			_bt_killitems(scan);
		BTScanPosUnpinIfPinned(so->currPos);
	}

	so->markItemIndex = -1;
	BTScanPosUnpinIfPinned(so->markPos);

	/* No need to invalidate positions, the RAM is about to be freed. */

	/* Release storage */
	if (so->keyData != NULL)
		pfree(so->keyData);
	/* so->arrayKeyData and so->arrayKeys are in arrayContext */
	if (so->arrayContext != NULL)
		MemoryContextDelete(so->arrayContext);
	if (so->killedItems != NULL)
		pfree(so->killedItems);
	if (so->currTuples != NULL)
		pfree(so->currTuples);
	/* so->markTuples should not be pfree'd, see btrescan */
	pfree(so);
}

/*
 *	btmarkpos() -- save current scan position
 */
void
btmarkpos(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* There may be an old mark with a pin (but no lock). */
	BTScanPosUnpinIfPinned(so->markPos);

	/*
	 * Just record the current itemIndex.  If we later step to next page
	 * before releasing the marked position, _bt_steppage makes a full copy of
	 * the currPos struct in markPos.  If (as often happens) the mark is moved
	 * before we leave the page, we don't have to do that work.
	 */
	if (BTScanPosIsValid(so->currPos))
		so->markItemIndex = so->currPos.itemIndex;
	else
	{
		BTScanPosInvalidate(so->markPos);
		so->markItemIndex = -1;
	}

	/* Also record the current positions of any array keys */
	if (so->numArrayKeys)
		_bt_mark_array_keys(scan);
}

/*
 *	btrestrpos() -- restore scan to last saved position
 */
void
btrestrpos(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* Restore the marked positions of any array keys */
	if (so->numArrayKeys)
		_bt_restore_array_keys(scan);

	if (so->markItemIndex >= 0)
	{
		/*
		 * The scan has never moved to a new page since the last mark.  Just
		 * restore the itemIndex.
		 *
		 * NB: In this case we can't count on anything in so->markPos to be
		 * accurate.
		 */
		so->currPos.itemIndex = so->markItemIndex;
	}
	else
	{
		/*
		 * The scan moved to a new page after last mark or restore, and we are
		 * now restoring to the marked page.  We aren't holding any read
		 * locks, but if we're still holding the pin for the current position,
		 * we must drop it.
		 */
		if (BTScanPosIsValid(so->currPos))
		{
			/* Before leaving current page, deal with any killed items */
			if (so->numKilled > 0)
				_bt_killitems(scan);
			BTScanPosUnpinIfPinned(so->currPos);
		}

		if (BTScanPosIsValid(so->markPos))
		{
			/* bump pin on mark buffer for assignment to current buffer */
			if (BTScanPosIsPinned(so->markPos))
				IncrBufferRefCount(so->markPos.buf);
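			/*
			 * Added note: the memcpy below copies only the BTScanPosData
			 * header plus the items[] entries up to lastItem, not the whole
			 * MaxIndexTuplesPerPage-sized array.
			 */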
			memcpy(&so->currPos, &so->markPos,
				   offsetof(BTScanPosData, items[1]) +
				   so->markPos.lastItem * sizeof(BTScanPosItem));
			if (so->currTuples)
				memcpy(so->currTuples, so->markTuples,
					   so->markPos.nextTupleOffset);
		}
		else
			BTScanPosInvalidate(so->currPos);
	}
}

/*
 * btestimateparallelscan -- estimate storage for BTParallelScanDescData
 */
Size
btestimateparallelscan(void)
{
	return sizeof(BTParallelScanDescData);
}

/*
 * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
 */
void
btinitparallelscan(void *target)
{
	BTParallelScanDesc bt_target = (BTParallelScanDesc) target;

	SpinLockInit(&bt_target->btps_mutex);
	bt_target->btps_scanPage = InvalidBlockNumber;
	bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
	bt_target->btps_arrayKeyCount = 0;
	ConditionVariableInit(&bt_target->btps_cv);
}
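
/*
 * Added note: these three callbacks work as a set.  btestimateparallelscan()
 * sizes the btree-specific part of the shared parallel-scan area,
 * btinitparallelscan() initializes it once before any worker attaches, and
 * btparallelrescan() below resets it when the parallel scan as a whole is
 * restarted.
 */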

/*
 * btparallelrescan() -- reset parallel scan
 */
void
btparallelrescan(IndexScanDesc scan)
{
	BTParallelScanDesc btscan;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;

	Assert(parallel_scan);

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	/*
	 * In theory, we don't need to acquire the spinlock here, because there
	 * shouldn't be any other workers running at this point, but we do so for
	 * consistency.
	 */
	SpinLockAcquire(&btscan->btps_mutex);
	btscan->btps_scanPage = InvalidBlockNumber;
	btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
	btscan->btps_arrayKeyCount = 0;
	SpinLockRelease(&btscan->btps_mutex);
}

/*
 * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
 *		page.  Other scans must wait until we call _bt_parallel_release()
 *		or _bt_parallel_done().
 *
 * The return value is true if we successfully seized the scan and false
 * if we did not.  The latter case occurs if no pages remain for the current
 * set of scankeys.
 *
 * If the return value is true, *pageno returns the next or current page
 * of the scan (depending on the scan direction).  An invalid block number
 * means the scan hasn't yet started, and P_NONE means we've reached the end.
 * The first time a participating process reaches the last page, it will return
 * true and set *pageno to P_NONE; after that, further attempts to seize the
 * scan will return false.
 *
 * Callers should ignore the value of pageno if the return value is false.
 */
bool
_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	BTPS_State	pageStatus;
	bool		exit_loop = false;
	bool		status = true;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;

	*pageno = P_NONE;

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	while (1)
	{
		SpinLockAcquire(&btscan->btps_mutex);
		pageStatus = btscan->btps_pageStatus;

		if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
		{
			/* Parallel scan has already advanced to a new set of scankeys. */
			status = false;
		}
		else if (pageStatus == BTPARALLEL_DONE)
		{
			/*
			 * We're done with this set of scankeys.  This may be the end, or
			 * there could be more sets to try.
			 */
			status = false;
		}
		else if (pageStatus != BTPARALLEL_ADVANCING)
		{
			/*
			 * We have successfully seized control of the scan for the purpose
			 * of advancing it to a new page!
			 */
			btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
			*pageno = btscan->btps_scanPage;
			exit_loop = true;
		}
		SpinLockRelease(&btscan->btps_mutex);
		if (exit_loop || !status)
			break;
		ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
	}
	ConditionVariableCancelSleep();

	return status;
}
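
#ifdef NOT_USED
/*
 * Illustrative sketch (added; not part of the original file): the calling
 * pattern a parallel scan participant follows around the _bt_parallel_*
 * routines above.  In the real code this lives in nbtsearch.c, the first
 * seize can also return InvalidBlockNumber (scan not yet started), and the
 * page published via _bt_parallel_release() is the right-link read off the
 * page just processed; next_blkno below is only a placeholder.
 */
static void
bt_parallel_usage_sketch(IndexScanDesc scan)
{
	BlockNumber blkno;
	BlockNumber next_blkno;

	while (_bt_parallel_seize(scan, &blkno))
	{
		if (blkno == P_NONE)
		{
			/* no pages remain for this set of keys; tell everyone else */
			_bt_parallel_done(scan);
			break;
		}

		/*
		 * Publish the page the next participant should pick up, then return
		 * the matching tuples from blkno ourselves.
		 */
		next_blkno = P_NONE;	/* placeholder for the real right-link */
		_bt_parallel_release(scan, next_blkno);
	}
}
#endif							/* NOT_USED */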

/*
 * _bt_parallel_release() -- Complete the process of advancing the scan to a
 *		new page.  We now have the new value btps_scanPage; some other backend
 *		can now begin advancing the scan.
 */
void
_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
{
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	SpinLockAcquire(&btscan->btps_mutex);
	btscan->btps_scanPage = scan_page;
	btscan->btps_pageStatus = BTPARALLEL_IDLE;
	SpinLockRelease(&btscan->btps_mutex);
	ConditionVariableSignal(&btscan->btps_cv);
}

/*
 * _bt_parallel_done() -- Mark the parallel scan as complete.
 *
 * When there are no pages left to scan, this function should be called to
 * notify other workers.  Otherwise, they might wait forever for the scan to
 * advance to the next page.
 */
void
_bt_parallel_done(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;
	bool		status_changed = false;

	/* Do nothing for non-parallel scans */
	if (parallel_scan == NULL)
		return;

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	/*
	 * Mark the parallel scan as done for this combination of scan keys,
	 * unless some other process already did so.  See also
	 * _bt_advance_array_keys.
	 */
	SpinLockAcquire(&btscan->btps_mutex);
	if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
		btscan->btps_pageStatus != BTPARALLEL_DONE)
	{
		btscan->btps_pageStatus = BTPARALLEL_DONE;
		status_changed = true;
	}
	SpinLockRelease(&btscan->btps_mutex);

	/* wake up all the workers associated with this parallel scan */
	if (status_changed)
		ConditionVariableBroadcast(&btscan->btps_cv);
}

/*
 * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
 *			keys.
 *
 * Updates the count of array keys processed for both local and parallel
 * scans.
 */
void
_bt_parallel_advance_array_keys(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	so->arrayKeyCount++;
	SpinLockAcquire(&btscan->btps_mutex);
	if (btscan->btps_pageStatus == BTPARALLEL_DONE)
	{
		btscan->btps_scanPage = InvalidBlockNumber;
		btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
		btscan->btps_arrayKeyCount++;
	}
	SpinLockRelease(&btscan->btps_mutex);
}

/*
 * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup assuming that
 *			btbulkdelete() wasn't called.
 */
static bool
_bt_vacuum_needs_cleanup(IndexVacuumInfo *info)
{
	Buffer		metabuf;
	Page		metapg;
	BTMetaPageData *metad;
	bool		result = false;

	metabuf = _bt_getbuf(info->index, BTREE_METAPAGE, BT_READ);
	metapg = BufferGetPage(metabuf);
	metad = BTPageGetMeta(metapg);

	if (metad->btm_version < BTREE_NOVAC_VERSION)
	{
		/*
		 * Do cleanup if metapage needs upgrade, because we don't have
		 * cleanup-related meta-information yet.
		 */
		result = true;
	}
	else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) &&
			 TransactionIdPrecedes(metad->btm_oldest_btpo_xact,
								   RecentGlobalXmin))
	{
		/*
		 * If oldest btpo.xact in the deleted pages is older than
		 * RecentGlobalXmin, then at least one deleted page can be recycled.
		 */
		result = true;
	}
	else
	{
		StdRdOptions *relopts;
		float8		cleanup_scale_factor;
		float8		prev_num_heap_tuples;

		/*
		 * If the table receives enough insertions and no cleanup was
		 * performed, then the index would appear to have stale statistics.
		 * If a scale factor is set, we avoid that by performing cleanup if
		 * the number of inserted tuples exceeds the
		 * vacuum_cleanup_index_scale_factor fraction of the original tuple
		 * count.
		 */
		relopts = (StdRdOptions *) info->index->rd_options;
		cleanup_scale_factor = (relopts &&
								relopts->vacuum_cleanup_index_scale_factor >= 0)
			? relopts->vacuum_cleanup_index_scale_factor
			: vacuum_cleanup_index_scale_factor;
		prev_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;

		if (cleanup_scale_factor <= 0 ||
			prev_num_heap_tuples < 0 ||
			(info->num_heap_tuples - prev_num_heap_tuples) /
			prev_num_heap_tuples >= cleanup_scale_factor)
			result = true;
	}

	_bt_relbuf(info->index, metabuf);
	return result;
}
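
/*
 * Worked example (added; not in the original source): with
 * vacuum_cleanup_index_scale_factor = 0.1 and btm_last_cleanup_num_heap_tuples
 * = 1,000,000, the test above requests a cleanup-only scan once
 * info->num_heap_tuples exceeds 1,100,000, i.e. after roughly 100,000 tuples
 * have been inserted since the last cleanup recorded in the metapage.
 */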

/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			 IndexBulkDeleteCallback callback, void *callback_state)
{
	Relation	rel = info->index;
	BTCycleId	cycleid;

	/* allocate stats if first time through, else re-use existing struct */
	if (stats == NULL)
		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));

	/* Establish the vacuum cycle ID to use for this scan */
	/* The ENSURE stuff ensures we clean up shared memory on failure */
	PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
	{
		TransactionId oldestBtpoXact;

		cycleid = _bt_start_vacuum(rel);

		btvacuumscan(info, stats, callback, callback_state, cycleid,
					 &oldestBtpoXact);

		/*
		 * Update cleanup-related information in the metapage.  This
		 * information is used only for cleanup, but keeping it up to date
		 * can avoid unnecessary cleanup even after a bulkdelete.
		 */
		_bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
									 info->num_heap_tuples);
	}
	PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
	_bt_end_vacuum(rel);

	return stats;
}

/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
	/* No-op in ANALYZE ONLY mode */
	if (info->analyze_only)
		return stats;

	/*
	 * If btbulkdelete was called, we need not do anything, just return the
	 * stats from the latest btbulkdelete call.  If it wasn't called, we might
	 * still need to do a pass over the index, to recycle any newly-recyclable
	 * pages or to obtain index statistics.  _bt_vacuum_needs_cleanup
	 * determines whether either is needed.
	 *
	 * Since we aren't going to actually delete any leaf items, there's no
	 * need to go through all the vacuum-cycle-ID pushups.
	 */
	if (stats == NULL)
	{
		TransactionId oldestBtpoXact;

		/* Check if we need a cleanup */
		if (!_bt_vacuum_needs_cleanup(info))
			return NULL;

		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
		btvacuumscan(info, stats, NULL, NULL, 0, &oldestBtpoXact);

		/* Update cleanup-related information in the metapage */
		_bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
									 info->num_heap_tuples);
	}

	/*
	 * It's quite possible for us to be fooled by concurrent page splits into
	 * double-counting some index tuples, so disbelieve any total that exceeds
	 * the underlying heap's count ... if we know that accurately.  Otherwise
	 * this might just make matters worse.
	 */
	if (!info->estimated_count)
	{
		if (stats->num_index_tuples > info->num_heap_tuples)
			stats->num_index_tuples = info->num_heap_tuples;
	}

	return stats;
}

/*
 * btvacuumscan --- scan the index for VACUUMing purposes
 *
 * This combines the functions of looking for leaf tuples that are deletable
 * according to the vacuum callback, looking for empty pages that can be
 * deleted, and looking for old deleted pages that can be recycled.  Both
 * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
 * btbulkdelete call occurred).
 *
 * The caller is responsible for initially allocating/zeroing a stats struct
 * and for obtaining a vacuum cycle ID if necessary.
 */
static void
btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			 IndexBulkDeleteCallback callback, void *callback_state,
			 BTCycleId cycleid, TransactionId *oldestBtpoXact)
{
	Relation	rel = info->index;
	BTVacState	vstate;
	BlockNumber num_pages;
	BlockNumber blkno;
	bool		needLock;

	/*
	 * Reset counts that will be incremented during the scan; needed in case
	 * of multiple scans during a single VACUUM command
	 */
	stats->estimated_count = false;
	stats->num_index_tuples = 0;
	stats->pages_deleted = 0;

	/* Set up info to pass down to btvacuumpage */
	vstate.info = info;
	vstate.stats = stats;
	vstate.callback = callback;
	vstate.callback_state = callback_state;
	vstate.cycleid = cycleid;
	vstate.lastBlockVacuumed = BTREE_METAPAGE;	/* Initialise at first block */
	vstate.lastBlockLocked = BTREE_METAPAGE;
	vstate.totFreePages = 0;
	vstate.oldestBtpoXact = InvalidTransactionId;

	/* Create a temporary memory context to run _bt_pagedel in */
	vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
												  "Vacuum",
												  ALLOCSET_DEFAULT_SIZES);

	/*
	 * The outer loop iterates over all index pages except the metapage, in
	 * physical order (we hope the kernel will cooperate in providing
	 * read-ahead for speed).  It is critical that we visit all leaf pages,
	 * including ones added after we start the scan, else we might fail to
	 * delete some deletable tuples.  Hence, we must repeatedly check the
	 * relation length.  We must acquire the relation-extension lock while
	 * doing so to avoid a race condition: if someone else is extending the
	 * relation, there is a window where bufmgr/smgr have created a new
	 * all-zero page but it hasn't yet been write-locked by _bt_getbuf().  If
	 * we manage to scan such a page here, we'll improperly assume it can be
	 * recycled.  Taking the lock synchronizes things enough to prevent a
	 * problem: either num_pages won't include the new page, or _bt_getbuf
	 * already has write lock on the buffer and it will be fully initialized
	 * before we can examine it.  (See also vacuumlazy.c, which has the same
	 * issue.)  Also, we need not worry if a page is added immediately after
	 * we look; the page splitting code already has write-lock on the left
	 * page before it adds a right page, so we must already have processed any
	 * tuples due to be moved into such a page.
	 *
	 * We can skip locking for new or temp relations, however, since no one
	 * else could be accessing them.
	 */
	needLock = !RELATION_IS_LOCAL(rel);

	blkno = BTREE_METAPAGE + 1;
	for (;;)
	{
		/* Get the current relation length */
		if (needLock)
			LockRelationForExtension(rel, ExclusiveLock);
		num_pages = RelationGetNumberOfBlocks(rel);
		if (needLock)
			UnlockRelationForExtension(rel, ExclusiveLock);

		/* Quit if we've scanned the whole relation */
		if (blkno >= num_pages)
			break;
		/* Iterate over pages, then loop back to recheck length */
		for (; blkno < num_pages; blkno++)
		{
			btvacuumpage(&vstate, blkno, blkno);
		}
	}

	/*
	 * Check to see if we need to issue one final WAL record for this index,
	 * which may be needed for correctness on a hot standby node when non-MVCC
	 * index scans could take place.
	 *
	 * If the WAL is replayed in hot standby, the replay process needs to get
	 * cleanup locks on all index leaf pages, just as we've been doing here.
	 * However, we won't issue any WAL records about pages that have no items
	 * to be deleted.  For pages between pages we've vacuumed, the replay code
	 * will take locks under the direction of the lastBlockVacuumed fields in
	 * the XLOG_BTREE_VACUUM WAL records.  To cover pages after the last one
	 * we vacuum, we need to issue a dummy XLOG_BTREE_VACUUM WAL record
	 * against the last leaf page in the index, if that one wasn't vacuumed.
	 */
	if (XLogStandbyInfoActive() &&
		vstate.lastBlockVacuumed < vstate.lastBlockLocked)
	{
		Buffer		buf;

		/*
		 * The page should be valid, but we can't use _bt_getbuf() because we
		 * want to use a nondefault buffer access strategy.  Since we aren't
		 * going to delete any items, getting cleanup lock again is probably
		 * overkill, but for consistency do that anyway.
		 */
		buf = ReadBufferExtended(rel, MAIN_FORKNUM, vstate.lastBlockLocked,
								 RBM_NORMAL, info->strategy);
		LockBufferForCleanup(buf);
		_bt_checkpage(rel, buf);
		_bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed);
		_bt_relbuf(rel, buf);
	}

	MemoryContextDelete(vstate.pagedelcontext);

	/*
	 * If we found any recyclable pages (and recorded them in the FSM), then
	 * forcibly update the upper-level FSM pages to ensure that searchers can
	 * find them.  It's possible that the pages were also found during
	 * previous scans and so this is a waste of time, but it's cheap enough
	 * relative to scanning the index that it shouldn't matter much, and
	 * making sure that free pages are available sooner not later seems
	 * worthwhile.
	 *
	 * Note that if no recyclable pages exist, we don't bother vacuuming the
	 * FSM at all.
	 */
	if (vstate.totFreePages > 0)
		IndexFreeSpaceMapVacuum(rel);

	/* update statistics */
	stats->num_pages = num_pages;
	stats->pages_free = vstate.totFreePages;

	if (oldestBtpoXact)
		*oldestBtpoXact = vstate.oldestBtpoXact;
}

/*
 * btvacuumpage --- VACUUM one page
 *
 * This processes a single page for btvacuumscan().  In some cases we
 * must go back and re-examine previously-scanned pages; this routine
 * recurses when necessary to handle that case.
 *
 * blkno is the page to process.  orig_blkno is the highest block number
 * reached by the outer btvacuumscan loop (the same as blkno, unless we
 * are recursing to re-examine a previous page).
 */
static void
btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
{
	IndexVacuumInfo *info = vstate->info;
	IndexBulkDeleteResult *stats = vstate->stats;
	IndexBulkDeleteCallback callback = vstate->callback;
	void	   *callback_state = vstate->callback_state;
	Relation	rel = info->index;
	bool		delete_now;
	BlockNumber recurse_to;
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque = NULL;

restart:
	delete_now = false;
	recurse_to = P_NONE;

	/* call vacuum_delay_point while not holding any buffer lock */
	vacuum_delay_point();

	/*
	 * We can't use _bt_getbuf() here because it always applies
	 * _bt_checkpage(), which will barf on an all-zero page.  We want to
	 * recycle all-zero pages, not fail.  Also, we want to use a nondefault
	 * buffer access strategy.
	 */
	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
							 info->strategy);
	LockBuffer(buf, BT_READ);
	page = BufferGetPage(buf);
	if (!PageIsNew(page))
	{
		_bt_checkpage(rel, buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	}

	/*
	 * If we are recursing, the only case we want to do anything with is a
	 * live leaf page having the current vacuum cycle ID.  Any other state
	 * implies we already saw the page (eg, deleted it as being empty).
	 */
	if (blkno != orig_blkno)
	{
		if (_bt_page_recyclable(page) ||
			P_IGNORE(opaque) ||
			!P_ISLEAF(opaque) ||
			opaque->btpo_cycleid != vstate->cycleid)
		{
			_bt_relbuf(rel, buf);
			return;
		}
	}

	/* Page is valid, see what to do with it */
	if (_bt_page_recyclable(page))
	{
		/* Okay to recycle this page */
		RecordFreeIndexPage(rel, blkno);
		vstate->totFreePages++;
		stats->pages_deleted++;
	}
	else if (P_ISDELETED(opaque))
	{
		/* Already deleted, but can't recycle yet */
		stats->pages_deleted++;

		/* Update the oldest btpo.xact */
		if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
			TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
			vstate->oldestBtpoXact = opaque->btpo.xact;
	}
	else if (P_ISHALFDEAD(opaque))
	{
		/* Half-dead, try to delete */
		delete_now = true;
	}
	else if (P_ISLEAF(opaque))
	{
		OffsetNumber deletable[MaxOffsetNumber];
		int			ndeletable;
		OffsetNumber offnum,
					minoff,
					maxoff;

		/*
		 * Trade in the initial read lock for a super-exclusive write lock on
		 * this page.  We must get such a lock on every leaf page over the
		 * course of the vacuum scan, whether or not it actually contains any
		 * deletable tuples --- see nbtree/README.
		 */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		LockBufferForCleanup(buf);

		/*
		 * Remember highest leaf page number we've taken cleanup lock on; see
		 * notes in btvacuumscan
		 */
		if (blkno > vstate->lastBlockLocked)
			vstate->lastBlockLocked = blkno;

		/*
		 * Check whether we need to recurse back to earlier pages.  What we
		 * are concerned about is a page split that happened since we started
		 * the vacuum scan.  If the split moved some tuples to a lower page
		 * then we might have missed 'em.  If so, set up for tail recursion.
		 * (Must do this before possibly clearing btpo_cycleid below!)
		 */
		if (vstate->cycleid != 0 &&
			opaque->btpo_cycleid == vstate->cycleid &&
			!(opaque->btpo_flags & BTP_SPLIT_END) &&
			!P_RIGHTMOST(opaque) &&
			opaque->btpo_next < orig_blkno)
			recurse_to = opaque->btpo_next;

		/*
		 * Scan over all items to see which ones need to be deleted according
		 * to the callback function.
		 */
		ndeletable = 0;
		minoff = P_FIRSTDATAKEY(opaque);
		maxoff = PageGetMaxOffsetNumber(page);
		if (callback)
		{
			for (offnum = minoff;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				IndexTuple	itup;
				ItemPointer htup;

				itup = (IndexTuple) PageGetItem(page,
												PageGetItemId(page, offnum));
				htup = &(itup->t_tid);

				/*
				 * During Hot Standby we currently assume that
				 * XLOG_BTREE_VACUUM records do not produce conflicts.  That
				 * is only true as long as the callback function depends only
				 * upon whether the index tuple refers to heap tuples removed
				 * in the initial heap scan.  When vacuum starts it derives a
				 * value of OldestXmin.  Backends taking later snapshots could
				 * have a RecentGlobalXmin with a later xid than the vacuum's
				 * OldestXmin, so it is possible that row versions deleted
				 * after OldestXmin could be marked as killed by other
				 * backends.  The callback function *could* look at the index
				 * tuple state in isolation and decide to delete the index
				 * tuple, though currently it does not.  If it ever did, we
				 * would need to reconsider whether XLOG_BTREE_VACUUM records
				 * should cause conflicts.  If they did cause conflicts they
				 * would be fairly harsh conflicts, since we haven't yet
				 * worked out a way to pass a useful value for
				 * latestRemovedXid on the XLOG_BTREE_VACUUM records.  This
				 * applies to *any* type of index that marks index tuples as
				 * killed.
				 */
				if (callback(htup, callback_state))
					deletable[ndeletable++] = offnum;
			}
		}

		/*
		 * Apply any needed deletes.  We issue just one _bt_delitems_vacuum()
		 * call per page, so as to minimize WAL traffic.
		 */
		if (ndeletable > 0)
		{
			/*
			 * Notice that the issued XLOG_BTREE_VACUUM WAL record includes
			 * all the information the replay code needs to get a cleanup
			 * lock on all pages between the previous lastBlockVacuumed and
			 * this page.  This ensures that WAL replay locks all leaf pages
			 * at some point, which is important should non-MVCC scans be
			 * requested.  This is currently unused on standby, but we record
			 * it anyway, so that the WAL contains the required information.
			 *
			 * Since we can visit leaf pages out-of-order when recursing,
			 * replay might end up locking such pages an extra time, but it
			 * doesn't seem worth the amount of bookkeeping it'd take to avoid
			 * that.
			 */
			_bt_delitems_vacuum(rel, buf, deletable, ndeletable,
								vstate->lastBlockVacuumed);

			/*
			 * Remember highest leaf page number we've issued an
			 * XLOG_BTREE_VACUUM WAL record for.
			 */
			if (blkno > vstate->lastBlockVacuumed)
				vstate->lastBlockVacuumed = blkno;

			stats->tuples_removed += ndeletable;
			/* must recompute maxoff */
			maxoff = PageGetMaxOffsetNumber(page);
		}
		else
		{
			/*
			 * If the page has been split during this vacuum cycle, it seems
			 * worth expending a write to clear btpo_cycleid even if we don't
			 * have any deletions to do.  (If we do, _bt_delitems_vacuum takes
			 * care of this.)  This ensures we won't process the page again.
			 *
			 * We treat this like a hint-bit update because there's no need to
			 * WAL-log it.
			 */
			if (vstate->cycleid != 0 &&
				opaque->btpo_cycleid == vstate->cycleid)
			{
				opaque->btpo_cycleid = 0;
				MarkBufferDirtyHint(buf, true);
			}
		}

		/*
		 * If it's now empty, try to delete; else count the live tuples.  We
		 * don't delete when recursing, though, to avoid putting entries into
		 * freePages out-of-order (doesn't seem worth any extra code to handle
		 * the case).
		 */
		if (minoff > maxoff)
			delete_now = (blkno == orig_blkno);
		else
			stats->num_index_tuples += maxoff - minoff + 1;
	}

	if (delete_now)
	{
		MemoryContext oldcontext;
		int			ndel;

		/* Run pagedel in a temp context to avoid memory leakage */
		MemoryContextReset(vstate->pagedelcontext);
		oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);

		ndel = _bt_pagedel(rel, buf);

		/* count only this page, else may double-count parent */
		if (ndel)
		{
			stats->pages_deleted++;
			if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
				TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
				vstate->oldestBtpoXact = opaque->btpo.xact;
		}

		MemoryContextSwitchTo(oldcontext);
		/* pagedel released buffer, so we shouldn't */
	}
	else
		_bt_relbuf(rel, buf);

	/*
	 * This is really tail recursion, but if the compiler is too stupid to
	 * optimize it as such, we'd eat an uncomfortably large amount of stack
	 * space per recursion level (due to the deletable[] array).  A failure is
	 * improbable since the number of levels isn't likely to be large ... but
	 * just in case, let's hand-optimize into a loop.
	 */
	if (recurse_to != P_NONE)
	{
		blkno = recurse_to;
		goto restart;
	}
}

/*
 *	btcanreturn() -- Check whether btree indexes support index-only scans.
 *
 * btrees always do, so this is trivial.
 */
bool
btcanreturn(Relation index, int attno)
{
	return true;
}