/*-------------------------------------------------------------------------
 *
 * vacuumlazy.c
 *	  Concurrent ("lazy") vacuuming.
 *
 *
 * The major space usage for LAZY VACUUM is storage for the array of dead
 * tuple TIDs, with the next biggest need being storage for per-disk-page
 * free space info.  We want to ensure we can vacuum even the very largest
 * relations with finite memory space usage.  To do that, we set upper bounds
 * on the number of tuples and pages we will keep track of at once.
 *
 * We are willing to use at most maintenance_work_mem memory space to keep
 * track of dead tuples.  We initially allocate an array of TIDs of that size.
 * If the array threatens to overflow, we suspend the heap scan phase and
 * perform a pass of index cleanup and page compaction, then resume the heap
 * scan with an empty TID array.
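 *
 * (Worked example: maintenance_work_mem is measured in kilobytes, and an
 * ItemPointerData is 6 bytes, so a setting of 16384 (16MB) leaves room for
 * about 16384 * 1024 / 6, or roughly 2.8 million, dead-tuple TIDs between
 * index-cleanup passes; see lazy_space_alloc below.)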
 *
 * We can limit the storage for page free space to MaxFSMPages entries,
 * since that's the most the free space map will be willing to remember
 * anyway.  If the relation has fewer than that many pages with free space,
 * life is easy: just build an array of per-page info.  If it has more,
 * we store the free space info as a heap ordered by amount of free space,
 * so that we can discard the pages with least free space to ensure we never
 * have more than MaxFSMPages entries in all.  The surviving page entries
 * are passed to the free space map at conclusion of the scan.
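 *
 * (Worked example: once the array holds MaxFSMPages entries it is turned
 * into a min-heap on available space, so entry zero is always the page with
 * the least free space.  Each later page either displaces that minimum, if
 * it offers more space, or is discarded on the spot; see
 * lazy_record_free_space below.)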
 *
 *
 * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.66 2006/02/11 23:31:34 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/genam.h"
#include "access/heapam.h"
#include "access/xlog.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/freespace.h"
#include "storage/smgr.h"
#include "utils/lsyscache.h"
#include "utils/pg_rusage.h"


/*
 * Space/time tradeoff parameters: do these need to be user-tunable?
 *
 * To consider truncating the relation, we want there to be at least
 * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
 * is less) potentially-freeable pages.
 */
#define REL_TRUNCATE_MINIMUM	1000
#define REL_TRUNCATE_FRACTION	16
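
/*
 * (Worked example: an 8000-page relation is considered for truncation once
 * at least min(1000, 8000/16) = 500 trailing pages look freeable, while a
 * 64000-page relation needs min(1000, 64000/16) = 1000 such pages.)
 */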

typedef struct LVRelStats
{
	/* Overall statistics about rel */
	BlockNumber rel_pages;
	double		rel_tuples;
	BlockNumber pages_removed;
	double		tuples_deleted;
	BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
	Size		threshold;		/* minimum interesting free space */
	/* List of TIDs of tuples we intend to delete */
	/* NB: this list is ordered by TID address */
	int			num_dead_tuples;	/* current # of entries */
	int			max_dead_tuples;	/* # slots allocated in array */
	ItemPointer dead_tuples;	/* array of ItemPointerData */
	/* Array or heap of per-page info about free space */
	/* We use a simple array until it fills up, then convert to heap */
	bool		fs_is_heap;		/* are we using heap organization? */
	int			num_free_pages; /* current # of entries */
	int			max_free_pages; /* # slots allocated in array */
	PageFreeSpaceInfo *free_pages;	/* array or heap of blkno/avail */
} LVRelStats;

static int	elevel = -1;

static TransactionId OldestXmin;
static TransactionId FreezeLimit;


/* non-export function prototypes */
static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
			   Relation *Irel, int nindexes);
static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats);
static void lazy_scan_index(Relation indrel, LVRelStats *vacrelstats);
static void lazy_vacuum_index(Relation indrel,
				  double *index_tups_vacuumed,
				  BlockNumber *index_pages_removed,
				  LVRelStats *vacrelstats);
static int	lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
				 int tupindex, LVRelStats *vacrelstats);
static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats);
static BlockNumber count_nondeletable_pages(Relation onerel,
						 LVRelStats *vacrelstats);
static void lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks);
static void lazy_record_dead_tuple(LVRelStats *vacrelstats,
					   ItemPointer itemptr);
static void lazy_record_free_space(LVRelStats *vacrelstats,
					   BlockNumber page, Size avail);
static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
static bool dummy_tid_reaped(ItemPointer itemptr, void *state);
static void lazy_update_fsm(Relation onerel, LVRelStats *vacrelstats);
static int	vac_cmp_itemptr(const void *left, const void *right);
static int	vac_cmp_page_spaces(const void *left, const void *right);


/*
 *	lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation
 *
 *		This routine vacuums a single heap, cleans out its indexes, and
 *		updates its num_pages and num_tuples statistics.
 *
 *		At entry, we have already established a transaction and opened
 *		and locked the relation.
 */
void
lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
{
	LVRelStats *vacrelstats;
	Relation   *Irel;
	int			nindexes;
	bool		hasindex;
	BlockNumber possibly_freeable;

	if (vacstmt->verbose)
		elevel = INFO;
	else
		elevel = DEBUG2;

	vacuum_set_xid_limits(vacstmt, onerel->rd_rel->relisshared,
						  &OldestXmin, &FreezeLimit);

	vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));

	/* Set threshold for interesting free space = average request size */
	/* XXX should we scale it up or down?  Adjust vacuum.c too, if so */
	vacrelstats->threshold = GetAvgFSMRequestSize(&onerel->rd_node);

	/* Open all indexes of the relation */
	vac_open_indexes(onerel, ShareUpdateExclusiveLock, &nindexes, &Irel);
	hasindex = (nindexes > 0);

	/* Do the vacuuming */
	lazy_scan_heap(onerel, vacrelstats, Irel, nindexes);

	/* Done with indexes */
	vac_close_indexes(nindexes, Irel, NoLock);

	/*
	 * Optionally truncate the relation.
	 *
	 * Don't even think about it unless we have a shot at releasing a goodly
	 * number of pages.  Otherwise, the time taken isn't worth it.
	 */
	possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages;
	if (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
		possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION)
		lazy_truncate_heap(onerel, vacrelstats);

	/* Update shared free space map with final free space info */
	lazy_update_fsm(onerel, vacrelstats);

	/* Update statistics in pg_class */
	vac_update_relstats(RelationGetRelid(onerel),
						vacrelstats->rel_pages,
						vacrelstats->rel_tuples,
						hasindex);

	/* report results to the stats collector, too */
	pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared,
						 vacstmt->analyze, vacrelstats->rel_tuples);
}


/*
 *	lazy_scan_heap() -- scan an open heap relation
 *
 *		This routine sets commit status bits, builds lists of dead tuples
 *		and pages with free space, and calculates statistics on the number
 *		of live tuples in the heap.  When done, or when we run low on space
 *		for dead-tuple TIDs, invoke vacuuming of indexes and heap.
 */
static void
lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
			   Relation *Irel, int nindexes)
{
	BlockNumber nblocks,
				blkno;
	HeapTupleData tuple;
	char	   *relname;
	BlockNumber empty_pages;
	double		num_tuples,
				tups_vacuumed,
				nkeep,
				nunused;
	double	   *index_tups_vacuumed;
	BlockNumber *index_pages_removed;
	bool		did_vacuum_index = false;
	int			i;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	relname = RelationGetRelationName(onerel);
	ereport(elevel,
			(errmsg("vacuuming \"%s.%s\"",
					get_namespace_name(RelationGetNamespace(onerel)),
					relname)));

	empty_pages = 0;
	num_tuples = tups_vacuumed = nkeep = nunused = 0;

	/*
	 * Because index vacuuming is done in multiple passes, we have to keep
	 * track of the total number of rows and pages removed from each index.
	 * index_tups_vacuumed[i] is the number removed so far from the i'th
	 * index.  (For partial indexes this could well be different from
	 * tups_vacuumed.)  Likewise for index_pages_removed[i].
	 */
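	/*
	 * (For example, if the TID array fills up twice during the scan, each
	 * index gets two mid-scan ambulkdelete passes plus a final one if any
	 * dead tuples remain; the per-pass counts are summed into these arrays
	 * so the per-index log messages report grand totals.)
	 */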
	index_tups_vacuumed = (double *) palloc0(nindexes * sizeof(double));
	index_pages_removed = (BlockNumber *) palloc0(nindexes * sizeof(BlockNumber));

	nblocks = RelationGetNumberOfBlocks(onerel);
	vacrelstats->rel_pages = nblocks;
	vacrelstats->nonempty_pages = 0;

	lazy_space_alloc(vacrelstats, nblocks);

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf;
		Page		page;
		OffsetNumber offnum,
					maxoff;
		bool		pgchanged,
					tupgone,
					hastup;
		int			prev_dead_count;

		vacuum_delay_point();

		/*
		 * If we are close to overrunning the available space for dead-tuple
		 * TIDs, pause and do a cycle of vacuuming before we tackle this page.
		 */
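		/*
		 * (A single page can add at most MaxHeapTuplesPerPage entries, so
		 * requiring that much free headroom before starting a page
		 * guarantees the array cannot overflow partway through it.)
		 */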
		if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
			vacrelstats->num_dead_tuples > 0)
		{
			/* Remove index entries */
			for (i = 0; i < nindexes; i++)
				lazy_vacuum_index(Irel[i],
								  &index_tups_vacuumed[i],
								  &index_pages_removed[i],
								  vacrelstats);
			did_vacuum_index = true;
			/* Remove tuples from heap */
			lazy_vacuum_heap(onerel, vacrelstats);
			/* Forget the now-vacuumed tuples, and press on */
			vacrelstats->num_dead_tuples = 0;
		}

		buf = ReadBuffer(onerel, blkno);

		/* In this phase we only need shared access to the buffer */
		LockBuffer(buf, BUFFER_LOCK_SHARE);

		page = BufferGetPage(buf);

		if (PageIsNew(page))
		{
			/*
			 * An all-zeroes page could be left over if a backend extends the
			 * relation but crashes before initializing the page.  Reclaim
			 * such pages for use.
			 *
			 * We have to be careful here because we could be looking at a
			 * page that someone has just added to the relation and not yet
			 * been able to initialize (see RelationGetBufferForTuple).  To
			 * interlock against that, release the buffer read lock (which we
			 * must do anyway) and grab the relation extension lock before
			 * re-locking in exclusive mode.  If the page is still
			 * uninitialized by then, it must be left over from a crashed
			 * backend, and we can initialize it.
			 *
			 * We don't really need the relation lock when this is a new or
			 * temp relation, but it's probably not worth the code space to
			 * check that, since this surely isn't a critical path.
			 *
			 * Note: the comparable code in vacuum.c need not worry because
			 * it's got exclusive lock on the whole relation.
			 */
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			LockRelationForExtension(onerel, ExclusiveLock);
			UnlockRelationForExtension(onerel, ExclusiveLock);
			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
			if (PageIsNew(page))
			{
				ereport(WARNING,
						(errmsg("relation \"%s\" page %u is uninitialized --- fixing",
								relname, blkno)));
				PageInit(page, BufferGetPageSize(buf), 0);
				empty_pages++;
				lazy_record_free_space(vacrelstats, blkno,
									   PageGetFreeSpace(page));
			}
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			WriteBuffer(buf);
			continue;
		}
		if (PageIsEmpty(page))
		{
			empty_pages++;
			lazy_record_free_space(vacrelstats, blkno,
								   PageGetFreeSpace(page));
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			ReleaseBuffer(buf);
			continue;
		}
		pgchanged = false;
		hastup = false;
		prev_dead_count = vacrelstats->num_dead_tuples;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			ItemId		itemid;

			itemid = PageGetItemId(page, offnum);

			if (!ItemIdIsUsed(itemid))
			{
				nunused += 1;
				continue;
			}

			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple.t_len = ItemIdGetLength(itemid);
			ItemPointerSet(&(tuple.t_self), blkno, offnum);

			tupgone = false;

			switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf))
			{
				case HEAPTUPLE_DEAD:
					tupgone = true;		/* we can delete the tuple */
					break;
				case HEAPTUPLE_LIVE:

					/*
					 * Tuple is good.  Consider whether to replace its xmin
					 * value with FrozenTransactionId.
					 *
					 * NB: Since we hold only a shared buffer lock here, we
					 * are assuming that TransactionId read/write is atomic.
					 * This is not the only place that makes such an
					 * assumption.  It'd be possible to avoid the assumption
					 * by momentarily acquiring exclusive lock, but for the
					 * moment I see no need to.
					 */
					if (TransactionIdIsNormal(HeapTupleHeaderGetXmin(tuple.t_data)) &&
						TransactionIdPrecedes(HeapTupleHeaderGetXmin(tuple.t_data),
											  FreezeLimit))
					{
						HeapTupleHeaderSetXmin(tuple.t_data, FrozenTransactionId);
						/* infomask should be okay already */
						Assert(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED);
						pgchanged = true;
					}

					/* Other checks... */
					if (onerel->rd_rel->relhasoids &&
						!OidIsValid(HeapTupleGetOid(&tuple)))
						elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid",
							 relname, blkno, offnum);
					break;
				case HEAPTUPLE_RECENTLY_DEAD:

					/*
					 * If tuple is recently deleted then we must not remove it
					 * from relation.
					 */
					nkeep += 1;
					break;
				case HEAPTUPLE_INSERT_IN_PROGRESS:
					/* This is an expected case during concurrent vacuum */
					break;
				case HEAPTUPLE_DELETE_IN_PROGRESS:
					/* This is an expected case during concurrent vacuum */
					break;
				default:
					elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
					break;
			}

			if (tupgone)
			{
				lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
				tups_vacuumed += 1;
			}
			else
			{
				num_tuples += 1;
				hastup = true;
			}
		}						/* scan along page */

		/*
		 * If we remembered any tuples for deletion, then the page will be
		 * visited again by lazy_vacuum_heap, which will compute and record
		 * its post-compaction free space.  If not, then we're done with this
		 * page, so remember its free space as-is.
		 */
		if (vacrelstats->num_dead_tuples == prev_dead_count)
		{
			lazy_record_free_space(vacrelstats, blkno,
								   PageGetFreeSpace(page));
		}

		/* Remember the location of the last page with nonremovable tuples */
		if (hastup)
			vacrelstats->nonempty_pages = blkno + 1;

		LockBuffer(buf, BUFFER_LOCK_UNLOCK);

		if (pgchanged)
			WriteBuffer(buf);
		else
			ReleaseBuffer(buf);
	}

	/* save stats for use later */
	vacrelstats->rel_tuples = num_tuples;
	vacrelstats->tuples_deleted = tups_vacuumed;

	/* If any tuples need to be deleted, perform final vacuum cycle */
	/* XXX put a threshold on min number of tuples here? */
	if (vacrelstats->num_dead_tuples > 0)
	{
		/* Remove index entries */
		for (i = 0; i < nindexes; i++)
			lazy_vacuum_index(Irel[i],
							  &index_tups_vacuumed[i],
							  &index_pages_removed[i],
							  vacrelstats);
		/* Remove tuples from heap */
		lazy_vacuum_heap(onerel, vacrelstats);
	}
	else if (!did_vacuum_index)
	{
		/* Must do post-vacuum cleanup and statistics update anyway */
		for (i = 0; i < nindexes; i++)
			lazy_scan_index(Irel[i], vacrelstats);
	}

	ereport(elevel,
			(errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
					RelationGetRelationName(onerel),
					tups_vacuumed, num_tuples, nblocks),
			 errdetail("%.0f dead row versions cannot be removed yet.\n"
					   "There were %.0f unused item pointers.\n"
					   "%u pages are entirely empty.\n"
					   "%s.",
					   nkeep, nunused, empty_pages,
					   pg_rusage_show(&ru0))));
}


/*
 *	lazy_vacuum_heap() -- second pass over the heap
 *
 *		This routine marks dead tuples as unused and compacts out free
 *		space on their pages.  Pages not having dead tuples recorded from
 *		lazy_scan_heap are not visited at all.
 *
 *		Note: the reason for doing this as a second pass is we cannot remove
 *		the tuples until we've removed their index entries, and we want to
 *		process index entry removal in batches as large as possible.
 */
static void
lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
{
	int			tupindex;
	int			npages;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);
	npages = 0;

	tupindex = 0;
	while (tupindex < vacrelstats->num_dead_tuples)
	{
		BlockNumber tblk;
		Buffer		buf;
		Page		page;

		vacuum_delay_point();

		tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
		buf = ReadBuffer(onerel, tblk);
		LockBufferForCleanup(buf);
		tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats);
		/* Now that we've compacted the page, record its available space */
		page = BufferGetPage(buf);
		lazy_record_free_space(vacrelstats, tblk,
							   PageGetFreeSpace(page));
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		WriteBuffer(buf);
		npages++;
	}

	ereport(elevel,
			(errmsg("\"%s\": removed %d row versions in %d pages",
					RelationGetRelationName(onerel),
					tupindex, npages),
			 errdetail("%s.",
					   pg_rusage_show(&ru0))));
}


/*
 *	lazy_vacuum_page() -- free dead tuples on a page
 *					 and repair its fragmentation.
 *
 * Caller is expected to handle reading, locking, and writing the buffer.
 *
 * tupindex is the index in vacrelstats->dead_tuples of the first dead
 * tuple for this page.  We assume the rest follow sequentially.
 * The return value is the first tupindex after the tuples of this page.
 */
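/*
 * (Because dead_tuples is collected in physical scan order, all entries for
 * a given block are adjacent in the array; the loop below just stops at the
 * first entry whose block number differs, and lazy_vacuum_heap resumes from
 * the returned index.)
 */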
static int
lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
				 int tupindex, LVRelStats *vacrelstats)
{
	OffsetNumber unused[MaxOffsetNumber];
	int			uncnt;
	Page		page = BufferGetPage(buffer);
	ItemId		itemid;

	START_CRIT_SECTION();
	for (; tupindex < vacrelstats->num_dead_tuples; tupindex++)
	{
		BlockNumber tblk;
		OffsetNumber toff;

		tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
		if (tblk != blkno)
			break;				/* past end of tuples for this block */
		toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]);
		itemid = PageGetItemId(page, toff);
		itemid->lp_flags &= ~LP_USED;
	}

	uncnt = PageRepairFragmentation(page, unused);

	/* XLOG stuff */
	if (!onerel->rd_istemp)
	{
		XLogRecPtr	recptr;

		recptr = log_heap_clean(onerel, buffer, unused, uncnt);
		PageSetLSN(page, recptr);
		PageSetTLI(page, ThisTimeLineID);
	}
	else
	{
		/* No XLOG record, but still need to flag that XID exists on disk */
		MyXactMadeTempRelUpdate = true;
	}

	END_CRIT_SECTION();

	return tupindex;
}


/*
 *	lazy_scan_index() -- scan one index relation to update pg_class statistic.
 *
 * We use this when we have no deletions to do.
 */
static void
lazy_scan_index(Relation indrel, LVRelStats *vacrelstats)
{
	IndexBulkDeleteResult *stats;
	IndexVacuumCleanupInfo vcinfo;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	/*
	 * Acquire appropriate type of lock on index: must be exclusive if index
	 * AM isn't concurrent-safe.
	 */
	if (indrel->rd_am->amconcurrent)
		LockRelation(indrel, RowExclusiveLock);
	else
		LockRelation(indrel, AccessExclusiveLock);

	/*
	 * Even though we're not planning to delete anything, we use the
	 * ambulkdelete call, because (a) the scan happens within the index AM
	 * for more speed, and (b) it may want to pass private statistics to the
	 * amvacuumcleanup call.
	 */
	stats = index_bulk_delete(indrel, dummy_tid_reaped, NULL);

	/* Do post-VACUUM cleanup, even though we deleted nothing */
	vcinfo.vacuum_full = false;
	vcinfo.message_level = elevel;
	vcinfo.num_heap_tuples = vacrelstats->rel_tuples;

	stats = index_vacuum_cleanup(indrel, &vcinfo, stats);

	/*
	 * Release lock acquired above.
	 */
	if (indrel->rd_am->amconcurrent)
		UnlockRelation(indrel, RowExclusiveLock);
	else
		UnlockRelation(indrel, AccessExclusiveLock);

	if (!stats)
		return;

	/* now update statistics in pg_class */
	vac_update_relstats(RelationGetRelid(indrel),
						stats->num_pages,
						stats->num_index_tuples,
						false);

	ereport(elevel,
			(errmsg("index \"%s\" now contains %.0f row versions in %u pages",
					RelationGetRelationName(indrel),
					stats->num_index_tuples,
					stats->num_pages),
			 errdetail("%u index pages have been deleted, %u are currently reusable.\n"
					   "%s.",
					   stats->pages_deleted, stats->pages_free,
					   pg_rusage_show(&ru0))));

	pfree(stats);
}


/*
 *	lazy_vacuum_index() -- vacuum one index relation.
 *
 *		Delete all the index entries pointing to tuples listed in
 *		vacrelstats->dead_tuples.
 *
 *		Increment *index_tups_vacuumed by the number of index entries
 *		removed, and *index_pages_removed by the number of pages removed.
 *
 *		Finally, we arrange to update the index relation's statistics in
 *		pg_class.
 */
static void
lazy_vacuum_index(Relation indrel,
				  double *index_tups_vacuumed,
				  BlockNumber *index_pages_removed,
				  LVRelStats *vacrelstats)
{
	IndexBulkDeleteResult *stats;
	IndexVacuumCleanupInfo vcinfo;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	/*
	 * Acquire appropriate type of lock on index: must be exclusive if index
	 * AM isn't concurrent-safe.
	 */
	if (indrel->rd_am->amconcurrent)
		LockRelation(indrel, RowExclusiveLock);
	else
		LockRelation(indrel, AccessExclusiveLock);

	/* Do bulk deletion */
	stats = index_bulk_delete(indrel, lazy_tid_reaped, (void *) vacrelstats);

	/* Do post-VACUUM cleanup */
	vcinfo.vacuum_full = false;
	vcinfo.message_level = elevel;
	/* We don't yet know rel_tuples, so pass -1 */
	/* index_bulk_delete can't have skipped scan anyway ... */
	vcinfo.num_heap_tuples = -1;

	stats = index_vacuum_cleanup(indrel, &vcinfo, stats);

	/*
	 * Release lock acquired above.
	 */
	if (indrel->rd_am->amconcurrent)
		UnlockRelation(indrel, RowExclusiveLock);
	else
		UnlockRelation(indrel, AccessExclusiveLock);

	if (!stats)
		return;

	/* accumulate total removed over multiple index-cleaning cycles */
	*index_tups_vacuumed += stats->tuples_removed;
	*index_pages_removed += stats->pages_removed;

	/* now update statistics in pg_class */
	vac_update_relstats(RelationGetRelid(indrel),
						stats->num_pages,
						stats->num_index_tuples,
						false);

	ereport(elevel,
			(errmsg("index \"%s\" now contains %.0f row versions in %u pages",
					RelationGetRelationName(indrel),
					stats->num_index_tuples,
					stats->num_pages),
			 errdetail("%.0f index row versions were removed.\n"
					   "%u index pages have been deleted, %u are currently reusable.\n"
					   "%s.",
					   stats->tuples_removed,
					   stats->pages_deleted, stats->pages_free,
					   pg_rusage_show(&ru0))));

	pfree(stats);
}


/*
 * lazy_truncate_heap - try to truncate off any empty pages at the end
 */
static void
lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats)
{
	BlockNumber old_rel_pages = vacrelstats->rel_pages;
	BlockNumber new_rel_pages;
	PageFreeSpaceInfo *pageSpaces;
	int			n;
	int			i,
				j;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	/*
	 * We need full exclusive lock on the relation in order to do truncation.
	 * If we can't get it, give up rather than waiting --- we don't want to
	 * block other backends, and we don't want to deadlock (which is quite
	 * possible considering we already hold a lower-grade lock).
	 */
	if (!ConditionalLockRelation(onerel, AccessExclusiveLock))
		return;

	/*
	 * Now that we have exclusive lock, look to see if the rel has grown
	 * whilst we were vacuuming with non-exclusive lock.  If so, give up; the
	 * newly added pages presumably contain non-deletable tuples.
	 */
	new_rel_pages = RelationGetNumberOfBlocks(onerel);
	if (new_rel_pages != old_rel_pages)
	{
		/* might as well use the latest news when we update pg_class stats */
		vacrelstats->rel_pages = new_rel_pages;
		UnlockRelation(onerel, AccessExclusiveLock);
		return;
	}

	/*
	 * Scan backwards from the end to verify that the end pages actually
	 * contain nothing we need to keep.  This is *necessary*, not optional,
	 * because other backends could have added tuples to these pages whilst
	 * we were vacuuming.
	 */
	new_rel_pages = count_nondeletable_pages(onerel, vacrelstats);

	if (new_rel_pages >= old_rel_pages)
	{
		/* can't do anything after all */
		UnlockRelation(onerel, AccessExclusiveLock);
		return;
	}

	/*
	 * Okay to truncate.
	 */
	RelationTruncate(onerel, new_rel_pages);

	/*
	 * Drop free-space info for removed blocks; these must not get entered
	 * into the FSM!
	 */
	pageSpaces = vacrelstats->free_pages;
	n = vacrelstats->num_free_pages;
	j = 0;
	for (i = 0; i < n; i++)
	{
		if (pageSpaces[i].blkno < new_rel_pages)
		{
			pageSpaces[j] = pageSpaces[i];
			j++;
		}
	}
	vacrelstats->num_free_pages = j;
	/* We destroyed the heap ordering, so mark array unordered */
	vacrelstats->fs_is_heap = false;

	/* update statistics */
	vacrelstats->rel_pages = new_rel_pages;
	vacrelstats->pages_removed = old_rel_pages - new_rel_pages;

	/*
	 * We keep the exclusive lock until commit (perhaps not necessary)?
	 */

	ereport(elevel,
			(errmsg("\"%s\": truncated %u to %u pages",
					RelationGetRelationName(onerel),
					old_rel_pages, new_rel_pages),
			 errdetail("%s.",
					   pg_rusage_show(&ru0))));
}


/*
 * Rescan end pages to verify that they are (still) empty of needed tuples.
 *
 * Returns number of nondeletable pages (last nonempty page + 1).
 */
static BlockNumber
count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats)
{
	BlockNumber blkno;
	HeapTupleData tuple;

	/* Strange coding of loop control is needed because blkno is unsigned */
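	/*
	 * (A conventional "for (blkno = rel_pages - 1; blkno >= nonempty_pages;
	 * blkno--)" would never terminate when nonempty_pages is zero, because
	 * an unsigned blkno wraps around instead of going negative; hence the
	 * decrement inside the loop below.)
	 */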
	blkno = vacrelstats->rel_pages;

	while (blkno > vacrelstats->nonempty_pages)
	{
		Buffer		buf;
		Page		page;
		OffsetNumber offnum,
					maxoff;
		bool		tupgone,
					hastup;

		vacuum_delay_point();

		blkno--;

		buf = ReadBuffer(onerel, blkno);

		/* In this phase we only need shared access to the buffer */
		LockBuffer(buf, BUFFER_LOCK_SHARE);

		page = BufferGetPage(buf);

		if (PageIsNew(page) || PageIsEmpty(page))
		{
			/* PageIsNew probably shouldn't happen... */
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			ReleaseBuffer(buf);
			continue;
		}
		hastup = false;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			ItemId		itemid;

			itemid = PageGetItemId(page, offnum);

			if (!ItemIdIsUsed(itemid))
				continue;

			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple.t_len = ItemIdGetLength(itemid);
			ItemPointerSet(&(tuple.t_self), blkno, offnum);

			tupgone = false;

			switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf))
			{
				case HEAPTUPLE_DEAD:
					tupgone = true;		/* we can delete the tuple */
					break;
				case HEAPTUPLE_LIVE:
					/* Shouldn't be necessary to re-freeze anything */
					break;
				case HEAPTUPLE_RECENTLY_DEAD:

					/*
					 * If tuple is recently deleted then we must not remove it
					 * from relation.
					 */
					break;
				case HEAPTUPLE_INSERT_IN_PROGRESS:
					/* This is an expected case during concurrent vacuum */
					break;
				case HEAPTUPLE_DELETE_IN_PROGRESS:
					/* This is an expected case during concurrent vacuum */
					break;
				default:
					elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
					break;
			}

			if (!tupgone)
			{
				hastup = true;
				break;			/* can stop scanning */
			}
		}						/* scan along page */

		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		ReleaseBuffer(buf);

		/* Done scanning if we found a tuple here */
		if (hastup)
			return blkno + 1;
	}

	/*
	 * If we fall out of the loop, all the previously-thought-to-be-empty
	 * pages really are; we need not bother to look at the last known-nonempty
	 * page.
	 */
	return vacrelstats->nonempty_pages;
}


/*
 * lazy_space_alloc - space allocation decisions for lazy vacuum
 *
 * See the comments at the head of this file for rationale.
 */
static void
lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks)
{
	long		maxtuples;
	int			maxpages;

	maxtuples = (maintenance_work_mem * 1024L) / sizeof(ItemPointerData);
	maxtuples = Min(maxtuples, INT_MAX);
	/* stay sane if small maintenance_work_mem */
	maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);
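	/*
	 * (Example: maintenance_work_mem = 16384, i.e. 16MB, yields
	 * 16384 * 1024 / 6 = 2796202 TIDs, comfortably between
	 * MaxHeapTuplesPerPage and INT_MAX, so neither clamp fires at typical
	 * settings.)
	 */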

	vacrelstats->num_dead_tuples = 0;
	vacrelstats->max_dead_tuples = (int) maxtuples;
	vacrelstats->dead_tuples = (ItemPointer)
		palloc(maxtuples * sizeof(ItemPointerData));

	maxpages = MaxFSMPages;
	/* No need to allocate more pages than the relation has blocks */
	if (relblocks < (BlockNumber) maxpages)
		maxpages = (int) relblocks;

	vacrelstats->fs_is_heap = false;
	vacrelstats->num_free_pages = 0;
	vacrelstats->max_free_pages = maxpages;
	vacrelstats->free_pages = (PageFreeSpaceInfo *)
		palloc(maxpages * sizeof(PageFreeSpaceInfo));
}


/*
 * lazy_record_dead_tuple - remember one deletable tuple
 */
static void
lazy_record_dead_tuple(LVRelStats *vacrelstats,
					   ItemPointer itemptr)
{
	/*
	 * The array shouldn't overflow under normal behavior, but perhaps it
	 * could if we are given a really small maintenance_work_mem.  In that
	 * case, just forget the last few tuples.
	 */
	if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples)
	{
		vacrelstats->dead_tuples[vacrelstats->num_dead_tuples] = *itemptr;
		vacrelstats->num_dead_tuples++;
	}
}


/*
 * lazy_record_free_space - remember free space on one page
 */
static void
lazy_record_free_space(LVRelStats *vacrelstats,
					   BlockNumber page, Size avail)
{
	PageFreeSpaceInfo *pageSpaces;
	int			n;

	/*
	 * A page with less than vacrelstats->threshold free space will be
	 * forgotten immediately, and never passed to the free space map.
	 * Removing the uselessly small entries early saves cycles, and in
	 * particular reduces the amount of time we spend holding the FSM lock
	 * when we finally call RecordRelationFreeSpace.  Since the FSM will
	 * probably drop pages with little free space anyway, there's no point
	 * in making this really small.
	 *
	 * XXX Is it worth trying to measure average tuple size, and using that
	 * to adjust the threshold?  Would be worthwhile if FSM has no stats yet
	 * for this relation.  But changing the threshold as we scan the rel
	 * might lead to bizarre behavior, too.  Also, it's probably better if
	 * vacuum.c has the same thresholding behavior as we do here.
	 */
	if (avail < vacrelstats->threshold)
		return;

	/* Copy pointers to local variables for notational simplicity */
	pageSpaces = vacrelstats->free_pages;
	n = vacrelstats->max_free_pages;

	/* If we haven't filled the array yet, just keep adding entries */
	if (vacrelstats->num_free_pages < n)
	{
		pageSpaces[vacrelstats->num_free_pages].blkno = page;
		pageSpaces[vacrelstats->num_free_pages].avail = avail;
		vacrelstats->num_free_pages++;
		return;
	}

	/*----------
	 * The rest of this routine works with "heap" organization of the
	 * free space arrays, wherein we maintain the heap property
	 *			avail[(j-1) div 2] <= avail[j]  for 0 < j < n.
	 * In particular, the zero'th element always has the smallest available
	 * space and can be discarded to make room for a new page with more space.
	 * See Knuth's discussion of heap-based priority queues, sec 5.2.3;
	 * but note he uses 1-origin array subscripts, not 0-origin.
	 *----------
	 */
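
	/*
	 * (Worked example: with n = 4 and avail = {10, 40, 20, 70}, the property
	 * holds and entry 0, avail 10, is the minimum.  Inserting a page with
	 * avail 30 replaces that minimum: the smaller child, avail 20, sifts up
	 * into slot 0 and the new page lands in slot 2, giving {20, 40, 30, 70}.)
	 */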

	/* If we haven't yet converted the array to heap organization, do it */
	if (!vacrelstats->fs_is_heap)
	{
		/*
		 * Scan backwards through the array, "sift-up" each value into its
		 * correct position.  We can start the scan at n/2-1 since each entry
		 * above that position has no children to worry about.
		 */
		int			l = n / 2;

		while (--l >= 0)
		{
			BlockNumber R = pageSpaces[l].blkno;
			Size		K = pageSpaces[l].avail;
			int			i;		/* i is where the "hole" is */

			i = l;
			for (;;)
			{
				int			j = 2 * i + 1;

				if (j >= n)
					break;
				if (j + 1 < n && pageSpaces[j].avail > pageSpaces[j + 1].avail)
					j++;
				if (K <= pageSpaces[j].avail)
					break;
				pageSpaces[i] = pageSpaces[j];
				i = j;
			}
			pageSpaces[i].blkno = R;
			pageSpaces[i].avail = K;
		}

		vacrelstats->fs_is_heap = true;
	}

	/* If new page has more than zero'th entry, insert it into heap */
	if (avail > pageSpaces[0].avail)
	{
		/*
		 * Notionally, we replace the zero'th entry with the new data, and
		 * then sift-up to maintain the heap property.  Physically, the new
		 * data doesn't get stored into the arrays until we find the right
		 * location for it.
		 */
		int			i = 0;		/* i is where the "hole" is */

		for (;;)
		{
			int			j = 2 * i + 1;

			if (j >= n)
				break;
			if (j + 1 < n && pageSpaces[j].avail > pageSpaces[j + 1].avail)
				j++;
			if (avail <= pageSpaces[j].avail)
				break;
			pageSpaces[i] = pageSpaces[j];
			i = j;
		}
		pageSpaces[i].blkno = page;
		pageSpaces[i].avail = avail;
	}
}


/*
 *	lazy_tid_reaped() -- is a particular tid deletable?
 *
 *		This has the right signature to be an IndexBulkDeleteCallback.
 *
 *		Assumes dead_tuples array is in sorted order.
 */
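/*
 * (The sorted order comes for free: lazy_scan_heap records TIDs in physical
 * block/offset order, which is exactly the ordering vac_cmp_itemptr defines,
 * so each callback probe is a binary search costing O(log N) comparisons.)
 */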
static bool
lazy_tid_reaped(ItemPointer itemptr, void *state)
{
	LVRelStats *vacrelstats = (LVRelStats *) state;
	ItemPointer res;

	res = (ItemPointer) bsearch((void *) itemptr,
								(void *) vacrelstats->dead_tuples,
								vacrelstats->num_dead_tuples,
								sizeof(ItemPointerData),
								vac_cmp_itemptr);

	return (res != NULL);
}

/*
 * Dummy version for lazy_scan_index.
 */
static bool
dummy_tid_reaped(ItemPointer itemptr, void *state)
{
	return false;
}

/*
 * Update the shared Free Space Map with the info we now have about
 * free space in the relation, discarding any old info the map may have.
 */
static void
lazy_update_fsm(Relation onerel, LVRelStats *vacrelstats)
{
	PageFreeSpaceInfo *pageSpaces = vacrelstats->free_pages;
	int			nPages = vacrelstats->num_free_pages;

	/*
	 * Sort data into order, as required by RecordRelationFreeSpace.
	 */
	if (nPages > 1)
		qsort(pageSpaces, nPages, sizeof(PageFreeSpaceInfo),
			  vac_cmp_page_spaces);

	RecordRelationFreeSpace(&onerel->rd_node, nPages, pageSpaces);
}

/*
 * Comparator routines for use with qsort() and bsearch().
 */
static int
vac_cmp_itemptr(const void *left, const void *right)
{
	BlockNumber lblk,
				rblk;
	OffsetNumber loff,
				roff;

	lblk = ItemPointerGetBlockNumber((ItemPointer) left);
	rblk = ItemPointerGetBlockNumber((ItemPointer) right);

	if (lblk < rblk)
		return -1;
	if (lblk > rblk)
		return 1;

	loff = ItemPointerGetOffsetNumber((ItemPointer) left);
	roff = ItemPointerGetOffsetNumber((ItemPointer) right);

	if (loff < roff)
		return -1;
	if (loff > roff)
		return 1;

	return 0;
}

static int
vac_cmp_page_spaces(const void *left, const void *right)
{
	PageFreeSpaceInfo *linfo = (PageFreeSpaceInfo *) left;
	PageFreeSpaceInfo *rinfo = (PageFreeSpaceInfo *) right;

	if (linfo->blkno < rinfo->blkno)
		return -1;
	else if (linfo->blkno > rinfo->blkno)
		return 1;
	return 0;
}