/*-------------------------------------------------------------------------
 *
 * vacuum.c
 *	  The postgres vacuum cleaner.
 *
 * This file includes the "full" version of VACUUM, as well as control code
 * used by all three of full VACUUM, lazy VACUUM, and ANALYZE.  See
 * vacuumlazy.c and analyze.c for the rest of the code for the latter two.
 *
 *
 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.360 2007/10/24 20:55:36 alvherre Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <sys/time.h>
#include <unistd.h>

#include "access/clog.h"
#include "access/genam.h"
#include "access/heapam.h"
#include "access/transam.h"
#include "access/xact.h"
#include "access/xlog.h"
#include "catalog/namespace.h"
#include "catalog/pg_database.h"
#include "commands/dbcommands.h"
#include "commands/vacuum.h"
#include "executor/executor.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/autovacuum.h"
#include "storage/freespace.h"
#include "storage/proc.h"
#include "storage/procarray.h"
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/flatfiles.h"
#include "utils/fmgroids.h"
#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/pg_rusage.h"
#include "utils/relcache.h"
#include "utils/syscache.h"
/*
 * GUC parameters
 */
int			vacuum_freeze_min_age;
/*
 * VacPage structures keep track of each page on which we find useful
 * amounts of free space.
 */
typedef struct VacPageData
{
	BlockNumber blkno;			/* BlockNumber of this Page */
	Size		free;			/* FreeSpace on this Page */
	uint16		offsets_used;	/* Number of OffNums used by vacuum */
	uint16		offsets_free;	/* Number of OffNums free or to be free */
	OffsetNumber offsets[1];	/* Array of free OffNums */
} VacPageData;

typedef VacPageData *VacPage;
typedef struct VacPageListData
{
	BlockNumber empty_end_pages;	/* Number of "empty" end-pages */
	int			num_pages;		/* Number of pages in pagedesc */
	int			num_allocated_pages;	/* Number of allocated pages in
										 * pagedesc */
	VacPage    *pagedesc;		/* Descriptions of pages */
} VacPageListData;

typedef VacPageListData *VacPageList;
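
/*
 * Illustrative note (added commentary, not from the original source):
 * offsets[1] above is an old-style variable-length trailing array, so a
 * VacPage is never allocated as a bare struct.  scan_heap() and
 * repair_frag() below size a workspace for the worst case, roughly
 *
 *		vacpage = (VacPage) palloc(sizeof(VacPageData) +
 *								   MaxOffsetNumber * sizeof(OffsetNumber));
 *
 * and copy_vac_page() then makes a just-large-enough copy of each entry
 * before it is kept in a VacPageList.
 */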
/*
 * The "vtlinks" array keeps information about each recently-updated tuple
 * ("recent" meaning its XMAX is too new to let us recycle the tuple).
 * We store the tuple's own TID as well as its t_ctid (its link to the next
 * newer tuple version).  Searching in this array allows us to follow update
 * chains backwards from newer to older tuples.  When we move a member of an
 * update chain, we must move *all* the live members of the chain, so that we
 * can maintain their t_ctid link relationships (we must not just overwrite
 * t_ctid in an existing tuple).
 *
 * Note: because t_ctid links can be stale (this would only occur if a prior
 * VACUUM crashed partway through), it is possible that new_tid points to an
 * empty slot or unrelated tuple.  We have to check the linkage as we follow
 * it, just as is done in EvalPlanQual.
 */
typedef struct VTupleLinkData
{
	ItemPointerData new_tid;	/* t_ctid of an updated tuple */
	ItemPointerData this_tid;	/* t_self of the tuple */
} VTupleLinkData;

typedef VTupleLinkData *VTupleLink;
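
/*
 * Illustrative sketch (added commentary, not from the original source):
 * since the array is kept sorted by new_tid, repair_frag() can look up the
 * parent of a tuple with a binary search keyed on the tuple's own TID,
 * along the lines of
 *
 *		vtlp = (VTupleLink) vac_bsearch((void *) &(tp.t_self),
 *										(void *) vacrelstats->vtlinks,
 *										vacrelstats->num_vtlinks,
 *										sizeof(VTupleLinkData),
 *										vac_cmp_vtlinks);
 *
 * A NULL result means the parent tuple is gone, and the chain move must be
 * abandoned (see the vtlinks comments above).
 */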
/*
 * We use an array of VTupleMoveData to plan a chain tuple move fully
 * before we do it.
 */
typedef struct VTupleMoveData
{
	ItemPointerData tid;		/* tuple ID */
	VacPage		vacpage;		/* where to move it to */
	bool		cleanVpd;		/* clean vacpage before using? */
} VTupleMoveData;

typedef VTupleMoveData *VTupleMove;
/*
 * VRelStats contains the data acquired by scan_heap for use later
 */
typedef struct VRelStats
{
	/* miscellaneous statistics */
	BlockNumber rel_pages;		/* pages in relation */
	double		rel_tuples;		/* tuples that remain after vacuuming */
	double		rel_indexed_tuples;		/* indexed tuples that remain */
	Size		min_tlen;		/* min surviving tuple size */
	Size		max_tlen;		/* max surviving tuple size */
	bool		hasindex;
	/* vtlinks array for tuple chain following - sorted by new_tid */
	int			num_vtlinks;
	VTupleLink	vtlinks;
} VRelStats;
/*----------------------------------------------------------------------
 * ExecContext:
 *
 * As these variables always appear together, we put them into one struct
 * and pull initialization and cleanup into separate routines.
 * ExecContext is used by repair_frag() and move_xxx_tuple().  More
 * accurately:  It is *used* only in move_xxx_tuple(), but because this
 * routine is called many times, we initialize the struct just once in
 * repair_frag() and pass it on to move_xxx_tuple().
 */
typedef struct ExecContextData
{
	ResultRelInfo *resultRelInfo;
	EState	   *estate;
	TupleTableSlot *slot;
} ExecContextData;

typedef ExecContextData *ExecContext;
static void
ExecContext_Init(ExecContext ec, Relation rel)
{
	TupleDesc	tupdesc = RelationGetDescr(rel);

	/*
	 * We need a ResultRelInfo and an EState so we can use the regular
	 * executor's index-entry-making machinery.
	 */
	ec->estate = CreateExecutorState();

	ec->resultRelInfo = makeNode(ResultRelInfo);
	ec->resultRelInfo->ri_RangeTableIndex = 1;	/* dummy */
	ec->resultRelInfo->ri_RelationDesc = rel;
	ec->resultRelInfo->ri_TrigDesc = NULL;		/* we don't fire triggers */

	ExecOpenIndices(ec->resultRelInfo);

	ec->estate->es_result_relations = ec->resultRelInfo;
	ec->estate->es_num_result_relations = 1;
	ec->estate->es_result_relation_info = ec->resultRelInfo;

	/* Set up a tuple slot too */
	ec->slot = MakeSingleTupleTableSlot(tupdesc);
}
static void
ExecContext_Finish(ExecContext ec)
{
	ExecDropSingleTupleTableSlot(ec->slot);
	ExecCloseIndices(ec->resultRelInfo);
	FreeExecutorState(ec->estate);
}

/*
 * End of ExecContext Implementation
 *----------------------------------------------------------------------
 */
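
/*
 * Usage sketch (added commentary, not from the original source): the
 * intended call pattern, per the comment above, is one init/finish pair
 * per relation rather than per tuple, roughly
 *
 *		ExecContextData ec;
 *
 *		ExecContext_Init(&ec, onerel);
 *		... move_chain_tuple()/move_plain_tuple(), many times ...
 *		ExecContext_Finish(&ec);
 *
 * so the executor state and open index list are built only once.
 */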
/* A few variables that don't seem worth passing around as parameters */
static MemoryContext vac_context = NULL;

static int	elevel = -1;

static TransactionId OldestXmin;
static TransactionId FreezeLimit;

static BufferAccessStrategy vac_strategy;
/* non-export function prototypes */
static List *get_rel_oids(List *relids, const RangeVar *vacrel,
			 const char *stmttype);
static void vac_truncate_clog(TransactionId frozenXID);
static void vacuum_rel(Oid relid, VacuumStmt *vacstmt, char expected_relkind);
static void full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt);
static void scan_heap(VRelStats *vacrelstats, Relation onerel,
		  VacPageList vacuum_pages, VacPageList fraged_pages);
static void repair_frag(VRelStats *vacrelstats, Relation onerel,
			VacPageList vacuum_pages, VacPageList fraged_pages,
			int nindexes, Relation *Irel);
static void move_chain_tuple(Relation rel,
				 Buffer old_buf, Page old_page, HeapTuple old_tup,
				 Buffer dst_buf, Page dst_page, VacPage dst_vacpage,
				 ExecContext ec, ItemPointer ctid, bool cleanVpd);
static void move_plain_tuple(Relation rel,
				 Buffer old_buf, Page old_page, HeapTuple old_tup,
				 Buffer dst_buf, Page dst_page, VacPage dst_vacpage,
				 ExecContext ec);
static void update_hint_bits(Relation rel, VacPageList fraged_pages,
				 int num_fraged_pages, BlockNumber last_move_dest_block,
				 int num_moved);
static void vacuum_heap(VRelStats *vacrelstats, Relation onerel,
			VacPageList vacpagelist);
static void vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage);
static void vacuum_index(VacPageList vacpagelist, Relation indrel,
			 double num_tuples, int keep_tuples);
static void scan_index(Relation indrel, double num_tuples);
static bool tid_reaped(ItemPointer itemptr, void *state);
static void vac_update_fsm(Relation onerel, VacPageList fraged_pages,
			   BlockNumber rel_pages);
static VacPage copy_vac_page(VacPage vacpage);
static void vpage_insert(VacPageList vacpagelist, VacPage vpnew);
static void *vac_bsearch(const void *key, const void *base,
			size_t nelem, size_t size,
			int (*compar) (const void *, const void *));
static int	vac_cmp_blk(const void *left, const void *right);
static int	vac_cmp_offno(const void *left, const void *right);
static int	vac_cmp_vtlinks(const void *left, const void *right);
static bool enough_space(VacPage vacpage, Size len);
static Size PageGetFreeSpaceWithFillFactor(Relation relation, Page page);
/****************************************************************************
 *																			*
 *			Code common to all flavors of VACUUM and ANALYZE				*
 *																			*
 ****************************************************************************
 */
/*
 * Primary entry point for VACUUM and ANALYZE commands.
 *
 * relids is normally NIL; if it is not, then it provides the list of
 * relation OIDs to be processed, and vacstmt->relation is ignored.
 * (The non-NIL case is currently only used by autovacuum.)
 *
 * bstrategy is normally given as NULL, but in autovacuum it can be passed
 * in to use the same buffer strategy object across multiple vacuum() calls.
 *
 * isTopLevel should be passed down from ProcessUtility.
 *
 * It is the caller's responsibility that vacstmt, relids, and bstrategy
 * (if given) be allocated in a memory context that won't disappear
 * at transaction commit.
 */
void
vacuum(VacuumStmt *vacstmt, List *relids,
	   BufferAccessStrategy bstrategy, bool isTopLevel)
{
	const char *stmttype = vacstmt->vacuum ? "VACUUM" : "ANALYZE";
	volatile MemoryContext anl_context = NULL;
	volatile bool all_rels,
				in_outer_xact,
				use_own_xacts;
	List	   *relations;

	if (vacstmt->verbose)
		elevel = INFO;
	else
		elevel = DEBUG2;

	/*
	 * We cannot run VACUUM inside a user transaction block; if we were inside
	 * a transaction, then our commit- and start-transaction-command calls
	 * would not have the intended effect!  Furthermore, the forced commit that
	 * occurs before truncating the relation's file would have the effect of
	 * committing the rest of the user's transaction too, which would
	 * certainly not be the desired behavior.  (This only applies to VACUUM
	 * FULL, though.  We could in theory run lazy VACUUM inside a transaction
	 * block, but we choose to disallow that case because we'd rather commit
	 * as soon as possible after finishing the vacuum.  This is mainly so that
	 * we can let go the AccessExclusiveLock that we may be holding.)
	 *
	 * ANALYZE (without VACUUM) can run either way.
	 */
	if (vacstmt->vacuum)
	{
		PreventTransactionChain(isTopLevel, stmttype);
		in_outer_xact = false;
	}
	else
		in_outer_xact = IsInTransactionChain(isTopLevel);
	/*
	 * Send info about dead objects to the statistics collector, unless we are
	 * in autovacuum --- autovacuum.c does this for itself.
	 */
	if (vacstmt->vacuum && !IsAutoVacuumWorkerProcess())
		pgstat_vacuum_tabstat();
	/*
	 * Create special memory context for cross-transaction storage.
	 *
	 * Since it is a child of PortalContext, it will go away eventually even
	 * if we suffer an error; there's no need for special abort cleanup logic.
	 */
	vac_context = AllocSetContextCreate(PortalContext,
										"Vacuum",
										ALLOCSET_DEFAULT_MINSIZE,
										ALLOCSET_DEFAULT_INITSIZE,
										ALLOCSET_DEFAULT_MAXSIZE);
	/*
	 * If caller didn't give us a buffer strategy object, make one in the
	 * cross-transaction memory context.
	 */
	if (bstrategy == NULL)
	{
		MemoryContext old_context = MemoryContextSwitchTo(vac_context);

		bstrategy = GetAccessStrategy(BAS_VACUUM);
		MemoryContextSwitchTo(old_context);
	}
	vac_strategy = bstrategy;
	/* Remember whether we are processing everything in the DB */
	all_rels = (relids == NIL && vacstmt->relation == NULL);

	/*
	 * Build list of relations to process, unless caller gave us one. (If we
	 * build one, we put it in vac_context for safekeeping.)
	 */
	relations = get_rel_oids(relids, vacstmt->relation, stmttype);
	/*
	 * Decide whether we need to start/commit our own transactions.
	 *
	 * For VACUUM (with or without ANALYZE): always do so, so that we can
	 * release locks as soon as possible.  (We could possibly use the outer
	 * transaction for a one-table VACUUM, but handling TOAST tables would be
	 * problematic.)
	 *
	 * For ANALYZE (no VACUUM): if inside a transaction block, we cannot
	 * start/commit our own transactions.  Also, there's no need to do so if
	 * only processing one relation.  For multiple relations when not within a
	 * transaction block, and also in an autovacuum worker, use own
	 * transactions so we can release locks sooner.
	 */
	if (vacstmt->vacuum)
		use_own_xacts = true;
	else
	{
		Assert(vacstmt->analyze);
		if (IsAutoVacuumWorkerProcess())
			use_own_xacts = true;
		else if (in_outer_xact)
			use_own_xacts = false;
		else if (list_length(relations) > 1)
			use_own_xacts = true;
		else
			use_own_xacts = false;
	}
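
	/*
	 * Added summary (not in the original): the net effect of the decision
	 * above is that use_own_xacts is true for every VACUUM, for ANALYZE in
	 * an autovacuum worker, and for a multi-table ANALYZE outside a
	 * transaction block; a single-table ANALYZE, or one inside a
	 * transaction block, runs in the caller's transaction.
	 */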
	/*
	 * If we are running ANALYZE without per-table transactions, we'll need a
	 * memory context with table lifetime.
	 */
	if (!use_own_xacts)
		anl_context = AllocSetContextCreate(PortalContext,
											"Analyze",
											ALLOCSET_DEFAULT_MINSIZE,
											ALLOCSET_DEFAULT_INITSIZE,
											ALLOCSET_DEFAULT_MAXSIZE);
	/*
	 * vacuum_rel expects to be entered with no transaction active; it will
	 * start and commit its own transaction.  But we are called by an SQL
	 * command, and so we are executing inside a transaction already.  We
	 * commit the transaction started in PostgresMain() here, and start
	 * another one before exiting to match the commit waiting for us back in
	 * PostgresMain().
	 */
	if (use_own_xacts)
	{
		/* matches the StartTransaction in PostgresMain() */
		CommitTransactionCommand();
	}
	/* Turn vacuum cost accounting on or off */
	PG_TRY();
	{
		ListCell   *cur;

		VacuumCostActive = (VacuumCostDelay > 0);
		VacuumCostBalance = 0;

		/*
		 * Loop to process each selected relation.
		 */
		foreach(cur, relations)
		{
			Oid			relid = lfirst_oid(cur);

			if (vacstmt->vacuum)
				vacuum_rel(relid, vacstmt, RELKIND_RELATION);

			if (vacstmt->analyze)
			{
				MemoryContext old_context = NULL;

				/*
				 * If using separate xacts, start one for analyze. Otherwise,
				 * we can use the outer transaction, but we still need to call
				 * analyze_rel in a memory context that will be cleaned up on
				 * return (else we leak memory while processing multiple
				 * tables).
				 */
				if (use_own_xacts)
				{
					StartTransactionCommand();
					/* functions in indexes may want a snapshot set */
					ActiveSnapshot = CopySnapshot(GetTransactionSnapshot());
				}
				else
					old_context = MemoryContextSwitchTo(anl_context);

				analyze_rel(relid, vacstmt, vac_strategy);

				if (use_own_xacts)
					CommitTransactionCommand();
				else
				{
					MemoryContextSwitchTo(old_context);
					MemoryContextResetAndDeleteChildren(anl_context);
				}
			}
		}
	}
	PG_CATCH();
	{
		/* Make sure cost accounting is turned off after error */
		VacuumCostActive = false;
		PG_RE_THROW();
	}
	PG_END_TRY();

	/* Turn off vacuum cost accounting */
	VacuumCostActive = false;
	/*
	 * Finish up processing.
	 */
	if (use_own_xacts)
	{
		/* here, we are not in a transaction */

		/*
		 * This matches the CommitTransaction waiting for us in
		 * PostgresMain().
		 */
		StartTransactionCommand();

		/*
		 * Re-establish the transaction snapshot.  This is wasted effort when
		 * we are called as a normal utility command, because the new
		 * transaction will be dropped immediately by PostgresMain(); but it's
		 * necessary if we are called from autovacuum because autovacuum might
		 * continue on to do an ANALYZE-only call.
		 */
		ActiveSnapshot = CopySnapshot(GetTransactionSnapshot());
	}

	if (vacstmt->vacuum && !IsAutoVacuumWorkerProcess())
	{
		/*
		 * Update pg_database.datfrozenxid, and truncate pg_clog if possible.
		 * (autovacuum.c does this for itself.)
		 */
		vac_update_datfrozenxid();

		/*
		 * If it was a database-wide VACUUM, print FSM usage statistics (we
		 * don't make you be superuser to see these).  We suppress this in
		 * autovacuum, too.
		 */
		if (all_rels)
			PrintFreeSpaceMapStatistics(elevel);
	}

	/*
	 * Clean up working storage --- note we must do this after
	 * StartTransactionCommand, else we might be trying to delete the active
	 * context!
	 */
	MemoryContextDelete(vac_context);
	vac_context = NULL;

	if (anl_context)
		MemoryContextDelete(anl_context);
}
/*
 * Build a list of Oids for each relation to be processed
 *
 * The list is built in vac_context so that it will survive across our
 * per-relation transactions.
 */
static List *
get_rel_oids(List *relids, const RangeVar *vacrel, const char *stmttype)
{
	List	   *oid_list = NIL;
	MemoryContext oldcontext;

	/* List supplied by VACUUM's caller? */
	if (relids)
		return relids;

	if (vacrel)
	{
		/* Process a specific relation */
		Oid			relid;

		relid = RangeVarGetRelid(vacrel, false);

		/* Make a relation list entry for this guy */
		oldcontext = MemoryContextSwitchTo(vac_context);
		oid_list = lappend_oid(oid_list, relid);
		MemoryContextSwitchTo(oldcontext);
	}
	else
	{
		/* Process all plain relations listed in pg_class */
		Relation	pgclass;
		HeapScanDesc scan;
		HeapTuple	tuple;
		ScanKeyData key;

		ScanKeyInit(&key,
					Anum_pg_class_relkind,
					BTEqualStrategyNumber, F_CHAREQ,
					CharGetDatum(RELKIND_RELATION));

		pgclass = heap_open(RelationRelationId, AccessShareLock);

		scan = heap_beginscan(pgclass, SnapshotNow, 1, &key);

		while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
		{
			/* Make a relation list entry for this guy */
			oldcontext = MemoryContextSwitchTo(vac_context);
			oid_list = lappend_oid(oid_list, HeapTupleGetOid(tuple));
			MemoryContextSwitchTo(oldcontext);
		}

		heap_endscan(scan);
		heap_close(pgclass, AccessShareLock);
	}

	return oid_list;
}
/*
 * vacuum_set_xid_limits() -- compute oldest-Xmin and freeze cutoff points
 */
void
vacuum_set_xid_limits(int freeze_min_age, bool sharedRel,
					  TransactionId *oldestXmin,
					  TransactionId *freezeLimit)
{
	int			freezemin;
	TransactionId limit;
	TransactionId safeLimit;

	/*
	 * We can always ignore processes running lazy vacuum.  This is because we
	 * use these values only for deciding which tuples we must keep in the
	 * tables.  Since lazy vacuum doesn't write its XID anywhere, it's
	 * safe to ignore it.  In theory it could be problematic to ignore lazy
	 * vacuums on a full vacuum, but keep in mind that only one vacuum process
	 * can be working on a particular table at any time, and that each vacuum
	 * is always an independent transaction.
	 */
	*oldestXmin = GetOldestXmin(sharedRel, true);

	Assert(TransactionIdIsNormal(*oldestXmin));

	/*
	 * Determine the minimum freeze age to use: as specified by the caller,
	 * or vacuum_freeze_min_age, but in any case not more than half
	 * autovacuum_freeze_max_age, so that autovacuums to prevent XID
	 * wraparound won't occur too frequently.
	 */
	freezemin = freeze_min_age;
	if (freezemin < 0)
		freezemin = vacuum_freeze_min_age;
	freezemin = Min(freezemin, autovacuum_freeze_max_age / 2);
	Assert(freezemin >= 0);

	/*
	 * Compute the cutoff XID, being careful not to generate a "permanent" XID
	 */
	limit = *oldestXmin - freezemin;
	if (!TransactionIdIsNormal(limit))
		limit = FirstNormalTransactionId;

	/*
	 * If oldestXmin is very far back (in practice, more than
	 * autovacuum_freeze_max_age / 2 XIDs old), complain and force a
	 * minimum freeze age of zero.
	 */
	safeLimit = ReadNewTransactionId() - autovacuum_freeze_max_age;
	if (!TransactionIdIsNormal(safeLimit))
		safeLimit = FirstNormalTransactionId;

	if (TransactionIdPrecedes(limit, safeLimit))
	{
		ereport(WARNING,
				(errmsg("oldest xmin is far in the past"),
				 errhint("Close open transactions soon to avoid wraparound problems.")));
		limit = *oldestXmin;
	}

	*freezeLimit = limit;
}
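
/*
 * Worked example (added commentary, not from the original source): with
 * the default settings vacuum_freeze_min_age = 100 million and
 * autovacuum_freeze_max_age = 200 million, a caller passing
 * freeze_min_age = -1 gets freezemin = Min(100M, 200M / 2) = 100 million.
 * If *oldestXmin is then 150 million, *freezeLimit comes out as 50
 * million, so only tuples with xmin before XID 50 million get frozen.
 * The safeLimit check forces a freeze age of zero only when oldestXmin
 * lags ReadNewTransactionId() by more than autovacuum_freeze_max_age.
 */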
/*
 *	vac_update_relstats() -- update statistics for one relation
 *
 *		Update the whole-relation statistics that are kept in its pg_class
 *		row.  There are additional stats that will be updated if we are
 *		doing ANALYZE, but we always update these stats.  This routine works
 *		for both index and heap relation entries in pg_class.
 *
 *		We violate transaction semantics here by overwriting the rel's
 *		existing pg_class tuple with the new values.  This is reasonably
 *		safe since the new values are correct whether or not this transaction
 *		commits.  The reason for this is that if we updated these tuples in
 *		the usual way, vacuuming pg_class itself wouldn't work very well ---
 *		by the time we got done with a vacuum cycle, most of the tuples in
 *		pg_class would've been obsoleted.  Of course, this only works for
 *		fixed-size never-null columns, but these are.
 *
 *		Another reason for doing it this way is that when we are in a lazy
 *		VACUUM and have PROC_IN_VACUUM set, we mustn't do any updates ---
 *		somebody vacuuming pg_class might think they could delete a tuple
 *		marked with xmin = our xid.
 *
 *		This routine is shared by full VACUUM, lazy VACUUM, and stand-alone
 *		ANALYZE.
 */
void
vac_update_relstats(Oid relid, BlockNumber num_pages, double num_tuples,
					bool hasindex, TransactionId frozenxid)
{
	Relation	rd;
	HeapTuple	ctup;
	Form_pg_class pgcform;
	bool		dirty;

	rd = heap_open(RelationRelationId, RowExclusiveLock);

	/* Fetch a copy of the tuple to scribble on */
	ctup = SearchSysCacheCopy(RELOID,
							  ObjectIdGetDatum(relid),
							  0, 0, 0);
	if (!HeapTupleIsValid(ctup))
		elog(ERROR, "pg_class entry for relid %u vanished during vacuuming",
			 relid);
	pgcform = (Form_pg_class) GETSTRUCT(ctup);

	/* Apply required updates, if any, to copied tuple */

	dirty = false;
	if (pgcform->relpages != (int32) num_pages)
	{
		pgcform->relpages = (int32) num_pages;
		dirty = true;
	}
	if (pgcform->reltuples != (float4) num_tuples)
	{
		pgcform->reltuples = (float4) num_tuples;
		dirty = true;
	}
	if (pgcform->relhasindex != hasindex)
	{
		pgcform->relhasindex = hasindex;
		dirty = true;
	}

	/*
	 * If we have discovered that there are no indexes, then there's no
	 * primary key either.  This could be done more thoroughly...
	 */
	if (!hasindex)
	{
		if (pgcform->relhaspkey)
		{
			pgcform->relhaspkey = false;
			dirty = true;
		}
	}

	/*
	 * relfrozenxid should never go backward.  Caller can pass
	 * InvalidTransactionId if it has no new data.
	 */
	if (TransactionIdIsNormal(frozenxid) &&
		TransactionIdPrecedes(pgcform->relfrozenxid, frozenxid))
	{
		pgcform->relfrozenxid = frozenxid;
		dirty = true;
	}

	/*
	 * If anything changed, write out the tuple.  Even if nothing changed,
	 * force relcache invalidation so all backends reset their rd_targblock
	 * --- otherwise it might point to a page we truncated away.
	 */
	if (dirty)
	{
		heap_inplace_update(rd, ctup);
		/* the above sends a cache inval message */
	}
	else
	{
		/* no need to change tuple, but force relcache inval anyway */
		CacheInvalidateRelcacheByTuple(ctup);
	}

	heap_close(rd, RowExclusiveLock);
}
/*
 *	vac_update_datfrozenxid() -- update pg_database.datfrozenxid for our DB
 *
 *		Update pg_database's datfrozenxid entry for our database to be the
 *		minimum of the pg_class.relfrozenxid values.  If we are able to
 *		advance pg_database.datfrozenxid, also try to truncate pg_clog.
 *
 *		We violate transaction semantics here by overwriting the database's
 *		existing pg_database tuple with the new value.  This is reasonably
 *		safe since the new value is correct whether or not this transaction
 *		commits.  As with vac_update_relstats, this avoids leaving dead tuples
 *		behind after a VACUUM.
 *
 *		This routine is shared by full and lazy VACUUM.
 */
void
vac_update_datfrozenxid(void)
{
	HeapTuple	tuple;
	Form_pg_database dbform;
	Relation	relation;
	SysScanDesc scan;
	HeapTuple	classTup;
	TransactionId newFrozenXid;
	bool		dirty = false;

	/*
	 * Initialize the "min" calculation with RecentGlobalXmin.  Any
	 * not-yet-committed pg_class entries for new tables must have
	 * relfrozenxid at least this high, because any other open xact must have
	 * RecentXmin >= its PGPROC.xmin >= our RecentGlobalXmin; see
	 * AddNewRelationTuple().  So we cannot produce a wrong minimum by
	 * starting with this.
	 */
	newFrozenXid = RecentGlobalXmin;

	/*
	 * We must seqscan pg_class to find the minimum Xid, because there is no
	 * index that can help us here.
	 */
	relation = heap_open(RelationRelationId, AccessShareLock);

	scan = systable_beginscan(relation, InvalidOid, false,
							  SnapshotNow, 0, NULL);

	while ((classTup = systable_getnext(scan)) != NULL)
	{
		Form_pg_class classForm = (Form_pg_class) GETSTRUCT(classTup);

		/*
		 * Only consider heap and TOAST tables (anything else should have
		 * InvalidTransactionId in relfrozenxid anyway.)
		 */
		if (classForm->relkind != RELKIND_RELATION &&
			classForm->relkind != RELKIND_TOASTVALUE)
			continue;

		Assert(TransactionIdIsNormal(classForm->relfrozenxid));

		if (TransactionIdPrecedes(classForm->relfrozenxid, newFrozenXid))
			newFrozenXid = classForm->relfrozenxid;
	}

	/* we're done with pg_class */
	systable_endscan(scan);
	heap_close(relation, AccessShareLock);

	Assert(TransactionIdIsNormal(newFrozenXid));

	/* Now fetch the pg_database tuple we need to update. */
	relation = heap_open(DatabaseRelationId, RowExclusiveLock);

	/* Fetch a copy of the tuple to scribble on */
	tuple = SearchSysCacheCopy(DATABASEOID,
							   ObjectIdGetDatum(MyDatabaseId),
							   0, 0, 0);
	if (!HeapTupleIsValid(tuple))
		elog(ERROR, "could not find tuple for database %u", MyDatabaseId);
	dbform = (Form_pg_database) GETSTRUCT(tuple);

	/*
	 * Don't allow datfrozenxid to go backward (probably can't happen anyway);
	 * and detect the common case where it doesn't go forward either.
	 */
	if (TransactionIdPrecedes(dbform->datfrozenxid, newFrozenXid))
	{
		dbform->datfrozenxid = newFrozenXid;
		dirty = true;
	}

	if (dirty)
		heap_inplace_update(relation, tuple);

	heap_freetuple(tuple);
	heap_close(relation, RowExclusiveLock);

	/*
	 * If we were able to advance datfrozenxid, mark the flat-file copy of
	 * pg_database for update at commit, and see if we can truncate pg_clog.
	 */
	if (dirty)
	{
		database_file_update_needed();
		vac_truncate_clog(newFrozenXid);
	}
}
/*
 *	vac_truncate_clog() -- attempt to truncate the commit log
 *
 *		Scan pg_database to determine the system-wide oldest datfrozenxid,
 *		and use it to truncate the transaction commit log (pg_clog).
 *		Also update the XID wrap limit info maintained by varsup.c.
 *
 *		The passed XID is simply the one I just wrote into my pg_database
 *		entry.  It's used to initialize the "min" calculation.
 *
 *		This routine is shared by full and lazy VACUUM.  Note that it's
 *		only invoked when we've managed to change our DB's datfrozenxid
 *		entry.
 */
static void
vac_truncate_clog(TransactionId frozenXID)
{
	TransactionId myXID = GetCurrentTransactionId();
	Relation	relation;
	HeapScanDesc scan;
	HeapTuple	tuple;
	NameData	oldest_datname;
	bool		frozenAlreadyWrapped = false;

	/* init oldest_datname to sync with my frozenXID */
	namestrcpy(&oldest_datname, get_database_name(MyDatabaseId));

	/*
	 * Scan pg_database to compute the minimum datfrozenxid
	 *
	 * Note: we need not worry about a race condition with new entries being
	 * inserted by CREATE DATABASE.  Any such entry will have a copy of some
	 * existing DB's datfrozenxid, and that source DB cannot be ours because
	 * of the interlock against copying a DB containing an active backend.
	 * Hence the new entry will not reduce the minimum.  Also, if two
	 * VACUUMs concurrently modify the datfrozenxid's of different databases,
	 * the worst possible outcome is that pg_clog is not truncated as
	 * aggressively as it could be.
	 */
	relation = heap_open(DatabaseRelationId, AccessShareLock);

	scan = heap_beginscan(relation, SnapshotNow, 0, NULL);

	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
	{
		Form_pg_database dbform = (Form_pg_database) GETSTRUCT(tuple);

		Assert(TransactionIdIsNormal(dbform->datfrozenxid));

		if (TransactionIdPrecedes(myXID, dbform->datfrozenxid))
			frozenAlreadyWrapped = true;
		else if (TransactionIdPrecedes(dbform->datfrozenxid, frozenXID))
		{
			frozenXID = dbform->datfrozenxid;
			namecpy(&oldest_datname, &dbform->datname);
		}
	}

	heap_endscan(scan);

	heap_close(relation, AccessShareLock);

	/*
	 * Do not truncate CLOG if we seem to have suffered wraparound already;
	 * the computed minimum XID might be bogus.  This case should now be
	 * impossible due to the defenses in GetNewTransactionId, but we keep the
	 * test anyway.
	 */
	if (frozenAlreadyWrapped)
	{
		ereport(WARNING,
				(errmsg("some databases have not been vacuumed in over 2 billion transactions"),
				 errdetail("You might have already suffered transaction-wraparound data loss.")));
		return;
	}

	/* Truncate CLOG to the oldest frozenxid */
	TruncateCLOG(frozenXID);

	/*
	 * Update the wrap limit for GetNewTransactionId.  Note: this function
	 * will also signal the postmaster for an(other) autovac cycle if needed.
	 */
	SetTransactionIdLimit(frozenXID, &oldest_datname);
}
/****************************************************************************
 *																			*
 *					Code common to both flavors of VACUUM					*
 *																			*
 ****************************************************************************
 */
/*
 *	vacuum_rel() -- vacuum one heap relation
 *
 *		Doing one heap at a time incurs extra overhead, since we need to
 *		check that the heap exists again just before we vacuum it.  The
 *		reason that we do this is so that vacuuming can be spread across
 *		many small transactions.  Otherwise, two-phase locking would require
 *		us to lock the entire database during one pass of the vacuum cleaner.
 *
 *		At entry and exit, we are not inside a transaction.
 */
static void
vacuum_rel(Oid relid, VacuumStmt *vacstmt, char expected_relkind)
{
	LOCKMODE	lmode;
	Relation	onerel;
	LockRelId	onerelid;
	Oid			toast_relid;

	/* Begin a transaction for vacuuming this relation */
	StartTransactionCommand();

	if (vacstmt->full)
	{
		/* functions in indexes may want a snapshot set */
		ActiveSnapshot = CopySnapshot(GetTransactionSnapshot());
	}
	else
	{
		/*
		 * During a lazy VACUUM we do not run any user-supplied functions, and
		 * so it should be safe to not create a transaction snapshot.
		 *
		 * We can furthermore set the PROC_IN_VACUUM flag, which lets other
		 * concurrent VACUUMs know that they can ignore this one while
		 * determining their OldestXmin.  (The reason we don't set it
		 * during a full VACUUM is exactly that we may have to run user-
		 * defined functions for functional indexes, and we want to make sure
		 * that if they use the snapshot set above, any tuples it requires
		 * can't get removed from other tables.  An index function that
		 * depends on the contents of other tables is arguably broken, but we
		 * won't break it here by violating transaction semantics.)
		 *
		 * Note: this flag remains set until CommitTransaction or
		 * AbortTransaction.  We don't want to clear it until we reset
		 * MyProc->xid/xmin, else OldestXmin might appear to go backwards,
		 * which is probably Not Good.
		 */
		LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
		MyProc->vacuumFlags |= PROC_IN_VACUUM;
		LWLockRelease(ProcArrayLock);
	}

	/*
	 * Check for user-requested abort.  Note we want this to be inside a
	 * transaction, so xact.c doesn't issue useless WARNING.
	 */
	CHECK_FOR_INTERRUPTS();
	/*
	 * Determine the type of lock we want --- hard exclusive lock for a FULL
	 * vacuum, but just ShareUpdateExclusiveLock for concurrent vacuum. Either
	 * way, we can be sure that no other backend is vacuuming the same table.
	 */
	lmode = vacstmt->full ? AccessExclusiveLock : ShareUpdateExclusiveLock;

	/*
	 * Open the relation and get the appropriate lock on it.
	 *
	 * There's a race condition here: the rel may have gone away since the
	 * last time we saw it.  If so, we don't need to vacuum it.
	 */
	onerel = try_relation_open(relid, lmode);

	if (!onerel)
	{
		CommitTransactionCommand();
		return;
	}

	/*
	 * Check permissions.
	 *
	 * We allow the user to vacuum a table if he is superuser, the table
	 * owner, or the database owner (but in the latter case, only if it's not
	 * a shared relation).  pg_class_ownercheck includes the superuser case.
	 *
	 * Note we choose to treat permissions failure as a WARNING and keep
	 * trying to vacuum the rest of the DB --- is this appropriate?
	 */
	if (!(pg_class_ownercheck(RelationGetRelid(onerel), GetUserId()) ||
		  (pg_database_ownercheck(MyDatabaseId, GetUserId()) && !onerel->rd_rel->relisshared)))
	{
		ereport(WARNING,
				(errmsg("skipping \"%s\" --- only table or database owner can vacuum it",
						RelationGetRelationName(onerel))));
		relation_close(onerel, lmode);
		CommitTransactionCommand();
		return;
	}

	/*
	 * Check that it's a plain table; we used to do this in get_rel_oids() but
	 * seems safer to check after we've locked the relation.
	 */
	if (onerel->rd_rel->relkind != expected_relkind)
	{
		ereport(WARNING,
				(errmsg("skipping \"%s\" --- cannot vacuum indexes, views, or special system tables",
						RelationGetRelationName(onerel))));
		relation_close(onerel, lmode);
		CommitTransactionCommand();
		return;
	}

	/*
	 * Silently ignore tables that are temp tables of other backends ---
	 * trying to vacuum these will lead to great unhappiness, since their
	 * contents are probably not up-to-date on disk.  (We don't throw a
	 * warning here; it would just lead to chatter during a database-wide
	 * VACUUM.)
	 */
	if (isOtherTempNamespace(RelationGetNamespace(onerel)))
	{
		relation_close(onerel, lmode);
		CommitTransactionCommand();
		return;
	}
	/*
	 * Get a session-level lock too.  This will protect our access to the
	 * relation across multiple transactions, so that we can vacuum the
	 * relation's TOAST table (if any) secure in the knowledge that no one is
	 * deleting the parent relation.
	 *
	 * NOTE: this cannot block, even if someone else is waiting for access,
	 * because the lock manager knows that both lock requests are from the
	 * same process.
	 */
	onerelid = onerel->rd_lockInfo.lockRelId;
	LockRelationIdForSession(&onerelid, lmode);

	/*
	 * Remember the relation's TOAST relation for later
	 */
	toast_relid = onerel->rd_rel->reltoastrelid;

	/*
	 * Do the actual work --- either FULL or "lazy" vacuum
	 */
	if (vacstmt->full)
		full_vacuum_rel(onerel, vacstmt);
	else
		lazy_vacuum_rel(onerel, vacstmt, vac_strategy);

	/* all done with this class, but hold lock until commit */
	relation_close(onerel, NoLock);

	/*
	 * Complete the transaction and free all temporary memory used.
	 */
	CommitTransactionCommand();

	/*
	 * If the relation has a secondary toast rel, vacuum that too while we
	 * still hold the session lock on the master table.  Note however that
	 * "analyze" will not get done on the toast table.  This is good, because
	 * the toaster always uses hardcoded index access and statistics are
	 * totally unimportant for toast relations.
	 */
	if (toast_relid != InvalidOid)
		vacuum_rel(toast_relid, vacstmt, RELKIND_TOASTVALUE);

	/*
	 * Now release the session-level lock on the master table.
	 */
	UnlockRelationIdForSession(&onerelid, lmode);
}
/****************************************************************************
 *																			*
 *						Code for VACUUM FULL (only)							*
 *																			*
 ****************************************************************************
 */
/*
 *	full_vacuum_rel() -- perform FULL VACUUM for one heap relation
 *
 *		This routine vacuums a single heap, cleans out its indexes, and
 *		updates its num_pages and num_tuples statistics.
 *
 *		At entry, we have already established a transaction and opened
 *		and locked the relation.
 */
static void
full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
{
	VacPageListData vacuum_pages;		/* List of pages to vacuum and/or
										 * clean indexes */
	VacPageListData fraged_pages;		/* List of pages with space enough for
										 * re-using */
	Relation   *Irel;
	int			nindexes,
				i;
	VRelStats  *vacrelstats;

	vacuum_set_xid_limits(vacstmt->freeze_min_age, onerel->rd_rel->relisshared,
						  &OldestXmin, &FreezeLimit);

	/*
	 * Flush any previous async-commit transactions.  This does not guarantee
	 * that we will be able to set hint bits for tuples they inserted, but
	 * it improves the probability, especially in simple sequential-commands
	 * cases.  See scan_heap() and repair_frag() for more about this.
	 */
	XLogAsyncCommitFlush();

	/*
	 * Set up statistics-gathering machinery.
	 */
	vacrelstats = (VRelStats *) palloc(sizeof(VRelStats));
	vacrelstats->rel_pages = 0;
	vacrelstats->rel_tuples = 0;
	vacrelstats->rel_indexed_tuples = 0;
	vacrelstats->hasindex = false;

	/* scan the heap */
	vacuum_pages.num_pages = fraged_pages.num_pages = 0;
	scan_heap(vacrelstats, onerel, &vacuum_pages, &fraged_pages);

	/* Now open all indexes of the relation */
	vac_open_indexes(onerel, AccessExclusiveLock, &nindexes, &Irel);
	if (nindexes > 0)
		vacrelstats->hasindex = true;

	/* Clean/scan index relation(s) */
	if (Irel != NULL)
	{
		if (vacuum_pages.num_pages > 0)
		{
			for (i = 0; i < nindexes; i++)
				vacuum_index(&vacuum_pages, Irel[i],
							 vacrelstats->rel_indexed_tuples, 0);
		}
		else
		{
			/* just scan indexes to update statistics */
			for (i = 0; i < nindexes; i++)
				scan_index(Irel[i], vacrelstats->rel_indexed_tuples);
		}
	}

	if (fraged_pages.num_pages > 0)
	{
		/* Try to shrink heap */
		repair_frag(vacrelstats, onerel, &vacuum_pages, &fraged_pages,
					nindexes, Irel);
		vac_close_indexes(nindexes, Irel, NoLock);
	}
	else
	{
		vac_close_indexes(nindexes, Irel, NoLock);
		if (vacuum_pages.num_pages > 0)
		{
			/* Clean pages from vacuum_pages list */
			vacuum_heap(vacrelstats, onerel, &vacuum_pages);
		}
	}

	/* update shared free space map with final free space info */
	vac_update_fsm(onerel, &fraged_pages, vacrelstats->rel_pages);

	/* update statistics in pg_class */
	vac_update_relstats(RelationGetRelid(onerel), vacrelstats->rel_pages,
						vacrelstats->rel_tuples, vacrelstats->hasindex,
						FreezeLimit);

	/* report results to the stats collector, too */
	pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared,
						 vacstmt->analyze, vacrelstats->rel_tuples);
}
/*
 *	scan_heap() -- scan an open heap relation
 *
 *		This routine sets commit status bits, constructs vacuum_pages (list
 *		of pages we need to compact free space on and/or clean indexes of
 *		deleted tuples), constructs fraged_pages (list of pages with free
 *		space that tuples could be moved into), and calculates statistics
 *		on the number of live tuples in the heap.
 */
static void
scan_heap(VRelStats *vacrelstats, Relation onerel,
		  VacPageList vacuum_pages, VacPageList fraged_pages)
{
	BlockNumber nblocks,
				blkno;
	char	   *relname;
	VacPage		vacpage;
	BlockNumber empty_pages,
				empty_end_pages;
	double		num_tuples,
				num_indexed_tuples,
				tups_vacuumed,
				nkeep,
				nunused;
	double		free_space,
				usable_free_space;
	Size		min_tlen = MaxHeapTupleSize;
	Size		max_tlen = 0;
	bool		do_shrinking = true;
	VTupleLink	vtlinks = (VTupleLink) palloc(100 * sizeof(VTupleLinkData));
	int			num_vtlinks = 0;
	int			free_vtlinks = 100;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	relname = RelationGetRelationName(onerel);
	ereport(elevel,
			(errmsg("vacuuming \"%s.%s\"",
					get_namespace_name(RelationGetNamespace(onerel)),
					relname)));

	empty_pages = empty_end_pages = 0;
	num_tuples = num_indexed_tuples = tups_vacuumed = nkeep = nunused = 0;
	free_space = 0;

	nblocks = RelationGetNumberOfBlocks(onerel);

	/*
	 * We initially create each VacPage item in a maximal-sized workspace,
	 * then copy the workspace into a just-large-enough copy.
	 */
	vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber));
	for (blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf;
		Page		page,
					tempPage = NULL;
		bool		do_reap,
					do_frag;
		OffsetNumber offnum,
					maxoff;
		bool		notup;
		OffsetNumber frozen[MaxOffsetNumber];
		int			nfrozen;

		vacuum_delay_point();

		buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy);
		page = BufferGetPage(buf);

		/*
		 * Since we are holding exclusive lock on the relation, no other
		 * backend can be accessing the page; however it is possible that the
		 * background writer will try to write the page if it's already marked
		 * dirty.  To ensure that invalid data doesn't get written to disk, we
		 * must take exclusive buffer lock wherever we potentially modify
		 * pages.  In fact, we insist on cleanup lock so that we can safely
		 * call heap_page_prune().  (This might be overkill, since the bgwriter
		 * pays no attention to individual tuples, but on the other hand it's
		 * unlikely that the bgwriter has this particular page pinned at this
		 * instant.  So violating the coding rule would buy us little anyway.)
		 */
		LockBufferForCleanup(buf);

		vacpage->blkno = blkno;
		vacpage->offsets_used = 0;
		vacpage->offsets_free = 0;

		if (PageIsNew(page))
		{
			VacPage		vacpagecopy;

			ereport(WARNING,
			   (errmsg("relation \"%s\" page %u is uninitialized --- fixing",
					   relname, blkno)));
			PageInit(page, BufferGetPageSize(buf), 0);
			MarkBufferDirty(buf);
			vacpage->free = PageGetFreeSpaceWithFillFactor(onerel, page);
			free_space += vacpage->free;
			empty_pages++;
			empty_end_pages++;
			vacpagecopy = copy_vac_page(vacpage);
			vpage_insert(vacuum_pages, vacpagecopy);
			vpage_insert(fraged_pages, vacpagecopy);
			UnlockReleaseBuffer(buf);
			continue;
		}
		if (PageIsEmpty(page))
		{
			VacPage		vacpagecopy;

			vacpage->free = PageGetFreeSpaceWithFillFactor(onerel, page);
			free_space += vacpage->free;
			empty_pages++;
			empty_end_pages++;
			vacpagecopy = copy_vac_page(vacpage);
			vpage_insert(vacuum_pages, vacpagecopy);
			vpage_insert(fraged_pages, vacpagecopy);
			UnlockReleaseBuffer(buf);
			continue;
		}
		/*
		 * Prune all HOT-update chains in this page.
		 *
		 * We use the redirect_move option so that redirecting line pointers
		 * get collapsed out; this allows us to not worry about them below.
		 *
		 * We count tuples removed by the pruning step as removed by VACUUM.
		 */
		tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin,
										 true, false);
		/*
		 * Now scan the page to collect vacuumable items and check for
		 * tuples requiring freezing.
		 */
		nfrozen = 0;
		notup = true;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			ItemId		itemid = PageGetItemId(page, offnum);
			bool		tupgone = false;
			HeapTupleData tuple;

			/*
			 * Collect un-used items too - it's possible to have indexes
			 * pointing here after crash.  (That's an ancient comment and
			 * is likely obsolete with WAL, but we might as well continue
			 * to check for such problems.)
			 */
			if (!ItemIdIsUsed(itemid))
			{
				vacpage->offsets[vacpage->offsets_free++] = offnum;
				nunused += 1;
				continue;
			}
			/*
			 * DEAD item pointers are to be vacuumed normally; but we don't
			 * count them in tups_vacuumed, else we'd be double-counting
			 * (at least in the common case where heap_page_prune() just
			 * freed up a non-HOT tuple).
			 */
			if (ItemIdIsDead(itemid))
			{
				vacpage->offsets[vacpage->offsets_free++] = offnum;
				continue;
			}

			/* Shouldn't have any redirected items anymore */
			if (!ItemIdIsNormal(itemid))
				elog(ERROR, "relation \"%s\" TID %u/%u: unexpected redirect item",
					 relname, blkno, offnum);

			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple.t_len = ItemIdGetLength(itemid);
			ItemPointerSet(&(tuple.t_self), blkno, offnum);
			switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf))
			{
				case HEAPTUPLE_LIVE:
					/* Tuple is good --- but let's do some validity checks */
					if (onerel->rd_rel->relhasoids &&
						!OidIsValid(HeapTupleGetOid(&tuple)))
						elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid",
							 relname, blkno, offnum);

					/*
					 * The shrinkage phase of VACUUM FULL requires that all
					 * live tuples have XMIN_COMMITTED set --- see comments in
					 * repair_frag()'s walk-along-page loop.  Use of async
					 * commit may prevent HeapTupleSatisfiesVacuum from
					 * setting the bit for a recently committed tuple.  Rather
					 * than trying to handle this corner case, we just give up
					 * and don't shrink.
					 */
					if (do_shrinking &&
						!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
					{
						ereport(LOG,
								(errmsg("relation \"%s\" TID %u/%u: XMIN_COMMITTED not set for transaction %u --- cannot shrink relation",
										relname, blkno, offnum,
										HeapTupleHeaderGetXmin(tuple.t_data))));
						do_shrinking = false;
					}
					break;
				case HEAPTUPLE_DEAD:

					/*
					 * Ordinarily, DEAD tuples would have been removed by
					 * heap_page_prune(), but it's possible that the tuple
					 * state changed since heap_page_prune() looked.  In
					 * particular an INSERT_IN_PROGRESS tuple could have
					 * changed to DEAD if the inserter aborted.  So this
					 * cannot be considered an error condition, though it
					 * does suggest that someone released a lock early.
					 *
					 * If the tuple is HOT-updated then it must only be
					 * removed by a prune operation; so we keep it as if it
					 * were RECENTLY_DEAD, and abandon shrinking.  (XXX is it
					 * worth trying to make the shrinking code smart enough
					 * to handle this?  It's an unusual corner case.)
					 *
					 * DEAD heap-only tuples can safely be removed if they
					 * aren't themselves HOT-updated, although this is a bit
					 * inefficient since we'll uselessly try to remove
					 * index entries for them.
					 */
					if (HeapTupleIsHotUpdated(&tuple))
					{
						nkeep += 1;
						if (do_shrinking)
							ereport(LOG,
									(errmsg("relation \"%s\" TID %u/%u: dead HOT-updated tuple --- cannot shrink relation",
											relname, blkno, offnum)));
						do_shrinking = false;
					}
					else
					{
						tupgone = true; /* we can delete the tuple */

						/*
						 * We need not require XMIN_COMMITTED or
						 * XMAX_COMMITTED to be set, since we will remove the
						 * tuple without any further examination of its hint
						 * bits.
						 */
					}
					break;
				case HEAPTUPLE_RECENTLY_DEAD:

					/*
					 * If tuple is recently deleted then we must not remove it
					 * from relation.
					 */
					nkeep += 1;

					/*
					 * As with the LIVE case, shrinkage requires XMIN_COMMITTED
					 * to be set.
					 */
					if (do_shrinking &&
						!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
					{
						ereport(LOG,
								(errmsg("relation \"%s\" TID %u/%u: XMIN_COMMITTED not set for transaction %u --- cannot shrink relation",
										relname, blkno, offnum,
										HeapTupleHeaderGetXmin(tuple.t_data))));
						do_shrinking = false;
					}

					/*
					 * If we do shrinking and this tuple is updated one then
					 * remember it to construct updated tuple dependencies.
					 */
					if (do_shrinking &&
						!(ItemPointerEquals(&(tuple.t_self),
											&(tuple.t_data->t_ctid))))
					{
						if (free_vtlinks == 0)
						{
							free_vtlinks = 1000;
							vtlinks = (VTupleLink) repalloc(vtlinks,
											   (free_vtlinks + num_vtlinks) *
												sizeof(VTupleLinkData));
						}
						vtlinks[num_vtlinks].new_tid = tuple.t_data->t_ctid;
						vtlinks[num_vtlinks].this_tid = tuple.t_self;
						free_vtlinks--;
						num_vtlinks++;
					}
					break;
				case HEAPTUPLE_INSERT_IN_PROGRESS:

					/*
					 * This should not happen, since we hold exclusive lock on
					 * the relation; shouldn't we raise an error?  (Actually,
					 * it can happen in system catalogs, since we tend to
					 * release write lock before commit there.)  As above,
					 * we can't apply repair_frag() if the tuple state is
					 * uncertain.
					 */
					if (do_shrinking)
						ereport(LOG,
								(errmsg("relation \"%s\" TID %u/%u: InsertTransactionInProgress %u --- cannot shrink relation",
										relname, blkno, offnum,
										HeapTupleHeaderGetXmin(tuple.t_data))));
					do_shrinking = false;
					break;
				case HEAPTUPLE_DELETE_IN_PROGRESS:

					/*
					 * This should not happen, since we hold exclusive lock on
					 * the relation; shouldn't we raise an error?  (Actually,
					 * it can happen in system catalogs, since we tend to
					 * release write lock before commit there.)  As above,
					 * we can't apply repair_frag() if the tuple state is
					 * uncertain.
					 */
					if (do_shrinking)
						ereport(LOG,
								(errmsg("relation \"%s\" TID %u/%u: DeleteTransactionInProgress %u --- cannot shrink relation",
										relname, blkno, offnum,
										HeapTupleHeaderGetXmax(tuple.t_data))));
					do_shrinking = false;
					break;
				default:
					elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
					break;
			}
			if (tupgone)
			{
				ItemId		lpp;

				/*
				 * Here we are building a temporary copy of the page with dead
				 * tuples removed.  Below we will apply
				 * PageRepairFragmentation to the copy, so that we can
				 * determine how much space will be available after removal of
				 * dead tuples.  But note we are NOT changing the real page
				 * yet...
				 */
				if (tempPage == NULL)
				{
					Size		pageSize;

					pageSize = PageGetPageSize(page);
					tempPage = (Page) palloc(pageSize);
					memcpy(tempPage, page, pageSize);
				}

				/* mark it unused on the temp page */
				lpp = PageGetItemId(tempPage, offnum);
				ItemIdSetUnused(lpp);

				vacpage->offsets[vacpage->offsets_free++] = offnum;
				tups_vacuumed += 1;
			}
			else
			{
				num_tuples += 1;
				if (!HeapTupleIsHeapOnly(&tuple))
					num_indexed_tuples += 1;
				notup = false;
				if (tuple.t_len < min_tlen)
					min_tlen = tuple.t_len;
				if (tuple.t_len > max_tlen)
					max_tlen = tuple.t_len;

				/*
				 * Each non-removable tuple must be checked to see if it
				 * needs freezing.
				 */
				if (heap_freeze_tuple(tuple.t_data, FreezeLimit,
									  InvalidBuffer))
					frozen[nfrozen++] = offnum;
			}
		}						/* scan along page */
		if (tempPage != NULL)
		{
			/* Some tuples are removable; figure free space after removal */
			PageRepairFragmentation(tempPage);
			vacpage->free = PageGetFreeSpaceWithFillFactor(onerel, tempPage);
			pfree(tempPage);
			do_reap = true;
		}
		else
		{
			/* Just use current available space */
			vacpage->free = PageGetFreeSpaceWithFillFactor(onerel, page);
			/* Need to reap the page if it has UNUSED or DEAD line pointers */
			do_reap = (vacpage->offsets_free > 0);
		}

		free_space += vacpage->free;

		/*
		 * Add the page to fraged_pages if it has a useful amount of free
		 * space.  "Useful" means enough for a minimal-sized tuple.  But we
		 * don't know that accurately near the start of the relation, so add
		 * pages unconditionally if they have >= BLCKSZ/10 free space.
		 */
		do_frag = (vacpage->free >= min_tlen || vacpage->free >= BLCKSZ / 10);
		if (do_reap || do_frag)
		{
			VacPage		vacpagecopy = copy_vac_page(vacpage);

			if (do_reap)
				vpage_insert(vacuum_pages, vacpagecopy);
			if (do_frag)
				vpage_insert(fraged_pages, vacpagecopy);
		}

		/*
		 * Include the page in empty_end_pages if it will be empty after
		 * vacuuming; this is to keep us from using it as a move destination.
		 */
		if (notup)
		{
			empty_pages++;
			empty_end_pages++;
		}
		else
			empty_end_pages = 0;

		/*
		 * If we froze any tuples, mark the buffer dirty, and write a WAL
		 * record recording the changes.  We must log the changes to be
		 * crash-safe against future truncation of CLOG.
		 */
		if (nfrozen > 0)
		{
			MarkBufferDirty(buf);
			/* no XLOG for temp tables, though */
			if (!onerel->rd_istemp)
			{
				XLogRecPtr	recptr;

				recptr = log_heap_freeze(onerel, buf, FreezeLimit,
										 frozen, nfrozen);
				PageSetLSN(page, recptr);
				PageSetTLI(page, ThisTimeLineID);
			}
		}

		UnlockReleaseBuffer(buf);
	}
	/* save stats in the rel list for use later */
	vacrelstats->rel_tuples = num_tuples;
	vacrelstats->rel_indexed_tuples = num_indexed_tuples;
	vacrelstats->rel_pages = nblocks;
	if (num_tuples == 0)
		min_tlen = max_tlen = 0;
	vacrelstats->min_tlen = min_tlen;
	vacrelstats->max_tlen = max_tlen;

	vacuum_pages->empty_end_pages = empty_end_pages;
	fraged_pages->empty_end_pages = empty_end_pages;

	/*
	 * Clear the fraged_pages list if we found we couldn't shrink.  Else,
	 * remove any "empty" end-pages from the list, and compute usable free
	 * space = free space in remaining pages.
	 */
	if (do_shrinking)
	{
		int			i;

		Assert((BlockNumber) fraged_pages->num_pages >= empty_end_pages);
		fraged_pages->num_pages -= empty_end_pages;
		usable_free_space = 0;
		for (i = 0; i < fraged_pages->num_pages; i++)
			usable_free_space += fraged_pages->pagedesc[i]->free;
	}
	else
	{
		fraged_pages->num_pages = 0;
		usable_free_space = 0;
	}

	/* don't bother to save vtlinks if we will not call repair_frag */
	if (fraged_pages->num_pages > 0 && num_vtlinks > 0)
	{
		qsort((char *) vtlinks, num_vtlinks, sizeof(VTupleLinkData),
			  vac_cmp_vtlinks);
		vacrelstats->vtlinks = vtlinks;
		vacrelstats->num_vtlinks = num_vtlinks;
	}
	else
	{
		vacrelstats->vtlinks = NULL;
		vacrelstats->num_vtlinks = 0;
		pfree(vtlinks);
	}
	ereport(elevel,
			(errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
					RelationGetRelationName(onerel),
					tups_vacuumed, num_tuples, nblocks),
			 errdetail("%.0f dead row versions cannot be removed yet.\n"
					   "Nonremovable row versions range from %lu to %lu bytes long.\n"
					   "There were %.0f unused item pointers.\n"
					   "Total free space (including removable row versions) is %.0f bytes.\n"
					   "%u pages are or will become empty, including %u at the end of the table.\n"
					   "%u pages containing %.0f free bytes are potential move destinations.\n"
					   "%s.",
					   nkeep,
					   (unsigned long) min_tlen, (unsigned long) max_tlen,
					   nunused,
					   free_space,
					   empty_pages, empty_end_pages,
					   fraged_pages->num_pages, usable_free_space,
					   pg_rusage_show(&ru0))));
}
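
/*
 * Illustrative output (added commentary; values invented): under
 * VACUUM FULL VERBOSE, the ereport above produces a message like
 *
 *		INFO:  "tenk1": found 3000 removable, 7000 nonremovable row
 *		versions in 345 pages
 *		DETAIL:  0 dead row versions cannot be removed yet. ...
 *
 * with the remaining errdetail lines filled in from nkeep, nunused,
 * free_space, and the fraged_pages statistics computed above.
 */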
/*
 *	repair_frag() -- try to repair relation's fragmentation
 *
 *		This routine marks dead tuples as unused and tries to re-use dead
 *		space by moving tuples (and inserting indexes if needed).  It
 *		constructs Nvacpagelist, a list of freed pages (from moved tuples),
 *		and cleans indexes for them after committing the current transaction
 *		(in hack-manner - without losing locks and freeing memory!).  It
 *		truncates the relation if some end-blocks have gone away.
 */
static void
repair_frag(VRelStats *vacrelstats, Relation onerel,
			VacPageList vacuum_pages, VacPageList fraged_pages,
			int nindexes, Relation *Irel)
{
	TransactionId myXID = GetCurrentTransactionId();
	Buffer		dst_buffer = InvalidBuffer;
	BlockNumber nblocks,
				blkno;
	BlockNumber last_move_dest_block = 0,
				last_vacuum_block;
	Page		dst_page = NULL;
	ExecContextData ec;
	VacPageListData Nvacpagelist;
	VacPage		dst_vacpage = NULL,
				last_vacuum_page,
				vacpage,
			   *curpage;
	int			i;
	int			num_moved = 0,
				num_fraged_pages,
				vacuumed_pages;
	int			keep_tuples = 0;
	int			keep_indexed_tuples = 0;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	ExecContext_Init(&ec, onerel);

	Nvacpagelist.num_pages = 0;
	num_fraged_pages = fraged_pages->num_pages;
	Assert((BlockNumber) vacuum_pages->num_pages >= vacuum_pages->empty_end_pages);
	vacuumed_pages = vacuum_pages->num_pages - vacuum_pages->empty_end_pages;
	if (vacuumed_pages > 0)
	{
		/* get last reaped page from vacuum_pages */
		last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1];
		last_vacuum_block = last_vacuum_page->blkno;
	}
	else
	{
		last_vacuum_page = NULL;
		last_vacuum_block = InvalidBlockNumber;
	}

	vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber));
	vacpage->offsets_used = vacpage->offsets_free = 0;
	/*
	 * Scan pages backwards from the last nonempty page, trying to move tuples
	 * down to lower pages.  Quit when we reach a page that we have moved any
	 * tuples onto, or the first page if we haven't moved anything, or when we
	 * find a page we cannot completely empty (this last condition is handled
	 * by "break" statements within the loop).
	 *
	 * NB: this code depends on the vacuum_pages and fraged_pages lists being
	 * in order by blkno.
	 */
	nblocks = vacrelstats->rel_pages;
	for (blkno = nblocks - vacuum_pages->empty_end_pages - 1;
		 blkno > last_move_dest_block;
		 blkno--)
	{
		Buffer		buf;
		Page		page;
		OffsetNumber offnum,
					maxoff;
		bool		isempty,
					chain_tuple_moved;

		vacuum_delay_point();
		/*
		 * Forget fraged_pages pages at or after this one; they're no longer
		 * useful as move targets, since we only want to move down.  Note that
		 * since we stop the outer loop at last_move_dest_block, pages removed
		 * here cannot have had anything moved onto them already.
		 *
		 * Also note that we don't change the stored fraged_pages list, only
		 * our local variable num_fraged_pages; so the forgotten pages are
		 * still available to be loaded into the free space map later.
		 */
		while (num_fraged_pages > 0 &&
			   fraged_pages->pagedesc[num_fraged_pages - 1]->blkno >= blkno)
		{
			Assert(fraged_pages->pagedesc[num_fraged_pages - 1]->offsets_used == 0);
			--num_fraged_pages;
		}

		/*
		 * Process this page of relation.
		 */
		buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy);
		page = BufferGetPage(buf);

		vacpage->offsets_free = 0;

		isempty = PageIsEmpty(page);

		/* Is the page in the vacuum_pages list? */
		if (blkno == last_vacuum_block)
		{
			if (last_vacuum_page->offsets_free > 0)
			{
				/* there are dead tuples on this page - clean them */
				Assert(!isempty);
				LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
				vacuum_page(onerel, buf, last_vacuum_page);
				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			}
			else
				Assert(isempty);
			--vacuumed_pages;
			if (vacuumed_pages > 0)
			{
				/* get prev reaped page from vacuum_pages */
				last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1];
				last_vacuum_block = last_vacuum_page->blkno;
			}
			else
			{
				last_vacuum_page = NULL;
				last_vacuum_block = InvalidBlockNumber;
			}
		}

		if (isempty)
		{
			ReleaseBuffer(buf);
			continue;
		}
		chain_tuple_moved = false;		/* no one chain-tuple was moved off
										 * this page, yet */
		vacpage->blkno = blkno;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			Size		tuple_len;
			HeapTupleData tuple;
			ItemId		itemid = PageGetItemId(page, offnum);

			if (!ItemIdIsUsed(itemid))
				continue;

			if (ItemIdIsDead(itemid))
			{
				/* just remember it for vacuum_page() */
				vacpage->offsets[vacpage->offsets_free++] = offnum;
				continue;
			}

			/* Shouldn't have any redirected items now */
			Assert(ItemIdIsNormal(itemid));

			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple_len = tuple.t_len = ItemIdGetLength(itemid);
			ItemPointerSet(&(tuple.t_self), blkno, offnum);
			/*
			 * VACUUM FULL has an exclusive lock on the relation.  So
			 * normally no other transaction can have pending INSERTs or
			 * DELETEs in this relation.  A tuple is either:
			 *		(a) live (XMIN_COMMITTED)
			 *		(b) known dead (XMIN_INVALID, or XMAX_COMMITTED and xmax
			 *			is visible to all active transactions)
			 *		(c) inserted and deleted (XMIN_COMMITTED+XMAX_COMMITTED)
			 *			but at least one active transaction does not see the
			 *			deleting transaction (ie, it's RECENTLY_DEAD)
			 *		(d) moved by the currently running VACUUM
			 *		(e) inserted or deleted by a not yet committed transaction,
			 *			or by a transaction we couldn't set XMIN_COMMITTED for.
			 * In case (e) we wouldn't be in repair_frag() at all, because
			 * scan_heap() detects those cases and shuts off shrinking.
			 * We can't see case (b) here either, because such tuples were
			 * already removed by vacuum_page().  Cases (a) and (c) are
			 * normal and will have XMIN_COMMITTED set.  Case (d) is only
			 * possible if a whole tuple chain has been moved while
			 * processing this or a higher numbered block.
			 */
			if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
			{
				if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
					elog(ERROR, "HEAP_MOVED_IN was not expected");
				if (!(tuple.t_data->t_infomask & HEAP_MOVED_OFF))
					elog(ERROR, "HEAP_MOVED_OFF was expected");

				/*
				 * MOVED_OFF by another VACUUM would have caused the
				 * visibility check to set XMIN_COMMITTED or XMIN_INVALID.
				 */
				if (HeapTupleHeaderGetXvac(tuple.t_data) != myXID)
					elog(ERROR, "invalid XVAC in tuple header");
				/*
				 * If this (chain) tuple is moved by me already then I have to
				 * check is it in vacpage or not - i.e. is it moved while
				 * cleaning this page or some previous one.
				 */

				/* Can't we Assert(keep_tuples > 0) here? */
				if (keep_tuples == 0)
					continue;
				if (chain_tuple_moved)
				{
					/* some chains were moved while cleaning this page */
					Assert(vacpage->offsets_free > 0);
					for (i = 0; i < vacpage->offsets_free; i++)
					{
						if (vacpage->offsets[i] == offnum)
							break;
					}
					if (i >= vacpage->offsets_free)		/* not found */
					{
						vacpage->offsets[vacpage->offsets_free++] = offnum;

						/*
						 * If this is not a heap-only tuple, there must be an
						 * index entry for this item which will be removed in
						 * the index cleanup.  Decrement the keep_indexed_tuples
						 * count to remember this.
						 */
						if (!HeapTupleHeaderIsHeapOnly(tuple.t_data))
							keep_indexed_tuples--;
						keep_tuples--;
					}
				}
				else
				{
					vacpage->offsets[vacpage->offsets_free++] = offnum;

					/*
					 * If this is not a heap-only tuple, there must be an
					 * index entry for this item which will be removed in
					 * the index cleanup.  Decrement the keep_indexed_tuples
					 * count to remember this.
					 */
					if (!HeapTupleHeaderIsHeapOnly(tuple.t_data))
						keep_indexed_tuples--;
					keep_tuples--;
				}

				continue;
			}
2027 * If this tuple is in a chain of tuples created in updates by
2028 * "recent" transactions then we have to move the whole chain of
2029 * tuples to other places, so that we can write new t_ctid links
2030 * that preserve the chain relationship.
2032 * This test is complicated. Read it as "if tuple is a recently
2033 * created updated version, OR if it is an obsoleted version". (In
2034 * the second half of the test, we needn't make any check on XMAX
2035 * --- it must be recently obsoleted, else scan_heap would have
2036 * deemed it removable.)
2038 * NOTE: this test is not 100% accurate: it is possible for a
2039 * tuple to be an updated one with recent xmin, and yet not match
2040 * any new_tid entry in the vtlinks list. Presumably there was
2041 * once a parent tuple with xmax matching the xmin, but it's
2042 * possible that that tuple has been removed --- for example, if
2043 * it had xmin = xmax and wasn't itself an updated version, then
2044 * HeapTupleSatisfiesVacuum would deem it removable as soon as the
2045 * xmin xact completes.
2047 * To be on the safe side, we abandon the repair_frag process if
2048 * we cannot find the parent tuple in vtlinks. This may be overly
2049 * conservative; AFAICS it would be safe to move the chain.
2051 * Also, because we distinguish DEAD and RECENTLY_DEAD tuples
2052 * using OldestXmin, which is a rather coarse test, it is quite
2053 * possible to have an update chain in which a tuple we think is
2054 * RECENTLY_DEAD links forward to one that is definitely DEAD.
2055 * In such a case the RECENTLY_DEAD tuple must actually be dead,
2056 * but it seems too complicated to try to make VACUUM remove it.
2057 * We treat each contiguous set of RECENTLY_DEAD tuples as a
2058 * separately movable chain, ignoring any intervening DEAD ones.
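/*
 * Editorial restatement (sketch, not original text) of the test below:
 * move the chain if (HEAP_UPDATED is set and xmin is not older than
 * OldestXmin) OR (xmax is a real deleter -- not invalid, not merely a
 * row lock -- and t_ctid != t_self).  The first arm catches recently
 * created updated versions; the second catches obsoleted versions that
 * still carry a live forward link.
 */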
2060 if (((tuple.t_data->t_infomask & HEAP_UPDATED) &&
2061 !TransactionIdPrecedes(HeapTupleHeaderGetXmin(tuple.t_data),
2063 (!(tuple.t_data->t_infomask & (HEAP_XMAX_INVALID |
2065 !(ItemPointerEquals(&(tuple.t_self),
2066 &(tuple.t_data->t_ctid)))))
2069 bool freeCbuf = false;
2070 bool chain_move_failed = false;
2071 bool moved_target = false;
2072 ItemPointerData Ctid;
2073 HeapTupleData tp = tuple;
2074 Size tlen = tuple_len;
2078 VacPage to_vacpage = NULL;
2082 if (dst_buffer != InvalidBuffer)
2084 ReleaseBuffer(dst_buffer);
2085 dst_buffer = InvalidBuffer;
2088 /* Quick exit if we have no vtlinks to search in */
2089 if (vacrelstats->vtlinks == NULL)
2091 elog(DEBUG2, "parent item in update-chain not found --- cannot continue repair_frag");
2092 break; /* out of walk-along-page loop */
2096 * If this tuple is at the beginning or middle of the chain, we
2097 * have to move to the end of the chain. As with any t_ctid
2098 * chase, we have to verify that each new tuple is really the
2099 * descendant of the tuple we came from; however, here we
2100 * need even more than the normal amount of paranoia.
2101 * If t_ctid links forward to a tuple determined to be DEAD,
2102 * then depending on where that tuple is, it might already
2103 * have been removed, and perhaps even replaced by a MOVED_IN
2104 * tuple. We don't want to include any DEAD tuples in the
2105 * chain, so we have to recheck HeapTupleSatisfiesVacuum.
2107 while (!(tp.t_data->t_infomask & (HEAP_XMAX_INVALID |
2109 !(ItemPointerEquals(&(tp.t_self),
2110 &(tp.t_data->t_ctid))))
2112 ItemPointerData nextTid;
2113 TransactionId priorXmax;
2116 OffsetNumber nextOffnum;
2118 HeapTupleHeader nextTdata;
2119 HTSV_Result nextTstatus;
2121 nextTid = tp.t_data->t_ctid;
2122 priorXmax = HeapTupleHeaderGetXmax(tp.t_data);
2123 /* assume block# is OK (see heap_fetch comments) */
2124 nextBuf = ReadBufferWithStrategy(onerel,
2125 ItemPointerGetBlockNumber(&nextTid),
2127 nextPage = BufferGetPage(nextBuf);
2128 /* If bogus or unused slot, assume tp is end of chain */
2129 nextOffnum = ItemPointerGetOffsetNumber(&nextTid);
2130 if (nextOffnum < FirstOffsetNumber ||
2131 nextOffnum > PageGetMaxOffsetNumber(nextPage))
2133 ReleaseBuffer(nextBuf);
2136 nextItemid = PageGetItemId(nextPage, nextOffnum);
2137 if (!ItemIdIsNormal(nextItemid))
2139 ReleaseBuffer(nextBuf);
2142 /* if not matching XMIN, assume tp is end of chain */
2143 nextTdata = (HeapTupleHeader) PageGetItem(nextPage,
2145 if (!TransactionIdEquals(HeapTupleHeaderGetXmin(nextTdata),
2148 ReleaseBuffer(nextBuf);
2152 * Must check for DEAD or MOVED_IN tuple, too. This
2153 * could potentially update hint bits, so we'd better
2154 * hold the buffer content lock.
2156 LockBuffer(nextBuf, BUFFER_LOCK_SHARE);
2157 nextTstatus = HeapTupleSatisfiesVacuum(nextTdata,
2160 if (nextTstatus == HEAPTUPLE_DEAD ||
2161 nextTstatus == HEAPTUPLE_INSERT_IN_PROGRESS)
2163 UnlockReleaseBuffer(nextBuf);
2166 LockBuffer(nextBuf, BUFFER_LOCK_UNLOCK);
2167 /* if it's MOVED_OFF we should have moved this one with it */
2168 if (nextTstatus == HEAPTUPLE_DELETE_IN_PROGRESS)
2169 elog(ERROR, "updated tuple is already HEAP_MOVED_OFF");
2170 /* OK, switch our attention to the next tuple in chain */
2171 tp.t_data = nextTdata;
2172 tp.t_self = nextTid;
2173 tlen = tp.t_len = ItemIdGetLength(nextItemid);
2175 ReleaseBuffer(Cbuf);
2180 /* Set up workspace for planning the chain move */
2181 vtmove = (VTupleMove) palloc(100 * sizeof(VTupleMoveData));
2186 * Now, walk backwards up the chain (towards older tuples) and
2187 * check if all items in chain can be moved. We record all
2188 * the moves that need to be made in the vtmove array.
2195 HeapTupleHeader PTdata;
2196 VTupleLinkData vtld,
2199 /* Identify a target page to move this tuple to */
2200 if (to_vacpage == NULL ||
2201 !enough_space(to_vacpage, tlen))
2203 for (i = 0; i < num_fraged_pages; i++)
2205 if (enough_space(fraged_pages->pagedesc[i], tlen))
2209 if (i == num_fraged_pages)
2211 /* can't move item anywhere */
2212 chain_move_failed = true;
2213 break; /* out of check-all-items loop */
2216 to_vacpage = fraged_pages->pagedesc[to_item];
2218 to_vacpage->free -= MAXALIGN(tlen);
2219 if (to_vacpage->offsets_used >= to_vacpage->offsets_free)
2220 to_vacpage->free -= sizeof(ItemIdData);
2221 (to_vacpage->offsets_used)++;
2223 /* Add an entry to vtmove list */
2224 if (free_vtmove == 0)
2227 vtmove = (VTupleMove)
2229 (free_vtmove + num_vtmove) *
2230 sizeof(VTupleMoveData));
2232 vtmove[num_vtmove].tid = tp.t_self;
2233 vtmove[num_vtmove].vacpage = to_vacpage;
2234 if (to_vacpage->offsets_used == 1)
2235 vtmove[num_vtmove].cleanVpd = true;
2237 vtmove[num_vtmove].cleanVpd = false;
2241 /* Remember if we reached the original target tuple */
2242 if (ItemPointerGetBlockNumber(&tp.t_self) == blkno &&
2243 ItemPointerGetOffsetNumber(&tp.t_self) == offnum)
2244 moved_target = true;
2246 /* Done if at beginning of chain */
2247 if (!(tp.t_data->t_infomask & HEAP_UPDATED) ||
2248 TransactionIdPrecedes(HeapTupleHeaderGetXmin(tp.t_data),
2250 break; /* out of check-all-items loop */
2252 /* Move to tuple with prior row version */
2253 vtld.new_tid = tp.t_self;
2255 vac_bsearch((void *) &vtld,
2256 (void *) (vacrelstats->vtlinks),
2257 vacrelstats->num_vtlinks,
2258 sizeof(VTupleLinkData),
2262 /* see discussion above */
2263 elog(DEBUG2, "parent item in update-chain not found --- cannot continue repair_frag");
2264 chain_move_failed = true;
2265 break; /* out of check-all-items loop */
2267 tp.t_self = vtlp->this_tid;
2268 Pbuf = ReadBufferWithStrategy(onerel,
2269 ItemPointerGetBlockNumber(&(tp.t_self)),
2271 Ppage = BufferGetPage(Pbuf);
2272 Pitemid = PageGetItemId(Ppage,
2273 ItemPointerGetOffsetNumber(&(tp.t_self)));
2274 /* this can't happen since we saw tuple earlier: */
2275 if (!ItemIdIsNormal(Pitemid))
2276 elog(ERROR, "parent itemid marked as unused");
2277 PTdata = (HeapTupleHeader) PageGetItem(Ppage, Pitemid);
2279 /* ctid should not have changed since we saved it */
2280 Assert(ItemPointerEquals(&(vtld.new_tid),
2281 &(PTdata->t_ctid)));
2284 * See the discussion above of the cases where
2285 * !ItemIdIsUsed(nextItemid) (the child item has been removed).
2286 * Because we currently don't remove the dead part of an
2287 * update chain, it's possible to find a non-matching parent
2288 * row here. As in the case that originally exposed this
2289 * problem, we stop shrinking here. We could try to find the
2290 * real parent row, but that's not worth doing, since a real
2291 * solution will be implemented later anyway, and we are too
2292 * close to the 6.5 release. - vadim 06/11/99
2294 if ((PTdata->t_infomask & HEAP_XMAX_IS_MULTI) ||
2295 !(TransactionIdEquals(HeapTupleHeaderGetXmax(PTdata),
2296 HeapTupleHeaderGetXmin(tp.t_data))))
2298 ReleaseBuffer(Pbuf);
2299 elog(DEBUG2, "too old parent tuple found --- cannot continue repair_frag");
2300 chain_move_failed = true;
2301 break; /* out of check-all-items loop */
2304 tlen = tp.t_len = ItemIdGetLength(Pitemid);
2306 ReleaseBuffer(Cbuf);
2309 } /* end of check-all-items loop */
2312 ReleaseBuffer(Cbuf);
2315 /* Double-check that we will move the current target tuple */
2316 if (!moved_target && !chain_move_failed)
2318 elog(DEBUG2, "failed to chain back to target --- cannot continue repair_frag");
2319 chain_move_failed = true;
2322 if (chain_move_failed)
2325 * Undo changes to offsets_used state. We don't bother
2326 * cleaning up the amount-free state, since we're not
2327 * going to do any further tuple motion.
2329 for (i = 0; i < num_vtmove; i++)
2331 Assert(vtmove[i].vacpage->offsets_used > 0);
2332 (vtmove[i].vacpage->offsets_used)--;
2335 break; /* out of walk-along-page loop */
2339 * Okay, move the whole tuple chain in reverse order.
2341 * Ctid tracks the new location of the previously-moved tuple.
2343 ItemPointerSetInvalid(&Ctid);
2344 for (ti = 0; ti < num_vtmove; ti++)
2346 VacPage destvacpage = vtmove[ti].vacpage;
2350 /* Get page to move from */
2351 tuple.t_self = vtmove[ti].tid;
2352 Cbuf = ReadBufferWithStrategy(onerel,
2353 ItemPointerGetBlockNumber(&(tuple.t_self)),
2356 /* Get page to move to */
2357 dst_buffer = ReadBufferWithStrategy(onerel,
2361 LockBuffer(dst_buffer, BUFFER_LOCK_EXCLUSIVE);
2362 if (dst_buffer != Cbuf)
2363 LockBuffer(Cbuf, BUFFER_LOCK_EXCLUSIVE);
2365 dst_page = BufferGetPage(dst_buffer);
2366 Cpage = BufferGetPage(Cbuf);
2368 Citemid = PageGetItemId(Cpage,
2369 ItemPointerGetOffsetNumber(&(tuple.t_self)));
2370 tuple.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
2371 tuple_len = tuple.t_len = ItemIdGetLength(Citemid);
2373 move_chain_tuple(onerel, Cbuf, Cpage, &tuple,
2374 dst_buffer, dst_page, destvacpage,
2375 &ec, &Ctid, vtmove[ti].cleanVpd);
2378 * If the tuple we are moving is a heap-only tuple,
2379 * this move will generate an additional index entry,
2380 * so increment the rel_indexed_tuples count.
2382 if (HeapTupleHeaderIsHeapOnly(tuple.t_data))
2383 vacrelstats->rel_indexed_tuples++;
2386 if (destvacpage->blkno > last_move_dest_block)
2387 last_move_dest_block = destvacpage->blkno;
2390 * Remember that we moved a tuple from the current page
2391 * (the corresponding index tuple will be cleaned).
2394 vacpage->offsets[vacpage->offsets_free++] =
2395 ItemPointerGetOffsetNumber(&(tuple.t_self));
2399 * When we move tuple chains, we may need to move
2400 * tuples from a block that we haven't yet scanned in
2401 * the outer walk-along-the-relation loop. Note that we
2402 * can't be moving a tuple from a block that we have
2403 * already scanned because if such a tuple exists, then
2404 * we must have moved the chain along with that tuple
2405 * when we scanned that block. IOW the test of
2406 * (Cbuf != buf) guarantees that the tuple we are
2407 * looking at right now is in a block which is yet to be scanned.
2410 * We maintain two counters to correctly count the
2411 * moved-off tuples from blocks that are not yet
2412 * scanned (keep_tuples) and how many of them have
2413 * index pointers (keep_indexed_tuples). The main
2414 * reason to track the latter is to help verify
2415 * that indexes have the expected number of entries
2416 * when all the dust settles.
2418 if (!HeapTupleHeaderIsHeapOnly(tuple.t_data))
2419 keep_indexed_tuples++;
2423 ReleaseBuffer(dst_buffer);
2424 ReleaseBuffer(Cbuf);
2425 } /* end of move-the-tuple-chain loop */
2427 dst_buffer = InvalidBuffer;
2429 chain_tuple_moved = true;
2431 /* advance to next tuple in walk-along-page loop */
2433 } /* end of is-tuple-in-chain test */
2435 /* try to find new page for this tuple */
2436 if (dst_buffer == InvalidBuffer ||
2437 !enough_space(dst_vacpage, tuple_len))
2439 if (dst_buffer != InvalidBuffer)
2441 ReleaseBuffer(dst_buffer);
2442 dst_buffer = InvalidBuffer;
2444 for (i = 0; i < num_fraged_pages; i++)
2446 if (enough_space(fraged_pages->pagedesc[i], tuple_len))
2449 if (i == num_fraged_pages)
2450 break; /* can't move item anywhere */
2451 dst_vacpage = fraged_pages->pagedesc[i];
2452 dst_buffer = ReadBufferWithStrategy(onerel,
2455 LockBuffer(dst_buffer, BUFFER_LOCK_EXCLUSIVE);
2456 dst_page = BufferGetPage(dst_buffer);
2457 /* if this page was not used before - clean it */
2458 if (!PageIsEmpty(dst_page) && dst_vacpage->offsets_used == 0)
2459 vacuum_page(onerel, dst_buffer, dst_vacpage);
2462 LockBuffer(dst_buffer, BUFFER_LOCK_EXCLUSIVE);
2464 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2466 move_plain_tuple(onerel, buf, page, &tuple,
2467 dst_buffer, dst_page, dst_vacpage, &ec);
2470 * If the tuple we are moving is a heap-only tuple,
2471 * this move will generate an additional index entry,
2472 * so increment the rel_indexed_tuples count.
2474 if (HeapTupleHeaderIsHeapOnly(tuple.t_data))
2475 vacrelstats->rel_indexed_tuples++;
2478 if (dst_vacpage->blkno > last_move_dest_block)
2479 last_move_dest_block = dst_vacpage->blkno;
2482 * Remember that we moved a tuple from the current page
2483 * (the corresponding index tuple will be cleaned).
2485 vacpage->offsets[vacpage->offsets_free++] = offnum;
2486 } /* walk along page */
2489 * If we broke out of the walk-along-page loop early (ie, still have
2490 * offnum <= maxoff), then we failed to move some tuple off this page.
2491 * No point in shrinking any more, so clean up and exit the per-page loop.
2494 if (offnum < maxoff && keep_tuples > 0)
2499 * Fix vacpage state for any unvisited tuples remaining on page
2501 for (off = OffsetNumberNext(offnum);
2503 off = OffsetNumberNext(off))
2505 ItemId itemid = PageGetItemId(page, off);
2506 HeapTupleHeader htup;
2508 if (!ItemIdIsUsed(itemid))
2510 /* Shouldn't be any DEAD or REDIRECT items anymore */
2511 Assert(ItemIdIsNormal(itemid));
2513 htup = (HeapTupleHeader) PageGetItem(page, itemid);
2514 if (htup->t_infomask & HEAP_XMIN_COMMITTED)
2518 * See comments in the walk-along-page loop above about why
2519 * only MOVED_OFF tuples should be found here.
2521 if (htup->t_infomask & HEAP_MOVED_IN)
2522 elog(ERROR, "HEAP_MOVED_IN was not expected");
2523 if (!(htup->t_infomask & HEAP_MOVED_OFF))
2524 elog(ERROR, "HEAP_MOVED_OFF was expected");
2525 if (HeapTupleHeaderGetXvac(htup) != myXID)
2526 elog(ERROR, "invalid XVAC in tuple header");
2528 if (chain_tuple_moved)
2530 /* some chains were moved while cleaning this page */
2531 Assert(vacpage->offsets_free > 0);
2532 for (i = 0; i < vacpage->offsets_free; i++)
2534 if (vacpage->offsets[i] == off)
2537 if (i >= vacpage->offsets_free) /* not found */
2539 vacpage->offsets[vacpage->offsets_free++] = off;
2540 Assert(keep_tuples > 0);
2542 * If this is not a heap-only tuple, there must be an
2543 * index entry for this item which will be removed in
2544 * the index cleanup. Decrement the keep_indexed_tuples
2545 * count to remember this.
2547 if (!HeapTupleHeaderIsHeapOnly(htup))
2548 keep_indexed_tuples--;
2554 vacpage->offsets[vacpage->offsets_free++] = off;
2555 Assert(keep_tuples > 0);
2556 if (!HeapTupleHeaderIsHeapOnly(htup))
2557 keep_indexed_tuples--;
2563 if (vacpage->offsets_free > 0) /* some tuples were moved */
2565 if (chain_tuple_moved) /* else - they are ordered */
2567 qsort((char *) (vacpage->offsets), vacpage->offsets_free,
2568 sizeof(OffsetNumber), vac_cmp_offno);
2570 vpage_insert(&Nvacpagelist, copy_vac_page(vacpage));
2575 if (offnum <= maxoff)
2576 break; /* had to quit early, see above note */
2578 } /* walk along relation */
2580 blkno++; /* new number of blocks */
2582 if (dst_buffer != InvalidBuffer)
2584 Assert(num_moved > 0);
2585 ReleaseBuffer(dst_buffer);
2591 * We have to commit our tuple movings before we truncate the
2592 * relation. Ideally we should do Commit/StartTransactionCommand
2593 * here, relying on the session-level table lock to protect our
2594 * exclusive access to the relation. However, that would require a
2595 * lot of extra code to close and re-open the relation, indexes, etc.
2596 * For now, a quick hack: record status of current transaction as
2597 * committed, and continue. We force the commit to be synchronous
2598 * so that it's down to disk before we truncate. (Note: tqual.c
2599 * knows that VACUUM FULL always uses sync commit, too.) The
2600 * transaction continues to be shown as running in the ProcArray.
2602 * XXX This desperately needs to be revisited. Any failure after
2603 * this point will result in a PANIC "cannot abort transaction nnn,
2604 * it was already committed"!
2607 (void) RecordTransactionCommit();
2611 * We are not going to move any more tuples across pages, but we still
2612 * need to apply vacuum_page to compact free space in the remaining pages
2613 * in vacuum_pages list. Note that some of these pages may also be in the
2614 * fraged_pages list, and may have had tuples moved onto them; if so, we
2615 * already did vacuum_page and needn't do it again.
2617 for (i = 0, curpage = vacuum_pages->pagedesc;
2621 vacuum_delay_point();
2623 Assert((*curpage)->blkno < blkno);
2624 if ((*curpage)->offsets_used == 0)
2629 /* this page was not used as a move target, so must clean it */
2630 buf = ReadBufferWithStrategy(onerel,
2633 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2634 page = BufferGetPage(buf);
2635 if (!PageIsEmpty(page))
2636 vacuum_page(onerel, buf, *curpage);
2637 UnlockReleaseBuffer(buf);
2642 * Now scan all the pages that we moved tuples onto and update tuple
2643 * status bits. This is not really necessary, but will save time for
2644 * future transactions examining these tuples.
2646 update_hint_bits(onerel, fraged_pages, num_fraged_pages,
2647 last_move_dest_block, num_moved);
2650 * It'd be cleaner to make this report at the bottom of this routine, but
2651 * then the rusage would double-count the second pass of index vacuuming.
2652 * So do it here and ignore the relatively small amount of processing that occurs below.
2656 (errmsg("\"%s\": moved %u row versions, truncated %u to %u pages",
2657 RelationGetRelationName(onerel),
2658 num_moved, nblocks, blkno),
2660 pg_rusage_show(&ru0))));
2663 * Reflect the motion of system tuples to catalog cache here.
2665 CommandCounterIncrement();
2667 if (Nvacpagelist.num_pages > 0)
2669 /* vacuum indexes again if needed */
2676 /* re-sort Nvacpagelist.pagedesc */
2677 for (vpleft = Nvacpagelist.pagedesc,
2678 vpright = Nvacpagelist.pagedesc + Nvacpagelist.num_pages - 1;
2679 vpleft < vpright; vpleft++, vpright--)
2687 * keep_tuples is the number of tuples that have been moved off a
2688 * page during chain moves but not been scanned over subsequently.
2689 * The tuple ids of these tuples are not recorded as free offsets
2690 * for any VacPage, so they will not be cleared from the indexes.
2691 * keep_indexed_tuples is the portion of these that are expected
2692 * to have index entries.
2694 Assert(keep_tuples >= 0);
2695 for (i = 0; i < nindexes; i++)
2696 vacuum_index(&Nvacpagelist, Irel[i],
2697 vacrelstats->rel_indexed_tuples,
2698 keep_indexed_tuples);
2702 * Clean moved-off tuples from last page in Nvacpagelist list.
2704 * We need only do this in this one page, because higher-numbered
2705 * pages are going to be truncated from the relation entirely. But see
2706 * comments for update_hint_bits().
2708 if (vacpage->blkno == (blkno - 1) &&
2709 vacpage->offsets_free > 0)
2713 OffsetNumber unused[MaxOffsetNumber];
2714 OffsetNumber offnum,
2719 buf = ReadBufferWithStrategy(onerel, vacpage->blkno, vac_strategy);
2720 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2721 page = BufferGetPage(buf);
2722 maxoff = PageGetMaxOffsetNumber(page);
2723 for (offnum = FirstOffsetNumber;
2725 offnum = OffsetNumberNext(offnum))
2727 ItemId itemid = PageGetItemId(page, offnum);
2728 HeapTupleHeader htup;
2730 if (!ItemIdIsUsed(itemid))
2732 /* Shouldn't be any DEAD or REDIRECT items anymore */
2733 Assert(ItemIdIsNormal(itemid));
2735 htup = (HeapTupleHeader) PageGetItem(page, itemid);
2736 if (htup->t_infomask & HEAP_XMIN_COMMITTED)
2740 * See comments in the walk-along-page loop above about why
2741 * only MOVED_OFF tuples should be found here.
2743 if (htup->t_infomask & HEAP_MOVED_IN)
2744 elog(ERROR, "HEAP_MOVED_IN was not expected");
2745 if (!(htup->t_infomask & HEAP_MOVED_OFF))
2746 elog(ERROR, "HEAP_MOVED_OFF was expected");
2747 if (HeapTupleHeaderGetXvac(htup) != myXID)
2748 elog(ERROR, "invalid XVAC in tuple header");
2750 ItemIdSetUnused(itemid);
2753 unused[uncnt++] = offnum;
2755 Assert(vacpage->offsets_free == num_tuples);
2757 START_CRIT_SECTION();
2759 PageRepairFragmentation(page);
2761 MarkBufferDirty(buf);
2764 if (!onerel->rd_istemp)
2768 recptr = log_heap_clean(onerel, buf,
2772 PageSetLSN(page, recptr);
2773 PageSetTLI(page, ThisTimeLineID);
2778 UnlockReleaseBuffer(buf);
2781 /* now free the new list of reaped pages */
2782 curpage = Nvacpagelist.pagedesc;
2783 for (i = 0; i < Nvacpagelist.num_pages; i++, curpage++)
2785 pfree(Nvacpagelist.pagedesc);
2788 /* Truncate relation, if needed */
2789 if (blkno < nblocks)
2791 RelationTruncate(onerel, blkno);
2792 vacrelstats->rel_pages = blkno; /* set new number of blocks */
2797 if (vacrelstats->vtlinks != NULL)
2798 pfree(vacrelstats->vtlinks);
2800 ExecContext_Finish(&ec);
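/*
 * Editorial summary (sketch, not original text) of repair_frag's shape
 * as seen above:
 *   1. walk the relation back from the end, moving tuples (and whole
 *      update chains) onto pages in fraged_pages;
 *   2. force a synchronous commit of those moves with
 *      RecordTransactionCommit(), so the truncation below can never
 *      expose uncommitted moves;
 *   3. re-run vacuum_page() on vacuum_pages entries that were not used
 *      as move targets;
 *   4. update hint bits on the move destinations, vacuum the indexes
 *      again for Nvacpagelist, clean the last surviving page, and
 *      truncate the relation to "blkno" blocks.
 */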
2804 * move_chain_tuple() -- move one tuple that is part of a tuple chain
2806 * This routine moves old_tup from old_page to dst_page.
2807 * old_page and dst_page might be the same page.
2808 * On entry old_buf and dst_buf are locked exclusively, both locks (or
2809 * the single lock, if this is an intra-page move) are released before exit.
2812 * Yes, a routine with ten parameters is ugly, but it's still better
2813 * than having these 120 lines of code in repair_frag() which is
2814 * already too long and almost unreadable.
2817 move_chain_tuple(Relation rel,
2818 Buffer old_buf, Page old_page, HeapTuple old_tup,
2819 Buffer dst_buf, Page dst_page, VacPage dst_vacpage,
2820 ExecContext ec, ItemPointer ctid, bool cleanVpd)
2822 TransactionId myXID = GetCurrentTransactionId();
2823 HeapTupleData newtup;
2824 OffsetNumber newoff;
2826 Size tuple_len = old_tup->t_len;
2829 * make a modifiable copy of the source tuple.
2831 heap_copytuple_with_tuple(old_tup, &newtup);
2834 * register invalidation of source tuple in catcaches.
2836 CacheInvalidateHeapTuple(rel, old_tup);
2838 /* NO EREPORT(ERROR) TILL CHANGES ARE LOGGED */
2839 START_CRIT_SECTION();
2842 * mark the source tuple MOVED_OFF.
2844 old_tup->t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
2847 old_tup->t_data->t_infomask |= HEAP_MOVED_OFF;
2848 HeapTupleHeaderSetXvac(old_tup->t_data, myXID);
2851 * If this page was not used before - clean it.
2853 * NOTE: a nasty bug used to lurk here. It is possible for the source and
2854 * destination pages to be the same (since this tuple-chain member can be
2855 * on a page lower than the one we're currently processing in the outer
2856 * loop). If that's true, then after vacuum_page() the source tuple will
2857 * have been moved, and old_tup->t_data will be pointing at garbage.
2858 * Therefore we must do everything that uses old_tup->t_data BEFORE this step.
2861 * This path is different from the other callers of vacuum_page, because
2862 * we have already incremented the vacpage's offsets_used field to account
2863 * for the tuple(s) we expect to move onto the page. Therefore
2864 * vacuum_page's check for offsets_used == 0 is wrong. But since that's a
2865 * good debugging check for all other callers, we work around it here
2866 * rather than remove it.
2868 if (!PageIsEmpty(dst_page) && cleanVpd)
2870 int sv_offsets_used = dst_vacpage->offsets_used;
2872 dst_vacpage->offsets_used = 0;
2873 vacuum_page(rel, dst_buf, dst_vacpage);
2874 dst_vacpage->offsets_used = sv_offsets_used;
2878 * Update the state of the copied tuple, and store it on the destination
2879 * page. The copied tuple is never part of a HOT chain.
2881 newtup.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
2884 newtup.t_data->t_infomask |= HEAP_MOVED_IN;
2885 HeapTupleHeaderClearHotUpdated(newtup.t_data);
2886 HeapTupleHeaderClearHeapOnly(newtup.t_data);
2887 HeapTupleHeaderSetXvac(newtup.t_data, myXID);
2888 newoff = PageAddItem(dst_page, (Item) newtup.t_data, tuple_len,
2889 InvalidOffsetNumber, false, true);
2890 if (newoff == InvalidOffsetNumber)
2891 elog(PANIC, "failed to add item with len = %lu to page %u while moving tuple chain",
2892 (unsigned long) tuple_len, dst_vacpage->blkno);
2893 newitemid = PageGetItemId(dst_page, newoff);
2894 /* drop temporary copy, and point to the version on the dest page */
2895 pfree(newtup.t_data);
2896 newtup.t_data = (HeapTupleHeader) PageGetItem(dst_page, newitemid);
2898 ItemPointerSet(&(newtup.t_self), dst_vacpage->blkno, newoff);
2901 * Set new tuple's t_ctid pointing to itself if last tuple in chain, and
2902 * to next tuple in chain otherwise. (Since we move the chain in reverse
2903 * order, this is actually the previously processed tuple.)
2905 if (!ItemPointerIsValid(ctid))
2906 newtup.t_data->t_ctid = newtup.t_self;
2908 newtup.t_data->t_ctid = *ctid;
2909 *ctid = newtup.t_self;
2911 MarkBufferDirty(dst_buf);
2912 if (dst_buf != old_buf)
2913 MarkBufferDirty(old_buf);
2916 if (!rel->rd_istemp)
2918 XLogRecPtr recptr = log_heap_move(rel, old_buf, old_tup->t_self,
2921 if (old_buf != dst_buf)
2923 PageSetLSN(old_page, recptr);
2924 PageSetTLI(old_page, ThisTimeLineID);
2926 PageSetLSN(dst_page, recptr);
2927 PageSetTLI(dst_page, ThisTimeLineID);
2932 LockBuffer(dst_buf, BUFFER_LOCK_UNLOCK);
2933 if (dst_buf != old_buf)
2934 LockBuffer(old_buf, BUFFER_LOCK_UNLOCK);
2936 /* Create index entries for the moved tuple */
2937 if (ec->resultRelInfo->ri_NumIndices > 0)
2939 ExecStoreTuple(&newtup, ec->slot, InvalidBuffer, false);
2940 ExecInsertIndexTuples(ec->slot, &(newtup.t_self), ec->estate, true);
2941 ResetPerTupleExprContext(ec->estate);
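#ifdef NOT_USED
/*
 * Illustrative sketch (hypothetical code, not part of the original
 * file): how the newest-to-oldest move order produces correct forward
 * t_ctid links.  "ctid" always holds the new location of the
 * previously-moved (next-newer) tuple, so the chain end points to
 * itself and every older tuple points forward to its successor.
 */
static void
chain_ctid_sketch(void)
{
	ItemPointerData newloc[3];	/* pretend PageAddItem results */
	ItemPointerData link[3];	/* the t_ctid each copy receives */
	ItemPointerData ctid;
	int			ti;

	ItemPointerSet(&newloc[0], 7, 1);	/* newest tuple (chain end) */
	ItemPointerSet(&newloc[1], 7, 2);
	ItemPointerSet(&newloc[2], 8, 1);	/* oldest tuple */

	ItemPointerSetInvalid(&ctid);
	for (ti = 0; ti < 3; ti++)
	{
		/* chain end points to itself; others to the next-newer tuple */
		if (!ItemPointerIsValid(&ctid))
			link[ti] = newloc[ti];
		else
			link[ti] = ctid;
		ctid = newloc[ti];
	}
	/* link[0] == newloc[0], link[1] == newloc[0], link[2] == newloc[1] */
}
#endif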
2946 * move_plain_tuple() -- move one tuple that is not part of a chain
2948 * This routine moves old_tup from old_page to dst_page.
2949 * On entry old_buf and dst_buf are locked exclusively, both locks are
2950 * released before exit.
2952 * Yes, a routine with eight parameters is ugly, but it's still better
2953 * than having these 90 lines of code in repair_frag() which is already
2954 * too long and almost unreadable.
2957 move_plain_tuple(Relation rel,
2958 Buffer old_buf, Page old_page, HeapTuple old_tup,
2959 Buffer dst_buf, Page dst_page, VacPage dst_vacpage,
2962 TransactionId myXID = GetCurrentTransactionId();
2963 HeapTupleData newtup;
2964 OffsetNumber newoff;
2966 Size tuple_len = old_tup->t_len;
2969 heap_copytuple_with_tuple(old_tup, &newtup);
2972 * register invalidation of source tuple in catcaches.
2974 * (Note: we do not need to register the copied tuple, because we are not
2975 * changing the tuple contents and so there cannot be any need to flush
2976 * negative catcache entries.)
2978 CacheInvalidateHeapTuple(rel, old_tup);
2980 /* NO EREPORT(ERROR) TILL CHANGES ARE LOGGED */
2981 START_CRIT_SECTION();
2984 * Mark new tuple as MOVED_IN by me; also mark it not HOT.
2986 newtup.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
2989 newtup.t_data->t_infomask |= HEAP_MOVED_IN;
2990 HeapTupleHeaderClearHotUpdated(newtup.t_data);
2991 HeapTupleHeaderClearHeapOnly(newtup.t_data);
2992 HeapTupleHeaderSetXvac(newtup.t_data, myXID);
2994 /* add tuple to the page */
2995 newoff = PageAddItem(dst_page, (Item) newtup.t_data, tuple_len,
2996 InvalidOffsetNumber, false, true);
2997 if (newoff == InvalidOffsetNumber)
2998 elog(PANIC, "failed to add item with len = %lu to page %u (free space %lu, offsets_used %u, offsets_free %u)",
2999 (unsigned long) tuple_len,
3000 dst_vacpage->blkno, (unsigned long) dst_vacpage->free,
3001 dst_vacpage->offsets_used, dst_vacpage->offsets_free);
3002 newitemid = PageGetItemId(dst_page, newoff);
3003 pfree(newtup.t_data);
3004 newtup.t_data = (HeapTupleHeader) PageGetItem(dst_page, newitemid);
3005 ItemPointerSet(&(newtup.t_data->t_ctid), dst_vacpage->blkno, newoff);
3006 newtup.t_self = newtup.t_data->t_ctid;
3009 * Mark old tuple as MOVED_OFF by me.
3011 old_tup->t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
3014 old_tup->t_data->t_infomask |= HEAP_MOVED_OFF;
3015 HeapTupleHeaderSetXvac(old_tup->t_data, myXID);
3017 MarkBufferDirty(dst_buf);
3018 MarkBufferDirty(old_buf);
3021 if (!rel->rd_istemp)
3023 XLogRecPtr recptr = log_heap_move(rel, old_buf, old_tup->t_self,
3026 PageSetLSN(old_page, recptr);
3027 PageSetTLI(old_page, ThisTimeLineID);
3028 PageSetLSN(dst_page, recptr);
3029 PageSetTLI(dst_page, ThisTimeLineID);
3034 dst_vacpage->free = PageGetFreeSpaceWithFillFactor(rel, dst_page);
3035 LockBuffer(dst_buf, BUFFER_LOCK_UNLOCK);
3036 LockBuffer(old_buf, BUFFER_LOCK_UNLOCK);
3038 dst_vacpage->offsets_used++;
3040 /* insert index tuples if needed */
3041 if (ec->resultRelInfo->ri_NumIndices > 0)
3043 ExecStoreTuple(&newtup, ec->slot, InvalidBuffer, false);
3044 ExecInsertIndexTuples(ec->slot, &(newtup.t_self), ec->estate, true);
3045 ResetPerTupleExprContext(ec->estate);
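/*
 * Editorial summary (sketch, not original text): both movers follow the
 * same critical-section protocol visible above:
 *
 *   CacheInvalidateHeapTuple(rel, old_tup);
 *   START_CRIT_SECTION();
 *   ...mark source HEAP_MOVED_OFF + XVAC, copy HEAP_MOVED_IN + XVAC...
 *   ...PageAddItem() the copy, MarkBufferDirty() both buffers...
 *   ...log_heap_move() and PageSetLSN/PageSetTLI unless rd_istemp...
 *   END_CRIT_SECTION();
 *   ...unlock the buffers, then ExecInsertIndexTuples() for the new TID.
 */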
3050 * update_hint_bits() -- update hint bits in destination pages
3052 * Scan all the pages that we moved tuples onto and update tuple status bits.
3053 * This is not really necessary, but it will save time for future transactions
3054 * examining these tuples.
3056 * This pass guarantees that all HEAP_MOVED_IN tuples are marked as
3057 * XMIN_COMMITTED, so that future tqual tests won't need to check their XVAC.
3059 * BUT NOTICE that this code fails to clear HEAP_MOVED_OFF tuples from
3060 * pages that were move source pages but not move dest pages. The bulk
3061 * of the move source pages will be physically truncated from the relation,
3062 * and the last page remaining in the rel will be fixed separately in
3063 * repair_frag(), so the only cases where a MOVED_OFF tuple won't get its
3064 * hint bits updated are tuples that are moved as part of a chain and were
3065 * on pages that were neither move destinations nor at the end of the rel.
3066 * To completely ensure that no MOVED_OFF tuples remain unmarked, we'd have
3067 * to remember and revisit those pages too.
3069 * One wonders whether it wouldn't be better to skip this work entirely,
3070 * and let the tuple status updates happen someplace that's not holding an
3071 * exclusive lock on the relation.
3074 update_hint_bits(Relation rel, VacPageList fraged_pages, int num_fraged_pages,
3075 BlockNumber last_move_dest_block, int num_moved)
3077 TransactionId myXID = GetCurrentTransactionId();
3078 int checked_moved = 0;
3082 for (i = 0, curpage = fraged_pages->pagedesc;
3083 i < num_fraged_pages;
3088 OffsetNumber max_offset;
3092 vacuum_delay_point();
3094 if ((*curpage)->blkno > last_move_dest_block)
3095 break; /* no need to scan any further */
3096 if ((*curpage)->offsets_used == 0)
3097 continue; /* this page was never used as a move dest */
3098 buf = ReadBufferWithStrategy(rel, (*curpage)->blkno, vac_strategy);
3099 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
3100 page = BufferGetPage(buf);
3101 max_offset = PageGetMaxOffsetNumber(page);
3102 for (off = FirstOffsetNumber;
3104 off = OffsetNumberNext(off))
3106 ItemId itemid = PageGetItemId(page, off);
3107 HeapTupleHeader htup;
3109 if (!ItemIdIsUsed(itemid))
3111 /* Shouldn't be any DEAD or REDIRECT items anymore */
3112 Assert(ItemIdIsNormal(itemid));
3114 htup = (HeapTupleHeader) PageGetItem(page, itemid);
3115 if (htup->t_infomask & HEAP_XMIN_COMMITTED)
3119 * Here we may see either MOVED_OFF or MOVED_IN tuples.
3121 if (!(htup->t_infomask & HEAP_MOVED))
3122 elog(ERROR, "HEAP_MOVED_OFF/HEAP_MOVED_IN was expected");
3123 if (HeapTupleHeaderGetXvac(htup) != myXID)
3124 elog(ERROR, "invalid XVAC in tuple header");
3126 if (htup->t_infomask & HEAP_MOVED_IN)
3128 htup->t_infomask |= HEAP_XMIN_COMMITTED;
3129 htup->t_infomask &= ~HEAP_MOVED;
3133 htup->t_infomask |= HEAP_XMIN_INVALID;
3135 MarkBufferDirty(buf);
3136 UnlockReleaseBuffer(buf);
3137 Assert((*curpage)->offsets_used == num_tuples);
3138 checked_moved += num_tuples;
3140 Assert(num_moved == checked_moved);
3144 * vacuum_heap() -- free dead tuples
3146 * This routine marks dead tuples as unused and truncates the relation
3147 * if there are "empty" end-blocks.
3150 vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages)
3154 BlockNumber relblocks;
3158 nblocks = vacuum_pages->num_pages;
3159 nblocks -= vacuum_pages->empty_end_pages; /* empty end pages are handled by truncation below */
3161 for (i = 0, vacpage = vacuum_pages->pagedesc; i < nblocks; i++, vacpage++)
3163 vacuum_delay_point();
3165 if ((*vacpage)->offsets_free > 0)
3167 buf = ReadBufferWithStrategy(onerel,
3170 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
3171 vacuum_page(onerel, buf, *vacpage);
3172 UnlockReleaseBuffer(buf);
3176 /* Truncate relation if there are some empty end-pages */
3177 Assert(vacrelstats->rel_pages >= vacuum_pages->empty_end_pages);
3178 if (vacuum_pages->empty_end_pages > 0)
3180 relblocks = vacrelstats->rel_pages - vacuum_pages->empty_end_pages;
3182 (errmsg("\"%s\": truncated %u to %u pages",
3183 RelationGetRelationName(onerel),
3184 vacrelstats->rel_pages, relblocks)));
3185 RelationTruncate(onerel, relblocks);
3186 vacrelstats->rel_pages = relblocks; /* set new number of blocks */
3191 * vacuum_page() -- free dead tuples on a page
3192 * and repair its fragmentation.
3194 * Caller must hold pin and lock on buffer.
3197 vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage)
3199 Page page = BufferGetPage(buffer);
3202 /* There shouldn't be any tuples moved onto the page yet! */
3203 Assert(vacpage->offsets_used == 0);
3205 START_CRIT_SECTION();
3207 for (i = 0; i < vacpage->offsets_free; i++)
3209 ItemId itemid = PageGetItemId(page, vacpage->offsets[i]);
3211 ItemIdSetUnused(itemid);
3214 PageRepairFragmentation(page);
3216 MarkBufferDirty(buffer);
3219 if (!onerel->rd_istemp)
3223 recptr = log_heap_clean(onerel, buffer,
3225 vacpage->offsets, vacpage->offsets_free,
3227 PageSetLSN(page, recptr);
3228 PageSetTLI(page, ThisTimeLineID);
3235 * scan_index() -- scan one index relation to update pg_class statistics.
3237 * We use this when we have no deletions to do.
3240 scan_index(Relation indrel, double num_tuples)
3242 IndexBulkDeleteResult *stats;
3243 IndexVacuumInfo ivinfo;
3246 pg_rusage_init(&ru0);
3248 ivinfo.index = indrel;
3249 ivinfo.vacuum_full = true;
3250 ivinfo.message_level = elevel;
3251 ivinfo.num_heap_tuples = num_tuples;
3252 ivinfo.strategy = vac_strategy;
3254 stats = index_vacuum_cleanup(&ivinfo, NULL);
3259 /* now update statistics in pg_class */
3260 vac_update_relstats(RelationGetRelid(indrel),
3261 stats->num_pages, stats->num_index_tuples,
3262 false, InvalidTransactionId);
3265 (errmsg("index \"%s\" now contains %.0f row versions in %u pages",
3266 RelationGetRelationName(indrel),
3267 stats->num_index_tuples,
3269 errdetail("%u index pages have been deleted, %u are currently reusable.\n"
3271 stats->pages_deleted, stats->pages_free,
3272 pg_rusage_show(&ru0))));
3275 * Check for tuple count mismatch. If the index is partial, then it's OK
3276 * for it to have fewer tuples than the heap; otherwise we have trouble.
3278 if (stats->num_index_tuples != num_tuples)
3280 if (stats->num_index_tuples > num_tuples ||
3281 !vac_is_partial_index(indrel))
3283 (errmsg("index \"%s\" contains %.0f row versions, but table contains %.0f row versions",
3284 RelationGetRelationName(indrel),
3285 stats->num_index_tuples, num_tuples),
3286 errhint("Rebuild the index with REINDEX.")));
3293 * vacuum_index() -- vacuum one index relation.
3295 * vacpagelist is the VacPageList of the heap we're currently vacuuming.
3296 * It's locked. indrel is an index relation on the vacuumed heap.
3298 * We don't bother to set locks on the index relation here, since
3299 * the parent table is exclusive-locked already.
3301 * Finally, we arrange to update the index relation's statistics in pg_class.
3305 vacuum_index(VacPageList vacpagelist, Relation indrel,
3306 double num_tuples, int keep_tuples)
3308 IndexBulkDeleteResult *stats;
3309 IndexVacuumInfo ivinfo;
3312 pg_rusage_init(&ru0);
3314 ivinfo.index = indrel;
3315 ivinfo.vacuum_full = true;
3316 ivinfo.message_level = elevel;
3317 ivinfo.num_heap_tuples = num_tuples + keep_tuples;
3318 ivinfo.strategy = vac_strategy;
3320 /* Do bulk deletion */
3321 stats = index_bulk_delete(&ivinfo, NULL, tid_reaped, (void *) vacpagelist);
3323 /* Do post-VACUUM cleanup */
3324 stats = index_vacuum_cleanup(&ivinfo, stats);
3329 /* now update statistics in pg_class */
3330 vac_update_relstats(RelationGetRelid(indrel),
3331 stats->num_pages, stats->num_index_tuples,
3332 false, InvalidTransactionId);
3335 (errmsg("index \"%s\" now contains %.0f row versions in %u pages",
3336 RelationGetRelationName(indrel),
3337 stats->num_index_tuples,
3339 errdetail("%.0f index row versions were removed.\n"
3340 "%u index pages have been deleted, %u are currently reusable.\n"
3342 stats->tuples_removed,
3343 stats->pages_deleted, stats->pages_free,
3344 pg_rusage_show(&ru0))));
3347 * Check for tuple count mismatch. If the index is partial, then it's OK
3348 * for it to have fewer tuples than the heap; otherwise we have trouble.
3350 if (stats->num_index_tuples != num_tuples + keep_tuples)
3352 if (stats->num_index_tuples > num_tuples + keep_tuples ||
3353 !vac_is_partial_index(indrel))
3355 (errmsg("index \"%s\" contains %.0f row versions, but table contains %.0f row versions",
3356 RelationGetRelationName(indrel),
3357 stats->num_index_tuples, num_tuples + keep_tuples),
3358 errhint("Rebuild the index with REINDEX.")));
3365 * tid_reaped() -- is a particular tid reaped?
3367 * This has the right signature to be an IndexBulkDeleteCallback.
3369 * vacpagelist->pagedesc is sorted in the right order.
3372 tid_reaped(ItemPointer itemptr, void *state)
3374 VacPageList vacpagelist = (VacPageList) state;
3375 OffsetNumber ioffno;
3379 VacPageData vacpage;
3381 vacpage.blkno = ItemPointerGetBlockNumber(itemptr);
3382 ioffno = ItemPointerGetOffsetNumber(itemptr);
3385 vpp = (VacPage *) vac_bsearch((void *) &vp,
3386 (void *) (vacpagelist->pagedesc),
3387 vacpagelist->num_pages,
3394 /* ok - we are on a partially or fully reaped page */
3397 if (vp->offsets_free == 0)
3399 /* this is an empty page, so all tuples on it are reaped */
3403 voff = (OffsetNumber *) vac_bsearch((void *) &ioffno,
3404 (void *) (vp->offsets),
3406 sizeof(OffsetNumber),
3417 * Update the shared Free Space Map with the info we now have about
3418 * free space in the relation, discarding any old info the map may have.
3421 vac_update_fsm(Relation onerel, VacPageList fraged_pages,
3422 BlockNumber rel_pages)
3424 int nPages = fraged_pages->num_pages;
3425 VacPage *pagedesc = fraged_pages->pagedesc;
3427 PageFreeSpaceInfo *pageSpaces;
3432 * We only report pages with free space at least equal to the average
3433 * request size --- this avoids cluttering FSM with uselessly-small bits
3434 * of space. Although FSM would discard pages with little free space
3435 * anyway, it's important to do this prefiltering because (a) it reduces
3436 * the time spent holding the FSM lock in RecordRelationFreeSpace, and (b)
3437 * FSM uses the number of pages reported as a statistic for guiding space
3438 * management. If we didn't threshold our reports the same way
3439 * vacuumlazy.c does, we'd be skewing that statistic.
3441 threshold = GetAvgFSMRequestSize(&onerel->rd_node);
3443 pageSpaces = (PageFreeSpaceInfo *)
3444 palloc(nPages * sizeof(PageFreeSpaceInfo));
3447 for (i = 0; i < nPages; i++)
3450 * fraged_pages may contain entries for pages that we later decided to
3451 * truncate from the relation; don't enter them into the free space map.
3454 if (pagedesc[i]->blkno >= rel_pages)
3457 if (pagedesc[i]->free >= threshold)
3459 pageSpaces[outPages].blkno = pagedesc[i]->blkno;
3460 pageSpaces[outPages].avail = pagedesc[i]->free;
3465 RecordRelationFreeSpace(&onerel->rd_node, outPages, outPages, pageSpaces);
3470 /* Copy a VacPage structure */
3472 copy_vac_page(VacPage vacpage)
3476 /* allocate a VacPageData entry */
3477 newvacpage = (VacPage) palloc(sizeof(VacPageData) +
3478 vacpage->offsets_free * sizeof(OffsetNumber));
3481 if (vacpage->offsets_free > 0)
3482 memcpy(newvacpage->offsets, vacpage->offsets,
3483 vacpage->offsets_free * sizeof(OffsetNumber));
3484 newvacpage->blkno = vacpage->blkno;
3485 newvacpage->free = vacpage->free;
3486 newvacpage->offsets_used = vacpage->offsets_used;
3487 newvacpage->offsets_free = vacpage->offsets_free;
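/*
 * Editorial note (sketch) on the allocation above: VacPageData declares
 * offsets[1], so sizeof(VacPageData) already includes room for one
 * OffsetNumber.  Allocating sizeof(VacPageData) plus
 * offsets_free * sizeof(OffsetNumber) therefore leaves one spare array
 * slot, the traditional safe form of the pre-C99 "struct hack".
 */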
3493 * Add a VacPage pointer to a VacPageList.
3495 * As a side effect of the way that scan_heap works,
3496 * higher pages come after lower pages in the array
3497 * (and highest tid on a page is last).
3500 vpage_insert(VacPageList vacpagelist, VacPage vpnew)
3502 #define PG_NPAGEDESC 1024
3504 /* allocate a VacPage entry if needed */
3505 if (vacpagelist->num_pages == 0)
3507 vacpagelist->pagedesc = (VacPage *) palloc(PG_NPAGEDESC * sizeof(VacPage));
3508 vacpagelist->num_allocated_pages = PG_NPAGEDESC;
3510 else if (vacpagelist->num_pages >= vacpagelist->num_allocated_pages)
3512 vacpagelist->num_allocated_pages *= 2;
3513 vacpagelist->pagedesc = (VacPage *) repalloc(vacpagelist->pagedesc, vacpagelist->num_allocated_pages * sizeof(VacPage));
3515 vacpagelist->pagedesc[vacpagelist->num_pages] = vpnew;
3516 (vacpagelist->num_pages)++;
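#ifdef NOT_USED
/*
 * Usage sketch (hypothetical code, not part of the original file):
 * accumulating reaped pages into a VacPageList.  vpage_insert starts
 * with PG_NPAGEDESC slots and doubles pagedesc as needed, so callers
 * simply insert in scan order; the array comes out sorted by block
 * number because scan_heap walks the relation sequentially.
 */
static void
vacpagelist_sketch(VacPage vacpage)
{
	VacPageListData list;

	list.num_pages = 0;
	list.num_allocated_pages = 0;
	list.empty_end_pages = 0;
	list.pagedesc = NULL;

	/* copy_vac_page makes a copy sized to the page's offsets_free */
	vpage_insert(&list, copy_vac_page(vacpage));
}
#endif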
3520 * vac_bsearch: just like standard C library routine bsearch(),
3521 * except that we first test to see whether the target key is outside
3522 * the range of the table entries. This case is handled relatively slowly
3523 * by the normal binary search algorithm (ie, no faster than any other key)
3524 * but it occurs often enough in VACUUM to be worth optimizing.
3527 vac_bsearch(const void *key, const void *base,
3528 size_t nelem, size_t size,
3529 int (*compar) (const void *, const void *))
3536 res = compar(key, base);
3540 return (void *) base;
3543 last = (const void *) ((const char *) base + (nelem - 1) * size);
3544 res = compar(key, last);
3548 return (void *) last;
3551 return NULL; /* already checked 'em all */
3552 return bsearch(key, base, nelem, size, compar);
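#ifdef NOT_USED
/*
 * Usage sketch (hypothetical code, not part of the original file): a
 * block-number lookup in a sorted VacPageList, in the same style as
 * tid_reaped() above.  The key is a pointer to a VacPage, because
 * vac_cmp_blk compares (*(VacPage *) p)->blkno.
 */
static bool
page_is_listed(VacPageList vacpagelist, BlockNumber blkno)
{
	VacPageData vacpage;
	VacPage		vp = &vacpage;
	VacPage    *vpp;

	vacpage.blkno = blkno;
	vpp = (VacPage *) vac_bsearch((void *) &vp,
								  (void *) (vacpagelist->pagedesc),
								  vacpagelist->num_pages,
								  sizeof(VacPage),
								  vac_cmp_blk);
	return vpp != NULL;
}
#endif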
3556 * Comparator routines for use with qsort() and bsearch().
3559 vac_cmp_blk(const void *left, const void *right)
3564 lblk = (*((VacPage *) left))->blkno;
3565 rblk = (*((VacPage *) right))->blkno;
3575 vac_cmp_offno(const void *left, const void *right)
3577 if (*(OffsetNumber *) left < *(OffsetNumber *) right)
3579 if (*(OffsetNumber *) left == *(OffsetNumber *) right)
3585 vac_cmp_vtlinks(const void *left, const void *right)
3587 if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi <
3588 ((VTupleLink) right)->new_tid.ip_blkid.bi_hi)
3590 if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi >
3591 ((VTupleLink) right)->new_tid.ip_blkid.bi_hi)
3593 /* bi_hi-es are equal */
3594 if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo <
3595 ((VTupleLink) right)->new_tid.ip_blkid.bi_lo)
3597 if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo >
3598 ((VTupleLink) right)->new_tid.ip_blkid.bi_lo)
3600 /* bi_lo-es are equal */
3601 if (((VTupleLink) left)->new_tid.ip_posid <
3602 ((VTupleLink) right)->new_tid.ip_posid)
3604 if (((VTupleLink) left)->new_tid.ip_posid >
3605 ((VTupleLink) right)->new_tid.ip_posid)
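/*
 * Editorial note (sketch): vac_cmp_vtlinks orders VTupleLinkData entries
 * by new_tid as a lexicographic (bi_hi, bi_lo, ip_posid) key, i.e. plain
 * ItemPointer order.  The vtlinks array is sorted with this comparator
 * so that vac_bsearch can map a child tuple's TID back to its parent
 * entry in O(log n) during chain moves.
 */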
3612 * Open all the indexes of the given relation, obtaining the specified kind
3613 * of lock on each. Return an array of Relation pointers for the indexes
3614 * into *Irel, and the number of indexes into *nindexes.
3617 vac_open_indexes(Relation relation, LOCKMODE lockmode,
3618 int *nindexes, Relation **Irel)
3621 ListCell *indexoidscan;
3624 Assert(lockmode != NoLock);
3626 indexoidlist = RelationGetIndexList(relation);
3628 *nindexes = list_length(indexoidlist);
3631 *Irel = (Relation *) palloc(*nindexes * sizeof(Relation));
3636 foreach(indexoidscan, indexoidlist)
3638 Oid indexoid = lfirst_oid(indexoidscan);
3640 (*Irel)[i++] = index_open(indexoid, lockmode);
3643 list_free(indexoidlist);
3647 * Release the resources acquired by vac_open_indexes. Optionally release
3648 * the locks (say NoLock to keep 'em).
3651 vac_close_indexes(int nindexes, Relation *Irel, LOCKMODE lockmode)
3658 Relation ind = Irel[nindexes];
3660 index_close(ind, lockmode);
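#ifdef NOT_USED
/*
 * Usage sketch (hypothetical code, not part of the original file): the
 * open/operate/close pattern the VACUUM code paths use around these
 * helpers.  Passing NoLock to vac_close_indexes keeps the index locks
 * until transaction end; the tuple count is a placeholder.
 */
static void
index_pass_sketch(Relation onerel, double num_tuples)
{
	Relation   *Irel;
	int			nindexes;
	int			i;

	vac_open_indexes(onerel, AccessExclusiveLock, &nindexes, &Irel);
	for (i = 0; i < nindexes; i++)
		scan_index(Irel[i], num_tuples);
	vac_close_indexes(nindexes, Irel, NoLock);
}
#endif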
3667 * Is an index partial (ie, could it contain fewer tuples than the heap?)
3670 vac_is_partial_index(Relation indrel)
3673 * If the index's AM doesn't support nulls, it's partial for our purposes
3675 if (!indrel->rd_am->amindexnulls)
3678 /* Otherwise, look to see if there's a partial-index predicate */
3679 if (!heap_attisnull(indrel->rd_indextuple, Anum_pg_index_indpred))
3687 enough_space(VacPage vacpage, Size len)
3689 len = MAXALIGN(len);
3691 if (len > vacpage->free)
3694 /* if there are free itemid(s) and len <= free_space... */
3695 if (vacpage->offsets_used < vacpage->offsets_free)
3698 /* noff_used >= noff_free, so we'll have to allocate a new itemid */
3699 if (len + sizeof(ItemIdData) <= vacpage->free)
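#ifdef NOT_USED
/*
 * Worked example for enough_space() (hypothetical numbers, not part of
 * the original file): a 64-byte tuple needs 64 bytes of free space if a
 * previously-freed line pointer can be recycled (offsets_used <
 * offsets_free), but 64 + sizeof(ItemIdData) = 68 bytes if a new line
 * pointer must be allocated.  This mirrors the reservation made while
 * planning chain moves above: free -= MAXALIGN(tlen), plus
 * sizeof(ItemIdData) when no recyclable line pointer remains.
 */
static void
enough_space_sketch(void)
{
	VacPageData page;

	page.blkno = 0;
	page.free = 66;
	page.offsets_used = 0;
	page.offsets_free = 1;		/* one recyclable line pointer */
	Assert(enough_space(&page, 64));	/* 64 <= 66 */

	page.offsets_free = 0;		/* must allocate a new line pointer */
	Assert(!enough_space(&page, 64));	/* 64 + 4 > 66 */
}
#endif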
3706 PageGetFreeSpaceWithFillFactor(Relation relation, Page page)
3708 Size freespace = PageGetHeapFreeSpace(page);
3711 targetfree = RelationGetTargetPageFreeSpace(relation,
3712 HEAP_DEFAULT_FILLFACTOR);
3713 if (freespace > targetfree)
3714 return freespace - targetfree;
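/*
 * Worked example (hypothetical numbers): with fillfactor = 70 on an
 * 8 kB page, RelationGetTargetPageFreeSpace reserves 30% of BLCKSZ,
 * about 2457 bytes.  A page with 3000 bytes of heap free space thus
 * reports only 3000 - 2457 = 543 usable bytes for tuple moves, and a
 * page holding less than the reserved amount reports zero.
 */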
3720 * vacuum_delay_point --- check for interrupts and cost-based delay.
3722 * This should be called in each major loop of VACUUM processing,
3723 * typically once per page processed.
3726 vacuum_delay_point(void)
3728 /* Always check for interrupts */
3729 CHECK_FOR_INTERRUPTS();
3731 /* Nap if appropriate */
3732 if (VacuumCostActive && !InterruptPending &&
3733 VacuumCostBalance >= VacuumCostLimit)
3737 msec = VacuumCostDelay * VacuumCostBalance / VacuumCostLimit;
3738 if (msec > VacuumCostDelay * 4)
3739 msec = VacuumCostDelay * 4;
3741 pg_usleep(msec * 1000L);
3743 VacuumCostBalance = 0;
3745 /* update balance values for workers */
3746 AutoVacuumUpdateDelay();
3748 /* Might have gotten an interrupt while sleeping */
3749 CHECK_FOR_INTERRUPTS();
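/*
 * Worked example (hypothetical settings): with vacuum_cost_delay = 20 ms
 * and vacuum_cost_limit = 200, reaching the check above with a balance
 * of 350 yields msec = 20 * 350 / 200 = 35 ms.  The nap is capped at
 * 4 * VacuumCostDelay = 80 ms, so an unusually large balance cannot
 * stall the backend for long, and the balance resets to zero after
 * sleeping.
 */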