/*-------------------------------------------------------------------------
 *
 * vacuum.c
 *	  The postgres vacuum cleaner.
 *
 * This file includes the "full" version of VACUUM, as well as control code
 * used by all three of full VACUUM, lazy VACUUM, and ANALYZE.  See
 * vacuumlazy.c and analyze.c for the rest of the code for the latter two.
 *
 *
 * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.311 2005/07/14 05:13:39 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/clog.h"
#include "access/genam.h"
#include "access/heapam.h"
#include "access/subtrans.h"
#include "access/xlog.h"
#include "catalog/catalog.h"
#include "catalog/namespace.h"
#include "catalog/pg_database.h"
#include "catalog/pg_index.h"
#include "commands/dbcommands.h"
#include "commands/vacuum.h"
#include "executor/executor.h"
#include "miscadmin.h"
#include "storage/freespace.h"
#include "storage/procarray.h"
#include "storage/smgr.h"
#include "tcop/pquery.h"
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/relcache.h"
#include "utils/syscache.h"
#include "pgstat.h"
typedef struct VacPageData
{
	BlockNumber blkno;			/* BlockNumber of this Page */
	Size		free;			/* FreeSpace on this Page */
	uint16		offsets_used;	/* Number of OffNums used by vacuum */
	uint16		offsets_free;	/* Number of OffNums free or to be free */
	OffsetNumber offsets[1];	/* Array of free OffNums */
} VacPageData;

typedef VacPageData *VacPage;

typedef struct VacPageListData
{
	BlockNumber empty_end_pages;	/* Number of "empty" end-pages */
	int			num_pages;			/* Number of pages in pagedesc */
	int			num_allocated_pages;	/* Number of allocated pages in
										 * pagedesc */
	VacPage    *pagedesc;			/* Descriptions of pages */
} VacPageListData;

typedef VacPageListData *VacPageList;

typedef struct VTupleLinkData
{
	ItemPointerData new_tid;	/* t_ctid link: TID of the newer version */
	ItemPointerData this_tid;	/* t_self of this tuple */
} VTupleLinkData;

typedef VTupleLinkData *VTupleLink;

typedef struct VTupleMoveData
{
	ItemPointerData tid;		/* tuple ID */
	VacPage		vacpage;		/* where to move */
	bool		cleanVpd;		/* clean vacpage before using */
} VTupleMoveData;

typedef VTupleMoveData *VTupleMove;

typedef struct VRelStats
{
	BlockNumber rel_pages;
	double		rel_tuples;
	Size		min_tlen;
	Size		max_tlen;
	bool		hasindex;
	int			num_vtlinks;
	VTupleLink	vtlinks;
} VRelStats;
/*----------------------------------------------------------------------
 * ExecContext:
 *
 * As these variables always appear together, we put them into one struct
 * and pull initialization and cleanup into separate routines.
 * ExecContext is used by repair_frag() and move_xxx_tuple().  More
 * accurately:	It is *used* only in move_xxx_tuple(), but because this
 * routine is called many times, we initialize the struct just once in
 * repair_frag() and pass it on to move_xxx_tuple().
 */
typedef struct ExecContextData
{
	ResultRelInfo *resultRelInfo;
	EState	   *estate;
	TupleTableSlot *slot;
} ExecContextData;

typedef ExecContextData *ExecContext;

static void
ExecContext_Init(ExecContext ec, Relation rel)
{
	TupleDesc	tupdesc = RelationGetDescr(rel);

	/*
	 * We need a ResultRelInfo and an EState so we can use the regular
	 * executor's index-entry-making machinery.
	 */
	ec->estate = CreateExecutorState();

	ec->resultRelInfo = makeNode(ResultRelInfo);
	ec->resultRelInfo->ri_RangeTableIndex = 1;	/* dummy */
	ec->resultRelInfo->ri_RelationDesc = rel;
	ec->resultRelInfo->ri_TrigDesc = NULL;	/* we don't fire triggers */

	ExecOpenIndices(ec->resultRelInfo);

	ec->estate->es_result_relations = ec->resultRelInfo;
	ec->estate->es_num_result_relations = 1;
	ec->estate->es_result_relation_info = ec->resultRelInfo;

	/* Set up a tuple slot too */
	ec->slot = MakeSingleTupleTableSlot(tupdesc);
}

static void
ExecContext_Finish(ExecContext ec)
{
	ExecDropSingleTupleTableSlot(ec->slot);
	ExecCloseIndices(ec->resultRelInfo);
	FreeExecutorState(ec->estate);
}
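/*
 * Illustrative usage sketch (not a verbatim excerpt of repair_frag): the
 * context is built once per relation and reused for every moved tuple.
 *
 *		ExecContextData ec;
 *
 *		ExecContext_Init(&ec, onerel);
 *		for each tuple placed on a new page:
 *			ExecStoreTuple(&newtup, ec.slot, InvalidBuffer, false);
 *			ExecInsertIndexTuples(ec.slot, &newtup.t_self, ec.estate, true);
 *		ExecContext_Finish(&ec);
 */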
/*
 * End of ExecContext Implementation
 *----------------------------------------------------------------------
 */
static MemoryContext vac_context = NULL;

static int	elevel = -1;

static TransactionId OldestXmin;
static TransactionId FreezeLimit;


/* non-export function prototypes */
static List *get_rel_oids(List *relids, const RangeVar *vacrel,
			 const char *stmttype);
static void vac_update_dbstats(Oid dbid,
				   TransactionId vacuumXID,
				   TransactionId frozenXID);
static void vac_truncate_clog(TransactionId vacuumXID,
				  TransactionId frozenXID);
static bool vacuum_rel(Oid relid, VacuumStmt *vacstmt, char expected_relkind);
static void full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt);
static void scan_heap(VRelStats *vacrelstats, Relation onerel,
		  VacPageList vacuum_pages, VacPageList fraged_pages);
static void repair_frag(VRelStats *vacrelstats, Relation onerel,
			VacPageList vacuum_pages, VacPageList fraged_pages,
			int nindexes, Relation *Irel);
static void move_chain_tuple(Relation rel,
				 Buffer old_buf, Page old_page, HeapTuple old_tup,
				 Buffer dst_buf, Page dst_page, VacPage dst_vacpage,
				 ExecContext ec, ItemPointer ctid, bool cleanVpd);
static void move_plain_tuple(Relation rel,
				 Buffer old_buf, Page old_page, HeapTuple old_tup,
				 Buffer dst_buf, Page dst_page, VacPage dst_vacpage,
				 ExecContext ec);
static void update_hint_bits(Relation rel, VacPageList fraged_pages,
				 int num_fraged_pages, BlockNumber last_move_dest_block,
				 int num_moved);
static void vacuum_heap(VRelStats *vacrelstats, Relation onerel,
			VacPageList vacpagelist);
static void vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage);
static void vacuum_index(VacPageList vacpagelist, Relation indrel,
			 double num_tuples, int keep_tuples);
static void scan_index(Relation indrel, double num_tuples);
static bool tid_reaped(ItemPointer itemptr, void *state);
static bool dummy_tid_reaped(ItemPointer itemptr, void *state);
static void vac_update_fsm(Relation onerel, VacPageList fraged_pages,
			   BlockNumber rel_pages);
static VacPage copy_vac_page(VacPage vacpage);
static void vpage_insert(VacPageList vacpagelist, VacPage vpnew);
static void *vac_bsearch(const void *key, const void *base,
			size_t nelem, size_t size,
			int (*compar) (const void *, const void *));
static int	vac_cmp_blk(const void *left, const void *right);
static int	vac_cmp_offno(const void *left, const void *right);
static int	vac_cmp_vtlinks(const void *left, const void *right);
static bool enough_space(VacPage vacpage, Size len);
/****************************************************************************
 *																			*
 *			Code common to all flavors of VACUUM and ANALYZE				*
 *																			*
 ****************************************************************************
 */


/*
 * Primary entry point for VACUUM and ANALYZE commands.
 *
 * relids is normally NIL; if it is not, then it provides the list of
 * relation OIDs to be processed, and vacstmt->relation is ignored.
 * (The non-NIL case is currently only used by autovacuum.)
 *
 * It is the caller's responsibility that both vacstmt and relids
 * (if given) be allocated in a memory context that won't disappear
 * at transaction commit.  In fact this context must be QueryContext
 * to avoid complaints from PreventTransactionChain.
 */
void
vacuum(VacuumStmt *vacstmt, List *relids)
{
	const char *stmttype = vacstmt->vacuum ? "VACUUM" : "ANALYZE";
	TransactionId initialOldestXmin = InvalidTransactionId;
	TransactionId initialFreezeLimit = InvalidTransactionId;
	volatile MemoryContext anl_context = NULL;
	volatile bool all_rels,
				in_outer_xact,
				use_own_xacts;
	List	   *relations;

	if (vacstmt->verbose)
		elevel = INFO;
	else
		elevel = DEBUG2;

	/*
	 * We cannot run VACUUM inside a user transaction block; if we were
	 * inside a transaction, then our commit- and
	 * start-transaction-command calls would not have the intended effect!
	 * Furthermore, the forced commit that occurs before truncating the
	 * relation's file would have the effect of committing the rest of the
	 * user's transaction too, which would certainly not be the desired
	 * behavior.  (This only applies to VACUUM FULL, though.  We could in
	 * theory run lazy VACUUM inside a transaction block, but we choose to
	 * disallow that case because we'd rather commit as soon as possible
	 * after finishing the vacuum.  This is mainly so that we can let go of
	 * the AccessExclusiveLock that we may be holding.)
	 *
	 * ANALYZE (without VACUUM) can run either way.
	 */
	if (vacstmt->vacuum)
	{
		PreventTransactionChain((void *) vacstmt, stmttype);
		in_outer_xact = false;
	}
	else
		in_outer_xact = IsInTransactionChain((void *) vacstmt);

	/*
	 * Disallow the combination VACUUM FULL FREEZE; although it would mostly
	 * work, VACUUM FULL's ability to move tuples around means that it is
	 * injecting its own XID into tuple visibility checks.  We'd have to
	 * guarantee that every moved tuple is properly marked XMIN_COMMITTED or
	 * XMIN_INVALID before the end of the operation.  There are corner cases
	 * where this does not happen, and getting rid of them all seems hard
	 * (not to mention fragile to maintain).  On the whole it's not worth it
	 * compared to telling people to use two operations.  See pgsql-hackers
	 * discussion of 27-Nov-2004, and comments below for update_hint_bits().
	 *
	 * Note: this is enforced here, and not in the grammar, since (a) we can
	 * give a better error message, and (b) we might want to allow it again
	 * someday.
	 */
	if (vacstmt->vacuum && vacstmt->full && vacstmt->freeze)
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("VACUUM FULL FREEZE is not supported"),
				 errhint("Use VACUUM FULL, then VACUUM FREEZE.")));

	/*
	 * Send info about dead objects to the statistics collector
	 */
	if (vacstmt->vacuum)
		pgstat_vacuum_tabstat();

	/*
	 * Create special memory context for cross-transaction storage.
	 *
	 * Since it is a child of PortalContext, it will go away eventually even
	 * if we suffer an error; there's no need for special abort cleanup
	 * logic.
	 */
	vac_context = AllocSetContextCreate(PortalContext,
										"Vacuum",
										ALLOCSET_DEFAULT_MINSIZE,
										ALLOCSET_DEFAULT_INITSIZE,
										ALLOCSET_DEFAULT_MAXSIZE);

	/* Remember whether we are processing everything in the DB */
	all_rels = (relids == NIL && vacstmt->relation == NULL);

	/*
	 * Build list of relations to process, unless caller gave us one.
	 * (If we build one, we put it in vac_context for safekeeping.)
	 */
	relations = get_rel_oids(relids, vacstmt->relation, stmttype);
	if (vacstmt->vacuum && all_rels)
	{
		/*
		 * It's a database-wide VACUUM.
		 *
		 * Compute the initially applicable OldestXmin and FreezeLimit XIDs,
		 * so that we can record these values at the end of the VACUUM.
		 * Note that individual tables may well be processed with newer
		 * values, but we can guarantee that no (non-shared) relations are
		 * processed with older ones.
		 *
		 * It is okay to record non-shared values in pg_database, even though
		 * we may vacuum shared relations with older cutoffs, because only
		 * the minimum of the values present in pg_database matters.  We
		 * can be sure that shared relations have at some time been
		 * vacuumed with cutoffs no worse than the global minimum; for, if
		 * there is a backend in some other DB with xmin = OLDXMIN that's
		 * determining the cutoff with which we vacuum shared relations,
		 * it is not possible for that database to have a cutoff newer
		 * than OLDXMIN recorded in pg_database.
		 */
		vacuum_set_xid_limits(vacstmt, false,
							  &initialOldestXmin,
							  &initialFreezeLimit);
	}

	/*
	 * Decide whether we need to start/commit our own transactions.
	 *
	 * For VACUUM (with or without ANALYZE): always do so, so that we can
	 * release locks as soon as possible.  (We could possibly use the
	 * outer transaction for a one-table VACUUM, but handling TOAST tables
	 * would be problematic.)
	 *
	 * For ANALYZE (no VACUUM): if inside a transaction block, we cannot
	 * start/commit our own transactions.  Also, there's no need to do so
	 * if only processing one relation.  For multiple relations when not
	 * within a transaction block, use own transactions so we can release
	 * locks sooner.
	 */
	if (vacstmt->vacuum)
		use_own_xacts = true;
	else
	{
		Assert(vacstmt->analyze);
		if (in_outer_xact)
			use_own_xacts = false;
		else if (list_length(relations) > 1)
			use_own_xacts = true;
		else
			use_own_xacts = false;
	}

	/*
	 * If we are running ANALYZE without per-table transactions, we'll
	 * need a memory context with table lifetime.
	 */
	if (!use_own_xacts)
		anl_context = AllocSetContextCreate(PortalContext,
											"Analyze",
											ALLOCSET_DEFAULT_MINSIZE,
											ALLOCSET_DEFAULT_INITSIZE,
											ALLOCSET_DEFAULT_MAXSIZE);

	/*
	 * vacuum_rel expects to be entered with no transaction active; it
	 * will start and commit its own transaction.  But we are called by an
	 * SQL command, and so we are executing inside a transaction already.
	 * We commit the transaction started in PostgresMain() here, and start
	 * another one before exiting to match the commit waiting for us back
	 * in PostgresMain().
	 */
	if (use_own_xacts)
	{
		/* matches the StartTransaction in PostgresMain() */
		CommitTransactionCommand();
	}

	/* Turn vacuum cost accounting on or off */
	PG_TRY();
	{
		ListCell   *cur;

		VacuumCostActive = (VacuumCostDelay > 0);
		VacuumCostBalance = 0;
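		/*
		 * Note on cost accounting (a summary of vacuum_delay_point's
		 * behavior): while VacuumCostActive, buffer accesses accumulate
		 * into VacuumCostBalance; once the balance reaches
		 * VacuumCostLimit, vacuum_delay_point() naps (on the order of
		 * VacuumCostDelay milliseconds) and resets the balance, thereby
		 * throttling vacuum's I/O demand.
		 */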
		/*
		 * Loop to process each selected relation.
		 */
		foreach(cur, relations)
		{
			Oid			relid = lfirst_oid(cur);

			if (vacstmt->vacuum)
			{
				if (!vacuum_rel(relid, vacstmt, RELKIND_RELATION))
					all_rels = false;	/* forget about updating dbstats */
			}
			if (vacstmt->analyze)
			{
				MemoryContext old_context = NULL;

				/*
				 * If using separate xacts, start one for analyze.
				 * Otherwise, we can use the outer transaction, but we
				 * still need to call analyze_rel in a memory context that
				 * will be cleaned up on return (else we leak memory while
				 * processing multiple tables).
				 */
				if (use_own_xacts)
				{
					StartTransactionCommand();
					/* functions in indexes may want a snapshot set */
					ActiveSnapshot = CopySnapshot(GetTransactionSnapshot());
				}
				else
					old_context = MemoryContextSwitchTo(anl_context);

				/*
				 * Tell the buffer replacement strategy that vacuum is
				 * causing the ANALYZE
				 */
				StrategyHintVacuum(true);

				analyze_rel(relid, vacstmt);

				StrategyHintVacuum(false);

				if (use_own_xacts)
					CommitTransactionCommand();
				else
				{
					MemoryContextSwitchTo(old_context);
					MemoryContextResetAndDeleteChildren(anl_context);
				}
			}
		}
	}
	PG_CATCH();
	{
		/* Make sure cost accounting is turned off after error */
		VacuumCostActive = false;
		PG_RE_THROW();
	}
	PG_END_TRY();

	/* Turn off vacuum cost accounting */
	VacuumCostActive = false;

	/*
	 * Finish up processing.
	 */
	if (use_own_xacts)
	{
		/* here, we are not in a transaction */

		/*
		 * This matches the CommitTransaction waiting for us in
		 * PostgresMain().
		 */
		StartTransactionCommand();
	}

	if (vacstmt->vacuum)
	{
		/*
		 * If it was a database-wide VACUUM, print FSM usage statistics
		 * (we don't make you be superuser to see these).
		 */
		if (vacstmt->relation == NULL)
			PrintFreeSpaceMapStatistics(elevel);

		/*
		 * If we completed a database-wide VACUUM without skipping any
		 * relations, update the database's pg_database row with info
		 * about the transaction IDs used, and try to truncate pg_clog.
		 */
		if (all_rels)
		{
			vac_update_dbstats(MyDatabaseId,
							   initialOldestXmin, initialFreezeLimit);
			vac_truncate_clog(initialOldestXmin, initialFreezeLimit);
		}
	}

	/*
	 * Clean up working storage --- note we must do this after
	 * StartTransactionCommand, else we might be trying to delete the
	 * active context!
	 */
	MemoryContextDelete(vac_context);
	vac_context = NULL;

	if (anl_context)
		MemoryContextDelete(anl_context);
}
/*
 * Build a list of Oids for each relation to be processed
 *
 * The list is built in vac_context so that it will survive across our
 * per-relation transactions.
 */
static List *
get_rel_oids(List *relids, const RangeVar *vacrel, const char *stmttype)
{
	List	   *oid_list = NIL;
	MemoryContext oldcontext;

	/* List supplied by VACUUM's caller? */
	if (relids)
		return relids;

	if (vacrel)
	{
		/* Process a specific relation */
		Oid			relid;

		relid = RangeVarGetRelid(vacrel, false);

		/* Make a relation list entry for this guy */
		oldcontext = MemoryContextSwitchTo(vac_context);
		oid_list = lappend_oid(oid_list, relid);
		MemoryContextSwitchTo(oldcontext);
	}
	else
	{
		/* Process all plain relations listed in pg_class */
		Relation	pgclass;
		HeapScanDesc scan;
		HeapTuple	tuple;
		ScanKeyData key;

		ScanKeyInit(&key,
					Anum_pg_class_relkind,
					BTEqualStrategyNumber, F_CHAREQ,
					CharGetDatum(RELKIND_RELATION));

		pgclass = heap_open(RelationRelationId, AccessShareLock);

		scan = heap_beginscan(pgclass, SnapshotNow, 1, &key);

		while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
		{
			/* Make a relation list entry for this guy */
			oldcontext = MemoryContextSwitchTo(vac_context);
			oid_list = lappend_oid(oid_list, HeapTupleGetOid(tuple));
			MemoryContextSwitchTo(oldcontext);
		}

		heap_endscan(scan);
		heap_close(pgclass, AccessShareLock);
	}

	return oid_list;
}
/*
 * vacuum_set_xid_limits() -- compute oldest-Xmin and freeze cutoff points
 */
void
vacuum_set_xid_limits(VacuumStmt *vacstmt, bool sharedRel,
					  TransactionId *oldestXmin,
					  TransactionId *freezeLimit)
{
	TransactionId limit;

	*oldestXmin = GetOldestXmin(sharedRel);

	Assert(TransactionIdIsNormal(*oldestXmin));

	if (vacstmt->freeze)
	{
		/* FREEZE option: use oldest Xmin as freeze cutoff too */
		limit = *oldestXmin;
	}
	else
	{
		/*
		 * Normal case: freeze cutoff is well in the past, to wit, about
		 * halfway to the wrap horizon
		 */
		limit = GetCurrentTransactionId() - (MaxTransactionId >> 2);
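		/*
		 * Worked example: with 32-bit XIDs, MaxTransactionId >> 2 is
		 * 0x3FFFFFFF, so the cutoff lies about 1.07 billion transactions
		 * in the past (using modulo-2^32 XID arithmetic).
		 */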
	}

	/*
	 * Be careful not to generate a "permanent" XID
	 */
	if (!TransactionIdIsNormal(limit))
		limit = FirstNormalTransactionId;

	/*
	 * Ensure sane relationship of limits
	 */
	if (TransactionIdFollows(limit, *oldestXmin))
	{
		ereport(WARNING,
				(errmsg("oldest xmin is far in the past"),
				 errhint("Close open transactions soon to avoid wraparound problems.")));
		limit = *oldestXmin;
	}

	*freezeLimit = limit;
}
/*
 *	vac_update_relstats() -- update statistics for one relation
 *
 *		Update the whole-relation statistics that are kept in its pg_class
 *		row.  There are additional stats that will be updated if we are
 *		doing ANALYZE, but we always update these stats.  This routine works
 *		for both index and heap relation entries in pg_class.
 *
 *		We violate no-overwrite semantics here by storing new values for the
 *		statistics columns directly into the pg_class tuple that's already on
 *		the page.  The reason for this is that if we updated these tuples in
 *		the usual way, vacuuming pg_class itself wouldn't work very well ---
 *		by the time we got done with a vacuum cycle, most of the tuples in
 *		pg_class would've been obsoleted.  Of course, this only works for
 *		fixed-size never-null columns, but these are.
 *
 *		This routine is shared by full VACUUM, lazy VACUUM, and stand-alone
 *		ANALYZE.
 */
void
vac_update_relstats(Oid relid, BlockNumber num_pages, double num_tuples,
					bool hasindex)
{
	Relation	rd;
	HeapTupleData rtup;
	HeapTuple	ctup;
	Form_pg_class pgcform;
	Buffer		buffer;

	/*
	 * update number of tuples and number of pages in pg_class
	 */
	rd = heap_open(RelationRelationId, RowExclusiveLock);

	ctup = SearchSysCache(RELOID,
						  ObjectIdGetDatum(relid),
						  0, 0, 0);
	if (!HeapTupleIsValid(ctup))
		elog(ERROR, "pg_class entry for relid %u vanished during vacuuming",
			 relid);

	/* get the buffer cache tuple */
	rtup.t_self = ctup->t_self;
	ReleaseSysCache(ctup);
	if (!heap_fetch(rd, SnapshotNow, &rtup, &buffer, false, NULL))
		elog(ERROR, "pg_class entry for relid %u vanished during vacuuming",
			 relid);

	/* ensure no one else does this at the same time */
	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

	/* overwrite the existing statistics in the tuple */
	pgcform = (Form_pg_class) GETSTRUCT(&rtup);
	pgcform->relpages = (int32) num_pages;
	pgcform->reltuples = (float4) num_tuples;
	pgcform->relhasindex = hasindex;

	/*
	 * If we have discovered that there are no indexes, then there's no
	 * primary key either.  This could be done more thoroughly...
	 */
	if (!hasindex)
		pgcform->relhaspkey = false;

	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

	/*
	 * Invalidate the tuple in the catcaches; this also arranges to flush
	 * the relation's relcache entry.  (If we fail to commit for some
	 * reason, no flush will occur, but no great harm is done since there
	 * are no noncritical state updates here.)
	 */
	CacheInvalidateHeapTuple(rd, &rtup);

	/* Write the buffer */
	WriteBuffer(buffer);

	heap_close(rd, RowExclusiveLock);
}
/*
 *	vac_update_dbstats() -- update statistics for one database
 *
 *		Update the whole-database statistics that are kept in its pg_database
 *		row.
 *
 *		We violate no-overwrite semantics here by storing new values for the
 *		statistics columns directly into the tuple that's already on the page.
 *		As with vac_update_relstats, this avoids leaving dead tuples behind
 *		after a vacuum.
 *
 *		This routine is shared by full and lazy VACUUM.  Note that it is only
 *		applied after a database-wide VACUUM operation.
 *
 *		Note that we don't bother to update the flat-file copy of pg_database.
 */
static void
vac_update_dbstats(Oid dbid,
				   TransactionId vacuumXID,
				   TransactionId frozenXID)
{
	Relation	relation;
	ScanKeyData entry[1];
	HeapScanDesc scan;
	HeapTuple	tuple;
	Form_pg_database dbform;

	relation = heap_open(DatabaseRelationId, RowExclusiveLock);

	/* Must use a heap scan, since there's no syscache for pg_database */
	ScanKeyInit(&entry[0],
				ObjectIdAttributeNumber,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(dbid));

	scan = heap_beginscan(relation, SnapshotNow, 1, entry);

	tuple = heap_getnext(scan, ForwardScanDirection);

	if (!HeapTupleIsValid(tuple))
		elog(ERROR, "could not find tuple for database %u", dbid);

	/* ensure no one else does this at the same time */
	LockBuffer(scan->rs_cbuf, BUFFER_LOCK_EXCLUSIVE);

	dbform = (Form_pg_database) GETSTRUCT(tuple);

	/* overwrite the existing statistics in the tuple */
	dbform->datvacuumxid = vacuumXID;
	dbform->datfrozenxid = frozenXID;

	LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);

	/* invalidate the tuple in the cache and write the buffer */
	CacheInvalidateHeapTuple(relation, tuple);
	WriteNoReleaseBuffer(scan->rs_cbuf);

	heap_endscan(scan);
	heap_close(relation, RowExclusiveLock);
}
/*
 *	vac_truncate_clog() -- attempt to truncate the commit log
 *
 *		Scan pg_database to determine the system-wide oldest datvacuumxid,
 *		and use it to truncate the transaction commit log (pg_clog).
 *		Also update the XID wrap limit point maintained by varsup.c.
 *
 *		We also generate a warning if the system-wide oldest datfrozenxid
 *		seems to be in danger of wrapping around.  This is a long-in-advance
 *		warning; if we start getting uncomfortably close, GetNewTransactionId
 *		will generate more-annoying warnings, and ultimately refuse to issue
 *		any more XIDs.
 *
 *		The passed XIDs are simply the ones I just wrote into my pg_database
 *		entry.  They're used to initialize the "min" calculations.
 *
 *		This routine is shared by full and lazy VACUUM.  Note that it is only
 *		applied after a database-wide VACUUM operation.
 */
static void
vac_truncate_clog(TransactionId vacuumXID, TransactionId frozenXID)
{
	TransactionId myXID = GetCurrentTransactionId();
	Relation	relation;
	HeapScanDesc scan;
	HeapTuple	tuple;
	int32		age;
	NameData	oldest_datname;
	bool		vacuumAlreadyWrapped = false;
	bool		frozenAlreadyWrapped = false;

	/* init oldest_datname to sync with my frozenXID */
	namestrcpy(&oldest_datname, get_database_name(MyDatabaseId));

	/*
	 * Note: the "already wrapped" cases should now be impossible due to the
	 * defenses in GetNewTransactionId, but we keep them anyway.
	 */
	relation = heap_open(DatabaseRelationId, AccessShareLock);

	scan = heap_beginscan(relation, SnapshotNow, 0, NULL);

	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
	{
		Form_pg_database dbform = (Form_pg_database) GETSTRUCT(tuple);

		/* Ignore non-connectable databases (eg, template0) */
		/* It's assumed that these have been frozen correctly */
		if (!dbform->datallowconn)
			continue;

		if (TransactionIdIsNormal(dbform->datvacuumxid))
		{
			if (TransactionIdPrecedes(myXID, dbform->datvacuumxid))
				vacuumAlreadyWrapped = true;
			else if (TransactionIdPrecedes(dbform->datvacuumxid, vacuumXID))
				vacuumXID = dbform->datvacuumxid;
		}
		if (TransactionIdIsNormal(dbform->datfrozenxid))
		{
			if (TransactionIdPrecedes(myXID, dbform->datfrozenxid))
				frozenAlreadyWrapped = true;
			else if (TransactionIdPrecedes(dbform->datfrozenxid, frozenXID))
			{
				frozenXID = dbform->datfrozenxid;
				namecpy(&oldest_datname, &dbform->datname);
			}
		}
	}

	heap_endscan(scan);
	heap_close(relation, AccessShareLock);

	/*
	 * Do not truncate CLOG if we seem to have suffered wraparound
	 * already; the computed minimum XID might be bogus.
	 */
	if (vacuumAlreadyWrapped)
	{
		ereport(WARNING,
				(errmsg("some databases have not been vacuumed in over 2 billion transactions"),
				 errdetail("You may have already suffered transaction-wraparound data loss.")));
	}
	else
	{
		/* Truncate CLOG to the oldest vacuumxid */
		TruncateCLOG(vacuumXID);
	}

	/*
	 * Do not update varsup.c if we seem to have suffered wraparound
	 * already; the computed XID might be bogus.
	 */
	if (frozenAlreadyWrapped)
	{
		ereport(WARNING,
				(errmsg("some databases have not been vacuumed in over 1 billion transactions"),
				 errhint("Better vacuum them soon, or you may have a wraparound failure.")));
		return;
	}

	/* Update the wrap limit for GetNewTransactionId */
	SetTransactionIdLimit(frozenXID, &oldest_datname);

	/* Give warning about impending wraparound problems */
	age = (int32) (myXID - frozenXID);
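	/*
	 * Threshold arithmetic, for reference: (MaxTransactionId >> 3) * 3 is
	 * about 1.6 billion, while (MaxTransactionId >> 1) is about 2.1
	 * billion, so this warning starts roughly half a billion transactions
	 * before wraparound would force a shutdown.
	 */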
	if (age > (int32) ((MaxTransactionId >> 3) * 3))
		ereport(WARNING,
				(errmsg("database \"%s\" must be vacuumed within %u transactions",
						NameStr(oldest_datname),
						(MaxTransactionId >> 1) - age),
				 errhint("To avoid a database shutdown, execute a full-database VACUUM in \"%s\".",
						 NameStr(oldest_datname))));
}
/****************************************************************************
 *																			*
 *			Code common to both flavors of VACUUM							*
 *																			*
 ****************************************************************************
 */
/*
 *	vacuum_rel() -- vacuum one heap relation
 *
 *		Returns TRUE if we actually processed the relation (or can ignore it
 *		for some reason), FALSE if we failed to process it due to permissions
 *		or other reasons.  (A FALSE result really means that some data
 *		may have been left unvacuumed, so we can't update XID stats.)
 *
 *		Doing one heap at a time incurs extra overhead, since we need to
 *		check that the heap exists again just before we vacuum it.  The
 *		reason that we do this is so that vacuuming can be spread across
 *		many small transactions.  Otherwise, two-phase locking would require
 *		us to lock the entire database during one pass of the vacuum cleaner.
 *
 *		At entry and exit, we are not inside a transaction.
 */
static bool
vacuum_rel(Oid relid, VacuumStmt *vacstmt, char expected_relkind)
{
	LOCKMODE	lmode;
	Relation	onerel;
	LockRelId	onerelid;
	Oid			toast_relid;
	bool		result;

	/* Begin a transaction for vacuuming this relation */
	StartTransactionCommand();
	/* functions in indexes may want a snapshot set */
	ActiveSnapshot = CopySnapshot(GetTransactionSnapshot());

	/*
	 * Tell the cache replacement strategy that vacuum is causing all
	 * following IO
	 */
	StrategyHintVacuum(true);

	/*
	 * Check for user-requested abort.  Note we want this to be inside a
	 * transaction, so xact.c doesn't issue useless WARNING.
	 */
	CHECK_FOR_INTERRUPTS();

	/*
	 * Race condition -- if the pg_class tuple has gone away since the
	 * last time we saw it, we don't need to vacuum it.
	 */
	if (!SearchSysCacheExists(RELOID,
							  ObjectIdGetDatum(relid),
							  0, 0, 0))
	{
		StrategyHintVacuum(false);
		CommitTransactionCommand();
		return true;			/* okay 'cause no data there */
	}

	/*
	 * Determine the type of lock we want --- hard exclusive lock for a
	 * FULL vacuum, but just ShareUpdateExclusiveLock for concurrent
	 * vacuum.  Either way, we can be sure that no other backend is
	 * vacuuming the same table.
	 */
	lmode = vacstmt->full ? AccessExclusiveLock : ShareUpdateExclusiveLock;

	/*
	 * Open the class, get an appropriate lock on it, and check
	 * permissions.
	 *
	 * We allow the user to vacuum a table if he is superuser, the table
	 * owner, or the database owner (but in the latter case, only if it's
	 * not a shared relation).  pg_class_ownercheck includes the superuser
	 * case.
	 *
	 * Note we choose to treat permissions failure as a WARNING and keep
	 * trying to vacuum the rest of the DB --- is this appropriate?
	 */
	onerel = relation_open(relid, lmode);

	if (!(pg_class_ownercheck(RelationGetRelid(onerel), GetUserId()) ||
		  (pg_database_ownercheck(MyDatabaseId, GetUserId()) && !onerel->rd_rel->relisshared)))
	{
		ereport(WARNING,
				(errmsg("skipping \"%s\" --- only table or database owner can vacuum it",
						RelationGetRelationName(onerel))));
		relation_close(onerel, lmode);
		StrategyHintVacuum(false);
		CommitTransactionCommand();
		return false;
	}

	/*
	 * Check that it's a plain table; we used to do this in get_rel_oids()
	 * but seems safer to check after we've locked the relation.
	 */
	if (onerel->rd_rel->relkind != expected_relkind)
	{
		ereport(WARNING,
				(errmsg("skipping \"%s\" --- cannot vacuum indexes, views, or special system tables",
						RelationGetRelationName(onerel))));
		relation_close(onerel, lmode);
		StrategyHintVacuum(false);
		CommitTransactionCommand();
		return false;
	}

	/*
	 * Silently ignore tables that are temp tables of other backends ---
	 * trying to vacuum these will lead to great unhappiness, since their
	 * contents are probably not up-to-date on disk.  (We don't throw a
	 * warning here; it would just lead to chatter during a database-wide
	 * VACUUM.)
	 */
	if (isOtherTempNamespace(RelationGetNamespace(onerel)))
	{
		relation_close(onerel, lmode);
		StrategyHintVacuum(false);
		CommitTransactionCommand();
		return true;			/* assume no long-lived data in temp tables */
	}

	/*
	 * Get a session-level lock too. This will protect our access to the
	 * relation across multiple transactions, so that we can vacuum the
	 * relation's TOAST table (if any) secure in the knowledge that no one
	 * is deleting the parent relation.
	 *
	 * NOTE: this cannot block, even if someone else is waiting for access,
	 * because the lock manager knows that both lock requests are from the
	 * same process.
	 */
	onerelid = onerel->rd_lockInfo.lockRelId;
	LockRelationForSession(&onerelid, onerel->rd_istemp, lmode);

	/*
	 * Remember the relation's TOAST relation for later
	 */
	toast_relid = onerel->rd_rel->reltoastrelid;

	/*
	 * Do the actual work --- either FULL or "lazy" vacuum
	 */
	if (vacstmt->full)
		full_vacuum_rel(onerel, vacstmt);
	else
		lazy_vacuum_rel(onerel, vacstmt);

	result = true;				/* did the vacuum */

	/* all done with this class, but hold lock until commit */
	relation_close(onerel, NoLock);

	/*
	 * Complete the transaction and free all temporary memory used.
	 */
	StrategyHintVacuum(false);
	CommitTransactionCommand();

	/*
	 * If the relation has a secondary toast rel, vacuum that too while we
	 * still hold the session lock on the master table.  Note however that
	 * "analyze" will not get done on the toast table.  This is good,
	 * because the toaster always uses hardcoded index access and
	 * statistics are totally unimportant for toast relations.
	 */
	if (toast_relid != InvalidOid)
	{
		if (!vacuum_rel(toast_relid, vacstmt, RELKIND_TOASTVALUE))
			result = false;		/* failed to vacuum the TOAST table? */
	}

	/*
	 * Now release the session-level lock on the master table.
	 */
	UnlockRelationForSession(&onerelid, lmode);

	return result;
}
/****************************************************************************
 *																			*
 *			Code for VACUUM FULL (only)										*
 *																			*
 ****************************************************************************
 */
/*
 *	full_vacuum_rel() -- perform FULL VACUUM for one heap relation
 *
 *		This routine vacuums a single heap, cleans out its indexes, and
 *		updates its num_pages and num_tuples statistics.
 *
 *		At entry, we have already established a transaction and opened
 *		and locked the relation.
 */
static void
full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
{
	VacPageListData vacuum_pages;		/* List of pages to vacuum and/or
										 * clean indexes */
	VacPageListData fraged_pages;		/* List of pages with space enough
										 * for re-using */
	Relation   *Irel;
	int			nindexes,
				i;
	VRelStats  *vacrelstats;

	vacuum_set_xid_limits(vacstmt, onerel->rd_rel->relisshared,
						  &OldestXmin, &FreezeLimit);

	/*
	 * Set up statistics-gathering machinery.
	 */
	vacrelstats = (VRelStats *) palloc(sizeof(VRelStats));
	vacrelstats->rel_pages = 0;
	vacrelstats->rel_tuples = 0;
	vacrelstats->hasindex = false;

	/* scan the heap */
	vacuum_pages.num_pages = fraged_pages.num_pages = 0;
	scan_heap(vacrelstats, onerel, &vacuum_pages, &fraged_pages);

	/* Now open all indexes of the relation */
	vac_open_indexes(onerel, AccessExclusiveLock, &nindexes, &Irel);
	if (nindexes > 0)
		vacrelstats->hasindex = true;

	/* Clean/scan index relation(s) */
	if (Irel != NULL)
	{
		if (vacuum_pages.num_pages > 0)
		{
			for (i = 0; i < nindexes; i++)
				vacuum_index(&vacuum_pages, Irel[i],
							 vacrelstats->rel_tuples, 0);
		}
		else
		{
			/* just scan indexes to update statistics */
			for (i = 0; i < nindexes; i++)
				scan_index(Irel[i], vacrelstats->rel_tuples);
		}
	}

	if (fraged_pages.num_pages > 0)
	{
		/* Try to shrink heap */
		repair_frag(vacrelstats, onerel, &vacuum_pages, &fraged_pages,
					nindexes, Irel);
		vac_close_indexes(nindexes, Irel, NoLock);
	}
	else
	{
		vac_close_indexes(nindexes, Irel, NoLock);
		if (vacuum_pages.num_pages > 0)
		{
			/* Clean pages from vacuum_pages list */
			vacuum_heap(vacrelstats, onerel, &vacuum_pages);
		}
	}

	/* update shared free space map with final free space info */
	vac_update_fsm(onerel, &fraged_pages, vacrelstats->rel_pages);

	/* update statistics in pg_class */
	vac_update_relstats(RelationGetRelid(onerel), vacrelstats->rel_pages,
						vacrelstats->rel_tuples, vacrelstats->hasindex);

	/* report results to the stats collector, too */
	pgstat_report_vacuum(RelationGetRelid(onerel), vacstmt->analyze,
						 vacrelstats->rel_tuples);
}
/*
 *	scan_heap() -- scan an open heap relation
 *
 *		This routine sets commit status bits, constructs vacuum_pages (list
 *		of pages we need to compact free space on and/or clean indexes of
 *		deleted tuples), constructs fraged_pages (list of pages with free
 *		space that tuples could be moved into), and calculates statistics
 *		on the number of live tuples in the heap.
 */
static void
scan_heap(VRelStats *vacrelstats, Relation onerel,
		  VacPageList vacuum_pages, VacPageList fraged_pages)
{
	BlockNumber nblocks,
				blkno;
	HeapTupleData tuple;
	char	   *relname;
	VacPage		vacpage;
	BlockNumber empty_pages,
				empty_end_pages;
	double		num_tuples,
				tups_vacuumed,
				nkeep,
				nunused;
	double		free_space,
				usable_free_space;
	Size		min_tlen = MaxTupleSize;
	Size		max_tlen = 0;
	bool		do_shrinking = true;
	VTupleLink	vtlinks = (VTupleLink) palloc(100 * sizeof(VTupleLinkData));
	int			num_vtlinks = 0;
	int			free_vtlinks = 100;
	VacRUsage	ru0;

	vac_init_rusage(&ru0);

	relname = RelationGetRelationName(onerel);
	ereport(elevel,
			(errmsg("vacuuming \"%s.%s\"",
					get_namespace_name(RelationGetNamespace(onerel)),
					relname)));

	empty_pages = empty_end_pages = 0;
	num_tuples = tups_vacuumed = nkeep = nunused = 0;
	free_space = 0;

	nblocks = RelationGetNumberOfBlocks(onerel);

	/*
	 * We initially create each VacPage item in a maximal-sized workspace,
	 * then copy the workspace into a just-large-enough copy.
	 */
	vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber));
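	/*
	 * Size note (assuming the default 8K BLCKSZ): MaxOffsetNumber is
	 * BLCKSZ / sizeof(ItemIdData) = 2048, so this workspace is roughly
	 * 4KB; only the just-large-enough copies made by copy_vac_page()
	 * accumulate in the page lists.
	 */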
	for (blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf;
		Page		page,
					tempPage = NULL;
		bool		do_reap,
					do_frag;
		bool		notup = true;
		bool		pgchanged = false;
		OffsetNumber offnum,
					maxoff;

		vacuum_delay_point();

		buf = ReadBuffer(onerel, blkno);
		page = BufferGetPage(buf);

		/*
		 * We don't bother to do LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE)
		 * because we assume that holding exclusive lock on the relation
		 * will keep other backends from looking at the page.
		 */

		vacpage->blkno = blkno;
		vacpage->offsets_used = 0;
		vacpage->offsets_free = 0;

		if (PageIsNew(page))
		{
			VacPage		vacpagecopy;

			ereport(WARNING,
					(errmsg("relation \"%s\" page %u is uninitialized --- fixing",
							relname, blkno)));
			PageInit(page, BufferGetPageSize(buf), 0);
			vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
			free_space += vacpage->free;
			empty_pages++;
			empty_end_pages++;
			vacpagecopy = copy_vac_page(vacpage);
			vpage_insert(vacuum_pages, vacpagecopy);
			vpage_insert(fraged_pages, vacpagecopy);
			WriteBuffer(buf);
			continue;
		}

		if (PageIsEmpty(page))
		{
			VacPage		vacpagecopy;

			vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
			free_space += vacpage->free;
			empty_pages++;
			empty_end_pages++;
			vacpagecopy = copy_vac_page(vacpage);
			vpage_insert(vacuum_pages, vacpagecopy);
			vpage_insert(fraged_pages, vacpagecopy);
			ReleaseBuffer(buf);
			continue;
		}

		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			ItemId		itemid = PageGetItemId(page, offnum);
			bool		tupgone = false;

			/*
			 * Collect un-used items too - it's possible to have indexes
			 * pointing here after crash.
			 */
			if (!ItemIdIsUsed(itemid))
			{
				vacpage->offsets[vacpage->offsets_free++] = offnum;
				nunused += 1;
				continue;
			}

			tuple.t_datamcxt = NULL;
			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple.t_len = ItemIdGetLength(itemid);
			ItemPointerSet(&(tuple.t_self), blkno, offnum);

			switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf))
			{
				case HEAPTUPLE_DEAD:
					tupgone = true;		/* we can delete the tuple */
					break;
				case HEAPTUPLE_LIVE:

					/*
					 * Tuple is good.  Consider whether to replace its
					 * xmin value with FrozenTransactionId.
					 */
					if (TransactionIdIsNormal(HeapTupleHeaderGetXmin(tuple.t_data)) &&
						TransactionIdPrecedes(HeapTupleHeaderGetXmin(tuple.t_data),
											  FreezeLimit))
					{
						HeapTupleHeaderSetXmin(tuple.t_data, FrozenTransactionId);
						/* infomask should be okay already */
						Assert(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED);
						pgchanged = true;
					}

					if (onerel->rd_rel->relhasoids &&
						!OidIsValid(HeapTupleGetOid(&tuple)))
						elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid",
							 relname, blkno, offnum);
					break;
				case HEAPTUPLE_RECENTLY_DEAD:

					/*
					 * If tuple is recently deleted then we must not
					 * remove it from relation.
					 */
					nkeep += 1;

					/*
					 * If we are doing shrinking and this tuple is an
					 * updated one, then remember it to construct
					 * update-chain dependencies.
					 */
					if (do_shrinking &&
						!(ItemPointerEquals(&(tuple.t_self),
											&(tuple.t_data->t_ctid))))
					{
						if (free_vtlinks == 0)
						{
							free_vtlinks = 1000;
							vtlinks = (VTupleLink) repalloc(vtlinks,
											   (free_vtlinks + num_vtlinks) *
												 sizeof(VTupleLinkData));
						}
						vtlinks[num_vtlinks].new_tid = tuple.t_data->t_ctid;
						vtlinks[num_vtlinks].this_tid = tuple.t_self;
						num_vtlinks++;
						free_vtlinks--;
					}
					break;
				case HEAPTUPLE_INSERT_IN_PROGRESS:

					/*
					 * This should not happen, since we hold exclusive
					 * lock on the relation; shouldn't we raise an error?
					 * (Actually, it can happen in system catalogs, since
					 * we tend to release write lock before commit there.)
					 */
					ereport(NOTICE,
							(errmsg("relation \"%s\" TID %u/%u: InsertTransactionInProgress %u --- can't shrink relation",
									relname, blkno, offnum, HeapTupleHeaderGetXmin(tuple.t_data))));
					do_shrinking = false;
					break;
				case HEAPTUPLE_DELETE_IN_PROGRESS:

					/*
					 * This should not happen, since we hold exclusive
					 * lock on the relation; shouldn't we raise an error?
					 * (Actually, it can happen in system catalogs, since
					 * we tend to release write lock before commit there.)
					 */
					ereport(NOTICE,
							(errmsg("relation \"%s\" TID %u/%u: DeleteTransactionInProgress %u --- can't shrink relation",
									relname, blkno, offnum, HeapTupleHeaderGetXmax(tuple.t_data))));
					do_shrinking = false;
					break;
				default:
					elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
					break;
			}

			if (tupgone)
			{
				ItemId		lpp;

				/*
				 * Here we are building a temporary copy of the page with
				 * dead tuples removed.  Below we will apply
				 * PageRepairFragmentation to the copy, so that we can
				 * determine how much space will be available after
				 * removal of dead tuples.  But note we are NOT changing
				 * the real page yet...
				 */
				if (tempPage == NULL)
				{
					Size		pageSize;

					pageSize = PageGetPageSize(page);
					tempPage = (Page) palloc(pageSize);
					memcpy(tempPage, page, pageSize);
				}

				/* mark it unused on the temp page */
				lpp = PageGetItemId(tempPage, offnum);
				lpp->lp_flags &= ~LP_USED;

				vacpage->offsets[vacpage->offsets_free++] = offnum;
				tups_vacuumed += 1;
			}
			else
			{
				num_tuples += 1;
				notup = false;
				if (tuple.t_len < min_tlen)
					min_tlen = tuple.t_len;
				if (tuple.t_len > max_tlen)
					max_tlen = tuple.t_len;
			}
		}						/* scan along page */

		if (tempPage != NULL)
		{
			/* Some tuples are removable; figure free space after removal */
			PageRepairFragmentation(tempPage, NULL);
			vacpage->free = ((PageHeader) tempPage)->pd_upper - ((PageHeader) tempPage)->pd_lower;
			pfree(tempPage);
			do_reap = true;
		}
		else
		{
			/* Just use current available space */
			vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
			/* Need to reap the page if it has ~LP_USED line pointers */
			do_reap = (vacpage->offsets_free > 0);
		}

		free_space += vacpage->free;

		/*
		 * Add the page to fraged_pages if it has a useful amount of free
		 * space.  "Useful" means enough for a minimal-sized tuple.  But we
		 * don't know that accurately near the start of the relation, so
		 * add pages unconditionally if they have >= BLCKSZ/10 free space.
		 */
		do_frag = (vacpage->free >= min_tlen || vacpage->free >= BLCKSZ / 10);
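		/*
		 * Example: with the default BLCKSZ of 8192, the unconditional
		 * threshold is 819 bytes of free space.  min_tlen is still being
		 * discovered during this same scan, so pages near the start of
		 * the relation rely on that fallback.
		 */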
		if (do_reap || do_frag)
		{
			VacPage		vacpagecopy = copy_vac_page(vacpage);

			if (do_reap)
				vpage_insert(vacuum_pages, vacpagecopy);
			if (do_frag)
				vpage_insert(fraged_pages, vacpagecopy);
		}

		/*
		 * Include the page in empty_end_pages if it will be empty after
		 * vacuuming; this is to keep us from using it as a move
		 * destination.
		 */
		if (notup)
		{
			empty_pages++;
			empty_end_pages++;
		}
		else
			empty_end_pages = 0;

		if (pgchanged)
			WriteBuffer(buf);
		else
			ReleaseBuffer(buf);
	}

	/* save stats in the rel list for use later */
	vacrelstats->rel_tuples = num_tuples;
	vacrelstats->rel_pages = nblocks;
	if (num_tuples == 0)
		min_tlen = max_tlen = 0;
	vacrelstats->min_tlen = min_tlen;
	vacrelstats->max_tlen = max_tlen;

	vacuum_pages->empty_end_pages = empty_end_pages;
	fraged_pages->empty_end_pages = empty_end_pages;

	/*
	 * Clear the fraged_pages list if we found we couldn't shrink.  Else,
	 * remove any "empty" end-pages from the list, and compute usable free
	 * space = free space in remaining pages.
	 */
	if (do_shrinking)
	{
		int			i;

		Assert((BlockNumber) fraged_pages->num_pages >= empty_end_pages);
		fraged_pages->num_pages -= empty_end_pages;
		usable_free_space = 0;
		for (i = 0; i < fraged_pages->num_pages; i++)
			usable_free_space += fraged_pages->pagedesc[i]->free;
	}
	else
	{
		fraged_pages->num_pages = 0;
		usable_free_space = 0;
	}

	/* don't bother to save vtlinks if we will not call repair_frag */
	if (fraged_pages->num_pages > 0 && num_vtlinks > 0)
	{
		qsort((char *) vtlinks, num_vtlinks, sizeof(VTupleLinkData),
			  vac_cmp_vtlinks);
		vacrelstats->vtlinks = vtlinks;
		vacrelstats->num_vtlinks = num_vtlinks;
	}
	else
	{
		vacrelstats->vtlinks = NULL;
		vacrelstats->num_vtlinks = 0;
		pfree(vtlinks);
	}

	ereport(elevel,
			(errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
					RelationGetRelationName(onerel),
					tups_vacuumed, num_tuples, nblocks),
			 errdetail("%.0f dead row versions cannot be removed yet.\n"
					   "Nonremovable row versions range from %lu to %lu bytes long.\n"
					   "There were %.0f unused item pointers.\n"
					   "Total free space (including removable row versions) is %.0f bytes.\n"
					   "%u pages are or will become empty, including %u at the end of the table.\n"
					   "%u pages containing %.0f free bytes are potential move destinations.\n"
					   "%s.",
					   nkeep,
					   (unsigned long) min_tlen, (unsigned long) max_tlen,
					   nunused,
					   free_space,
					   empty_pages, empty_end_pages,
					   fraged_pages->num_pages, usable_free_space,
					   vac_show_rusage(&ru0))));
}
/*
 *	repair_frag() -- try to repair relation's fragmentation
 *
 *		This routine marks dead tuples as unused and tries to re-use dead
 *		space by moving tuples (and inserting index entries if needed).  It
 *		constructs Nvacpagelist, a list of freed pages (i.e. of moved
 *		tuples), and cleans their index entries after committing the current
 *		transaction (in a hackish manner, without losing locks and freeing
 *		memory!).  It truncates the relation if some end-blocks have become
 *		empty.
 */
static void
repair_frag(VRelStats *vacrelstats, Relation onerel,
			VacPageList vacuum_pages, VacPageList fraged_pages,
			int nindexes, Relation *Irel)
{
	TransactionId myXID = GetCurrentTransactionId();
	Buffer		dst_buffer = InvalidBuffer;
	BlockNumber nblocks,
				blkno;
	BlockNumber last_move_dest_block = 0,
				last_vacuum_block;
	Page		dst_page = NULL;
	ExecContextData ec;
	VacPageListData Nvacpagelist;
	VacPage		dst_vacpage = NULL,
				last_vacuum_page,
				vacpage;
	int			i;
	int			num_moved = 0,
				num_fraged_pages,
				vacuumed_pages;
	int			keep_tuples = 0;
	VacRUsage	ru0;

	vac_init_rusage(&ru0);

	ExecContext_Init(&ec, onerel);

	Nvacpagelist.num_pages = 0;
	num_fraged_pages = fraged_pages->num_pages;
	Assert((BlockNumber) vacuum_pages->num_pages >= vacuum_pages->empty_end_pages);
	vacuumed_pages = vacuum_pages->num_pages - vacuum_pages->empty_end_pages;
	if (vacuumed_pages > 0)
	{
		/* get last reaped page from vacuum_pages */
		last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1];
		last_vacuum_block = last_vacuum_page->blkno;
	}
	else
	{
		last_vacuum_page = NULL;
		last_vacuum_block = InvalidBlockNumber;
	}

	vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber));
	vacpage->offsets_used = vacpage->offsets_free = 0;

	/*
	 * Scan pages backwards from the last nonempty page, trying to move
	 * tuples down to lower pages.  Quit when we reach a page that we have
	 * moved any tuples onto, or the first page if we haven't moved
	 * anything, or when we find a page we cannot completely empty (this
	 * last condition is handled by "break" statements within the loop).
	 *
	 * NB: this code depends on the vacuum_pages and fraged_pages lists being
	 * in order by blkno.
	 */
	nblocks = vacrelstats->rel_pages;
	for (blkno = nblocks - vacuum_pages->empty_end_pages - 1;
		 blkno > last_move_dest_block;
		 blkno--)
	{
		Buffer		buf;
		Page		page;
		bool		isempty,
					dowrite,
					chain_tuple_moved;
		OffsetNumber offnum,
					maxoff;

		vacuum_delay_point();

		/*
		 * Forget fraged_pages pages at or after this one; they're no
		 * longer useful as move targets, since we only want to move down.
		 * Note that since we stop the outer loop at last_move_dest_block,
		 * pages removed here cannot have had anything moved onto them
		 * already.
		 *
		 * Also note that we don't change the stored fraged_pages list, only
		 * our local variable num_fraged_pages; so the forgotten pages are
		 * still available to be loaded into the free space map later.
		 */
		while (num_fraged_pages > 0 &&
			   fraged_pages->pagedesc[num_fraged_pages - 1]->blkno >= blkno)
		{
			Assert(fraged_pages->pagedesc[num_fraged_pages - 1]->offsets_used == 0);
			--num_fraged_pages;
		}

		/*
		 * Process this page of relation.
		 */
		buf = ReadBuffer(onerel, blkno);
		page = BufferGetPage(buf);

		vacpage->offsets_free = 0;

		isempty = PageIsEmpty(page);

		dowrite = false;
		/* Is the page in the vacuum_pages list? */
		if (blkno == last_vacuum_block)
		{
			if (last_vacuum_page->offsets_free > 0)
			{
				/* there are dead tuples on this page - clean them */
				Assert(!isempty);
				LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
				vacuum_page(onerel, buf, last_vacuum_page);
				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
				dowrite = true;
			}
			else
				Assert(isempty);
			--vacuumed_pages;
			if (vacuumed_pages > 0)
			{
				/* get prev reaped page from vacuum_pages */
				last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1];
				last_vacuum_block = last_vacuum_page->blkno;
			}
			else
			{
				last_vacuum_page = NULL;
				last_vacuum_block = InvalidBlockNumber;
			}
			if (isempty)
			{
				ReleaseBuffer(buf);
				continue;
			}
		}
		else
			Assert(!isempty);

		chain_tuple_moved = false;	/* no chain tuples have been moved off
									 * this page yet */
		vacpage->blkno = blkno;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			Size		tuple_len;
			HeapTupleData tuple;
			ItemId		itemid = PageGetItemId(page, offnum);

			if (!ItemIdIsUsed(itemid))
				continue;

			tuple.t_datamcxt = NULL;
			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple_len = tuple.t_len = ItemIdGetLength(itemid);
			ItemPointerSet(&(tuple.t_self), blkno, offnum);
			/*
			 * VACUUM FULL has an exclusive lock on the relation.  So
			 * normally no other transaction can have pending INSERTs or
			 * DELETEs in this relation.  A tuple is either:
			 *		(a) a tuple in a system catalog, inserted or deleted
			 *			by a not yet committed transaction
			 *		(b) known dead (XMIN_INVALID, or XMAX_COMMITTED and xmax
			 *			is visible to all active transactions)
			 *		(c) inserted by a committed xact (XMIN_COMMITTED)
			 *		(d) moved by the currently running VACUUM
			 *		(e) deleted (XMAX_COMMITTED) but at least one active
			 *			transaction does not see the deleting transaction
			 * In case (a) we wouldn't be in repair_frag() at all.
			 * In case (b) we cannot be here, because scan_heap() has
			 * already marked the item as unused, see continue above.  Case
			 * (c) is what normally is to be expected.  Case (d) is only
			 * possible if a whole tuple chain has been moved while
			 * processing this or a higher numbered block.
			 */
			if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
			{
				if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
					elog(ERROR, "HEAP_MOVED_IN was not expected");
				if (!(tuple.t_data->t_infomask & HEAP_MOVED_OFF))
					elog(ERROR, "HEAP_MOVED_OFF was expected");

				/*
				 * MOVED_OFF by another VACUUM would have caused the
				 * visibility check to set XMIN_COMMITTED or XMIN_INVALID.
				 */
				if (HeapTupleHeaderGetXvac(tuple.t_data) != myXID)
					elog(ERROR, "invalid XVAC in tuple header");

				/*
				 * If this (chain) tuple has already been moved by me, I
				 * have to check whether it is in vacpage or not, i.e.
				 * whether it was moved while cleaning this page or some
				 * previous one.
				 */

				/* Can't we Assert(keep_tuples > 0) here? */
				if (keep_tuples == 0)
					continue;
				if (chain_tuple_moved)
				{
					/* some chains were moved while cleaning this page */
					Assert(vacpage->offsets_free > 0);
					for (i = 0; i < vacpage->offsets_free; i++)
					{
						if (vacpage->offsets[i] == offnum)
							break;
					}
					if (i >= vacpage->offsets_free)		/* not found */
					{
						vacpage->offsets[vacpage->offsets_free++] = offnum;
						keep_tuples--;
					}
				}
				else
				{
					vacpage->offsets[vacpage->offsets_free++] = offnum;
					keep_tuples--;
				}
				continue;
			}
			/*
			 * If this tuple is in a chain of tuples created by updates
			 * of "recent" transactions, then we have to move the whole
			 * chain of tuples to other places.
			 *
			 * NOTE: this test is not 100% accurate: it is possible for a
			 * tuple to be an updated one with recent xmin, and yet not
			 * have a corresponding tuple in the vtlinks list.  Presumably
			 * there was once a parent tuple with xmax matching the xmin,
			 * but it's possible that that tuple has been removed --- for
			 * example, if it had xmin = xmax then
			 * HeapTupleSatisfiesVacuum would deem it removable as soon as
			 * the xmin xact completes.
			 *
			 * To be on the safe side, we abandon the repair_frag process if
			 * we cannot find the parent tuple in vtlinks.  This may be
			 * overly conservative; AFAICS it would be safe to move the
			 * chain.
			 */
			if (((tuple.t_data->t_infomask & HEAP_UPDATED) &&
				 !TransactionIdPrecedes(HeapTupleHeaderGetXmin(tuple.t_data),
										OldestXmin)) ||
				(!(tuple.t_data->t_infomask & (HEAP_XMAX_INVALID |
											   HEAP_IS_LOCKED)) &&
				 !(ItemPointerEquals(&(tuple.t_self),
									 &(tuple.t_data->t_ctid)))))
			{
				Buffer		Cbuf = buf;
				bool		freeCbuf = false;
				bool		chain_move_failed = false;
				ItemPointerData Ctid;
				HeapTupleData tp = tuple;
				Size		tlen = tuple_len;
				VTupleMove	vtmove;
				int			num_vtmove;
				int			free_vtmove;
				VacPage		to_vacpage = NULL;
				int			to_item = 0;
				int			ti;

				if (dst_buffer != InvalidBuffer)
				{
					WriteBuffer(dst_buffer);
					dst_buffer = InvalidBuffer;
				}

				/* Quick exit if we have no vtlinks to search in */
				if (vacrelstats->vtlinks == NULL)
				{
					elog(DEBUG2, "parent item in update-chain not found --- can't continue repair_frag");
					break;		/* out of walk-along-page loop */
				}

				vtmove = (VTupleMove) palloc(100 * sizeof(VTupleMoveData));
				num_vtmove = 0;
				free_vtmove = 100;
				/*
				 * If this tuple is at the beginning or middle of the
				 * chain, then we have to walk to the end of the chain.
				 */
				while (!(tp.t_data->t_infomask & (HEAP_XMAX_INVALID |
												  HEAP_IS_LOCKED)) &&
					   !(ItemPointerEquals(&(tp.t_self),
										   &(tp.t_data->t_ctid))))
				{
					Page		Cpage;
					ItemId		Citemid;
					ItemPointerData Ctid;

					Ctid = tp.t_data->t_ctid;
					if (freeCbuf)
						ReleaseBuffer(Cbuf);
					freeCbuf = true;
					Cbuf = ReadBuffer(onerel,
									  ItemPointerGetBlockNumber(&Ctid));
					Cpage = BufferGetPage(Cbuf);
					Citemid = PageGetItemId(Cpage,
									ItemPointerGetOffsetNumber(&Ctid));
					if (!ItemIdIsUsed(Citemid))
					{
						/*
						 * This means that in the middle of the chain
						 * there was a tuple updated by an older (than
						 * OldestXmin) xact, and that tuple has already
						 * been deleted by me.  Actually, the upper part
						 * of the chain should be removed; it seems that
						 * this ought to be handled in scan_heap(), but
						 * it's not implemented at the moment, so we just
						 * stop shrinking here.
						 */
						elog(DEBUG2, "child itemid in update-chain marked as unused --- can't continue repair_frag");
						chain_move_failed = true;
						break;	/* out of loop to move to chain end */
					}
					tp.t_datamcxt = NULL;
					tp.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
					tp.t_self = Ctid;
					tlen = tp.t_len = ItemIdGetLength(Citemid);
				}
				if (chain_move_failed)
				{
					if (freeCbuf)
						ReleaseBuffer(Cbuf);
					pfree(vtmove);
					break;		/* out of walk-along-page loop */
				}
				/*
				 * Check if all items in chain can be moved
				 */
				for (;;)
				{
					Buffer		Pbuf;
					Page		Ppage;
					ItemId		Pitemid;
					HeapTupleData Ptp;
					VTupleLinkData vtld,
							   *vtlp;

					if (to_vacpage == NULL ||
						!enough_space(to_vacpage, tlen))
					{
						for (i = 0; i < num_fraged_pages; i++)
						{
							if (enough_space(fraged_pages->pagedesc[i], tlen))
								break;
						}

						if (i == num_fraged_pages)
						{
							/* can't move item anywhere */
							chain_move_failed = true;
							break;	/* out of check-all-items loop */
						}
						to_item = i;
						to_vacpage = fraged_pages->pagedesc[to_item];
					}
					to_vacpage->free -= MAXALIGN(tlen);
					if (to_vacpage->offsets_used >= to_vacpage->offsets_free)
						to_vacpage->free -= sizeof(ItemIdData);
					(to_vacpage->offsets_used)++;
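					/*
					 * Free-space bookkeeping: the destination page is
					 * charged for the MAXALIGN'd tuple body, plus a line
					 * pointer unless one of the page's freed item slots
					 * can be recycled (offsets_used < offsets_free).
					 */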
					if (free_vtmove == 0)
					{
						free_vtmove = 1000;
						vtmove = (VTupleMove)
							repalloc(vtmove,
									 (free_vtmove + num_vtmove) *
									 sizeof(VTupleMoveData));
					}
					vtmove[num_vtmove].tid = tp.t_self;
					vtmove[num_vtmove].vacpage = to_vacpage;
					if (to_vacpage->offsets_used == 1)
						vtmove[num_vtmove].cleanVpd = true;
					else
						vtmove[num_vtmove].cleanVpd = false;
					free_vtmove--;
					num_vtmove++;

					/* At beginning of chain? */
					if (!(tp.t_data->t_infomask & HEAP_UPDATED) ||
						TransactionIdPrecedes(HeapTupleHeaderGetXmin(tp.t_data),
											  OldestXmin))
						break;

					/* No, move to tuple with prior row version */
					vtld.new_tid = tp.t_self;
					vtlp = (VTupleLink)
						vac_bsearch((void *) &vtld,
									(void *) (vacrelstats->vtlinks),
									vacrelstats->num_vtlinks,
									sizeof(VTupleLinkData),
									vac_cmp_vtlinks);
					if (vtlp == NULL)
					{
						/* see discussion above */
						elog(DEBUG2, "parent item in update-chain not found --- can't continue repair_frag");
						chain_move_failed = true;
						break;	/* out of check-all-items loop */
					}
					tp.t_self = vtlp->this_tid;
					Pbuf = ReadBuffer(onerel,
								ItemPointerGetBlockNumber(&(tp.t_self)));
					Ppage = BufferGetPage(Pbuf);
					Pitemid = PageGetItemId(Ppage,
							   ItemPointerGetOffsetNumber(&(tp.t_self)));
					/* this can't happen since we saw tuple earlier: */
					if (!ItemIdIsUsed(Pitemid))
						elog(ERROR, "parent itemid marked as unused");
					Ptp.t_datamcxt = NULL;
					Ptp.t_data = (HeapTupleHeader) PageGetItem(Ppage, Pitemid);

					/* ctid should not have changed since we saved it */
					Assert(ItemPointerEquals(&(vtld.new_tid),
											 &(Ptp.t_data->t_ctid)));
					/*
					 * Read above about the case of !ItemIdIsUsed(Citemid)
					 * (the child item having been removed)...  Because we
					 * don't currently remove the useless part of an
					 * update-chain, it's possible to find a too-old
					 * parent row here.  As in the case that caused this
					 * problem, we stop shrinking here.  I could try to
					 * find the real parent row, but I don't want to,
					 * because the real solution will be implemented
					 * later anyway, and we are too close to the 6.5
					 * release. - vadim
					 */
					if (Ptp.t_data->t_infomask & HEAP_XMAX_IS_MULTI ||
						!(TransactionIdEquals(HeapTupleHeaderGetXmax(Ptp.t_data),
										 HeapTupleHeaderGetXmin(tp.t_data))))
					{
						ReleaseBuffer(Pbuf);
						elog(DEBUG2, "too old parent tuple found --- can't continue repair_frag");
						chain_move_failed = true;
						break;	/* out of check-all-items loop */
					}
					tp.t_datamcxt = Ptp.t_datamcxt;
					tp.t_data = Ptp.t_data;
					tlen = tp.t_len = ItemIdGetLength(Pitemid);
					if (freeCbuf)
						ReleaseBuffer(Cbuf);
					Cbuf = Pbuf;
					freeCbuf = true;
				}				/* end of check-all-items loop */

				if (freeCbuf)
					ReleaseBuffer(Cbuf);
				freeCbuf = false;

				if (chain_move_failed)
				{
					/*
					 * Undo changes to offsets_used state.  We don't
					 * bother cleaning up the amount-free state, since
					 * we're not going to do any further tuple motion.
					 */
					for (i = 0; i < num_vtmove; i++)
					{
						Assert(vtmove[i].vacpage->offsets_used > 0);
						(vtmove[i].vacpage->offsets_used)--;
					}
					pfree(vtmove);
					break;		/* out of walk-along-page loop */
				}
				/*
				 * Okay, move the whole tuple chain
				 */
				ItemPointerSetInvalid(&Ctid);
				for (ti = 0; ti < num_vtmove; ti++)
				{
					VacPage		destvacpage = vtmove[ti].vacpage;
					Page		Cpage;
					ItemId		Citemid;

					/* Get page to move from */
					tuple.t_self = vtmove[ti].tid;
					Cbuf = ReadBuffer(onerel,
							 ItemPointerGetBlockNumber(&(tuple.t_self)));

					/* Get page to move to */
					dst_buffer = ReadBuffer(onerel, destvacpage->blkno);

					LockBuffer(dst_buffer, BUFFER_LOCK_EXCLUSIVE);
					if (dst_buffer != Cbuf)
						LockBuffer(Cbuf, BUFFER_LOCK_EXCLUSIVE);

					dst_page = BufferGetPage(dst_buffer);
					Cpage = BufferGetPage(Cbuf);

					Citemid = PageGetItemId(Cpage,
							ItemPointerGetOffsetNumber(&(tuple.t_self)));
					tuple.t_datamcxt = NULL;
					tuple.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
					tuple_len = tuple.t_len = ItemIdGetLength(Citemid);
					/*
					 * make a copy of the source tuple, and then mark the
					 * source tuple MOVED_OFF.
					 */
					move_chain_tuple(onerel, Cbuf, Cpage, &tuple,
									 dst_buffer, dst_page, destvacpage,
									 &ec, &Ctid, vtmove[ti].cleanVpd);
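					/*
					 * Note: the chain is moved from newest to oldest
					 * version, and Ctid is threaded through the loop; it
					 * carries the new TID of the previously-moved (newer)
					 * tuple, so that each older copy's t_ctid can be
					 * re-linked to its successor's new location.
					 */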
2087 if (destvacpage->blkno > last_move_dest_block)
2088 last_move_dest_block = destvacpage->blkno;
2091 * Remember that we moved tuple from the current page
2092 * (corresponding index tuple will be cleaned).
2095 vacpage->offsets[vacpage->offsets_free++] =
2096 ItemPointerGetOffsetNumber(&(tuple.t_self));
2100 WriteBuffer(dst_buffer);
2102 } /* end of move-the-tuple-chain loop */
2104 dst_buffer = InvalidBuffer;
2106 chain_tuple_moved = true;
2108 /* advance to next tuple in walk-along-page loop */
2110 } /* end of is-tuple-in-chain test */
2112 /* try to find new page for this tuple */
2113 if (dst_buffer == InvalidBuffer ||
2114 !enough_space(dst_vacpage, tuple_len))
2116 if (dst_buffer != InvalidBuffer)
2118 WriteBuffer(dst_buffer);
2119 dst_buffer = InvalidBuffer;
2121 for (i = 0; i < num_fraged_pages; i++)
2123 if (enough_space(fraged_pages->pagedesc[i], tuple_len))
2124 break;
2126 if (i == num_fraged_pages)
2127 break; /* can't move item anywhere */
2128 dst_vacpage = fraged_pages->pagedesc[i];
2129 dst_buffer = ReadBuffer(onerel, dst_vacpage->blkno);
2130 LockBuffer(dst_buffer, BUFFER_LOCK_EXCLUSIVE);
2131 dst_page = BufferGetPage(dst_buffer);
2132 /* if this page was not used before - clean it */
2133 if (!PageIsEmpty(dst_page) && dst_vacpage->offsets_used == 0)
2134 vacuum_page(onerel, dst_buffer, dst_vacpage);
2136 else
2137 LockBuffer(dst_buffer, BUFFER_LOCK_EXCLUSIVE);
2139 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2141 move_plain_tuple(onerel, buf, page, &tuple,
2142 dst_buffer, dst_page, dst_vacpage, &ec);
2144 num_moved++;
2146 if (dst_vacpage->blkno > last_move_dest_block)
2147 last_move_dest_block = dst_vacpage->blkno;
2150 * Remember that we moved a tuple from the current page
2151 * (the corresponding index tuple will be cleaned).
2153 vacpage->offsets[vacpage->offsets_free++] = offnum;
2154 } /* walk along page */
2157 * If we broke out of the walk-along-page loop early (ie, still
2158 * have offnum <= maxoff), then we failed to move some tuple off
2159 * this page. No point in shrinking any more, so clean up and
2160 * exit the per-page loop.
2162 if (offnum < maxoff && keep_tuples > 0)
2167 * Fix vacpage state for any unvisited tuples remaining on the page.
2170 for (off = OffsetNumberNext(offnum);
2171 off <= maxoff;
2172 off = OffsetNumberNext(off))
2174 ItemId itemid = PageGetItemId(page, off);
2175 HeapTupleHeader htup;
2177 if (!ItemIdIsUsed(itemid))
2178 continue;
2179 htup = (HeapTupleHeader) PageGetItem(page, itemid);
2180 if (htup->t_infomask & HEAP_XMIN_COMMITTED)
2181 continue;
2184 * See comments in the walk-along-page loop above about
2185 * why only MOVED_OFF tuples should be found here.
2187 if (htup->t_infomask & HEAP_MOVED_IN)
2188 elog(ERROR, "HEAP_MOVED_IN was not expected");
2189 if (!(htup->t_infomask & HEAP_MOVED_OFF))
2190 elog(ERROR, "HEAP_MOVED_OFF was expected");
2191 if (HeapTupleHeaderGetXvac(htup) != myXID)
2192 elog(ERROR, "invalid XVAC in tuple header");
2194 if (chain_tuple_moved)
2196 /* some chains were moved while cleaning this page */
2197 Assert(vacpage->offsets_free > 0);
2198 for (i = 0; i < vacpage->offsets_free; i++)
2200 if (vacpage->offsets[i] == off)
2201 break;
2203 if (i >= vacpage->offsets_free) /* not found */
2205 vacpage->offsets[vacpage->offsets_free++] = off;
2206 Assert(keep_tuples > 0);
2207 keep_tuples--;
2210 else
2212 vacpage->offsets[vacpage->offsets_free++] = off;
2213 Assert(keep_tuples > 0);
2214 keep_tuples--;
2219 if (vacpage->offsets_free > 0) /* some tuples were moved */
2221 if (chain_tuple_moved) /* else - they are ordered */
2223 qsort((char *) (vacpage->offsets), vacpage->offsets_free,
2224 sizeof(OffsetNumber), vac_cmp_offno);
2226 vpage_insert(&Nvacpagelist, copy_vac_page(vacpage));
2234 if (offnum <= maxoff)
2235 break; /* had to quit early, see above note */
2237 } /* walk along relation */
2239 blkno++; /* new number of blocks */
2241 if (dst_buffer != InvalidBuffer)
2243 Assert(num_moved > 0);
2244 WriteBuffer(dst_buffer);
2250 * We have to commit our tuple moves before we truncate the
2251 * relation. Ideally we should do Commit/StartTransactionCommand
2252 * here, relying on the session-level table lock to protect our
2253 * exclusive access to the relation. However, that would require
2254 * a lot of extra code to close and re-open the relation, indexes,
2255 * etc. For now, a quick hack: record status of current
2256 * transaction as committed, and continue.
2258 RecordTransactionCommit();
2262 * We are not going to move any more tuples across pages, but we still
2263 * need to apply vacuum_page to compact free space in the remaining
2264 * pages in vacuum_pages list. Note that some of these pages may also
2265 * be in the fraged_pages list, and may have had tuples moved onto
2266 * them; if so, we already did vacuum_page and needn't do it again.
2268 for (i = 0, curpage = vacuum_pages->pagedesc;
2272 vacuum_delay_point();
2274 Assert((*curpage)->blkno < blkno);
2275 if ((*curpage)->offsets_used == 0)
2280 /* this page was not used as a move target, so must clean it */
2281 buf = ReadBuffer(onerel, (*curpage)->blkno);
2282 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2283 page = BufferGetPage(buf);
2284 if (!PageIsEmpty(page))
2285 vacuum_page(onerel, buf, *curpage);
2286 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
2292 * Now scan all the pages that we moved tuples onto and update tuple
2293 * status bits. This is not really necessary, but will save time for
2294 * future transactions examining these tuples.
2296 update_hint_bits(onerel, fraged_pages, num_fraged_pages,
2297 last_move_dest_block, num_moved);
2300 * It'd be cleaner to make this report at the bottom of this routine,
2301 * but then the rusage would double-count the second pass of index
2302 * vacuuming. So do it here and ignore the relatively small amount of
2303 * processing that occurs below.
2305 ereport(elevel,
2306 (errmsg("\"%s\": moved %u row versions, truncated %u to %u pages",
2307 RelationGetRelationName(onerel),
2308 num_moved, nblocks, blkno),
2309 errdetail("%s.",
2310 vac_show_rusage(&ru0))));
2313 * Reflect the motion of system tuples to catalog cache here.
2315 CommandCounterIncrement();
2317 if (Nvacpagelist.num_pages > 0)
2319 /* vacuum indexes again if needed */
2326 /* re-sort Nvacpagelist.pagedesc */
2327 for (vpleft = Nvacpagelist.pagedesc,
2328 vpright = Nvacpagelist.pagedesc + Nvacpagelist.num_pages - 1;
2329 vpleft < vpright; vpleft++, vpright--)
2331 vtmp = *vpleft;
2332 *vpleft = *vpright;
2333 *vpright = vtmp;
2337 * keep_tuples is the number of tuples that have been moved
2338 * off a page during chain moves but not been scanned over
2339 * subsequently. The tuple ids of these tuples are not
2340 * recorded as free offsets for any VacPage, so they will not
2341 * be cleared from the indexes.
2343 Assert(keep_tuples >= 0);
2344 for (i = 0; i < nindexes; i++)
2345 vacuum_index(&Nvacpagelist, Irel[i],
2346 vacrelstats->rel_tuples, keep_tuples);
2350 * Clean moved-off tuples from the last page in the Nvacpagelist list.
2352 * We need only do this for this one page, because higher-numbered
2353 * pages are going to be truncated from the relation entirely.
2354 * But see comments for update_hint_bits().
2356 if (vacpage->blkno == (blkno - 1) &&
2357 vacpage->offsets_free > 0)
2361 OffsetNumber unused[BLCKSZ / sizeof(OffsetNumber)];
2362 OffsetNumber offnum,
2363 maxoff;
2367 buf = ReadBuffer(onerel, vacpage->blkno);
2368 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2369 page = BufferGetPage(buf);
2370 maxoff = PageGetMaxOffsetNumber(page);
2371 for (offnum = FirstOffsetNumber;
2372 offnum <= maxoff;
2373 offnum = OffsetNumberNext(offnum))
2375 ItemId itemid = PageGetItemId(page, offnum);
2376 HeapTupleHeader htup;
2378 if (!ItemIdIsUsed(itemid))
2379 continue;
2380 htup = (HeapTupleHeader) PageGetItem(page, itemid);
2381 if (htup->t_infomask & HEAP_XMIN_COMMITTED)
2382 continue;
2385 * See comments in the walk-along-page loop above about
2386 * why only MOVED_OFF tuples should be found here.
2388 if (htup->t_infomask & HEAP_MOVED_IN)
2389 elog(ERROR, "HEAP_MOVED_IN was not expected");
2390 if (!(htup->t_infomask & HEAP_MOVED_OFF))
2391 elog(ERROR, "HEAP_MOVED_OFF was expected");
2392 if (HeapTupleHeaderGetXvac(htup) != myXID)
2393 elog(ERROR, "invalid XVAC in tuple header");
2395 itemid->lp_flags &= ~LP_USED;
2396 num_tuples++;
2398 Assert(vacpage->offsets_free == num_tuples);
2400 START_CRIT_SECTION();
2402 uncnt = PageRepairFragmentation(page, unused);
2405 if (!onerel->rd_istemp)
2409 recptr = log_heap_clean(onerel, buf, unused, uncnt);
2410 PageSetLSN(page, recptr);
2411 PageSetTLI(page, ThisTimeLineID);
2416 * No XLOG record, but still need to flag that XID exists on disk
2419 MyXactMadeTempRelUpdate = true;
2424 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
2428 /* now - free new list of reaped pages */
2429 curpage = Nvacpagelist.pagedesc;
2430 for (i = 0; i < Nvacpagelist.num_pages; i++, curpage++)
2431 pfree(*curpage);
2432 pfree(Nvacpagelist.pagedesc);
2435 /* Truncate relation, if needed */
2436 if (blkno < nblocks)
2438 RelationTruncate(onerel, blkno);
2439 vacrelstats->rel_pages = blkno; /* set new number of blocks */
2444 if (vacrelstats->vtlinks != NULL)
2445 pfree(vacrelstats->vtlinks);
2447 ExecContext_Finish(&ec);
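/*
 * Illustrative sketch (not part of the original file): the update-chain
 * link test used in the chain-walking code above.  A parent tuple and a
 * child tuple are adjacent members of the same update chain only if the
 * parent's xmax equals the child's xmin; a MultiXact xmax (shared row
 * locks) means the link cannot be followed this way.  The helper name is
 * hypothetical.
 */
#ifdef NOT_USED
static bool
chain_link_is_intact(HeapTupleHeader parent, HeapTupleHeader child)
{
	if (parent->t_infomask & HEAP_XMAX_IS_MULTI)
		return false;			/* xmax is a MultiXactId, not a plain XID */
	return TransactionIdEquals(HeapTupleHeaderGetXmax(parent),
							   HeapTupleHeaderGetXmin(child));
}
#endif   /* NOT_USED */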
2451 * move_chain_tuple() -- move one tuple that is part of a tuple chain
2453 * This routine moves old_tup from old_page to dst_page.
2454 * old_page and dst_page might be the same page.
2455 * On entry old_buf and dst_buf are locked exclusively; both locks (or
2456 * the single lock, if this is an intra-page move) are released before exit.
2459 * Yes, a routine with ten parameters is ugly, but it's still better
2460 * than having these 120 lines of code in repair_frag(), which is
2461 * already too long and almost unreadable.
2464 move_chain_tuple(Relation rel,
2465 Buffer old_buf, Page old_page, HeapTuple old_tup,
2466 Buffer dst_buf, Page dst_page, VacPage dst_vacpage,
2467 ExecContext ec, ItemPointer ctid, bool cleanVpd)
2469 TransactionId myXID = GetCurrentTransactionId();
2470 HeapTupleData newtup;
2471 OffsetNumber newoff;
2473 Size tuple_len = old_tup->t_len;
2475 heap_copytuple_with_tuple(old_tup, &newtup);
2478 * register invalidation of source tuple in catcaches.
2480 CacheInvalidateHeapTuple(rel, old_tup);
2482 /* NO EREPORT(ERROR) TILL CHANGES ARE LOGGED */
2483 START_CRIT_SECTION();
2485 old_tup->t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
2486 HEAP_XMIN_INVALID |
2487 HEAP_MOVED_IN);
2488 old_tup->t_data->t_infomask |= HEAP_MOVED_OFF;
2489 HeapTupleHeaderSetXvac(old_tup->t_data, myXID);
2492 * If this page was not used before - clean it.
2494 * NOTE: a nasty bug used to lurk here. It is possible for the source
2495 * and destination pages to be the same (since this tuple-chain member
2496 * can be on a page lower than the one we're currently processing in
2497 * the outer loop). If that's true, then after vacuum_page() the
2498 * source tuple will have been moved, and tuple.t_data will be
2499 * pointing at garbage. Therefore we must do everything that uses
2500 * old_tup->t_data BEFORE this step!!
2502 * This path is different from the other callers of vacuum_page, because
2503 * we have already incremented the vacpage's offsets_used field to
2504 * account for the tuple(s) we expect to move onto the page. Therefore
2505 * vacuum_page's check for offsets_used == 0 is wrong. But since
2506 * that's a good debugging check for all other callers, we work around
2507 * it here rather than remove it.
2509 if (!PageIsEmpty(dst_page) && cleanVpd)
2511 int sv_offsets_used = dst_vacpage->offsets_used;
2513 dst_vacpage->offsets_used = 0;
2514 vacuum_page(rel, dst_buf, dst_vacpage);
2515 dst_vacpage->offsets_used = sv_offsets_used;
2519 * Update the state of the copied tuple, and store it on the
2520 * destination page.
2522 newtup.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
2523 HEAP_XMIN_INVALID |
2524 HEAP_MOVED_OFF);
2525 newtup.t_data->t_infomask |= HEAP_MOVED_IN;
2526 HeapTupleHeaderSetXvac(newtup.t_data, myXID);
2527 newoff = PageAddItem(dst_page, (Item) newtup.t_data, tuple_len,
2528 InvalidOffsetNumber, LP_USED);
2529 if (newoff == InvalidOffsetNumber)
2531 elog(PANIC, "failed to add item with len = %lu to page %u while moving tuple chain",
2532 (unsigned long) tuple_len, dst_vacpage->blkno);
2534 newitemid = PageGetItemId(dst_page, newoff);
2535 pfree(newtup.t_data);
2536 newtup.t_datamcxt = NULL;
2537 newtup.t_data = (HeapTupleHeader) PageGetItem(dst_page, newitemid);
2538 ItemPointerSet(&(newtup.t_self), dst_vacpage->blkno, newoff);
2541 if (!rel->rd_istemp)
2543 XLogRecPtr recptr = log_heap_move(rel, old_buf, old_tup->t_self,
2544 dst_buf, &newtup);
2546 if (old_buf != dst_buf)
2548 PageSetLSN(old_page, recptr);
2549 PageSetTLI(old_page, ThisTimeLineID);
2551 PageSetLSN(dst_page, recptr);
2552 PageSetTLI(dst_page, ThisTimeLineID);
2557 * No XLOG record, but still need to flag that XID exists on disk
2559 MyXactMadeTempRelUpdate = true;
2565 * Set the new tuple's t_ctid to point to itself if it is the last
2566 * tuple in the chain, and to the next tuple in the chain otherwise.
2568 /* Is this ok after log_heap_move() and END_CRIT_SECTION()? */
2569 if (!ItemPointerIsValid(ctid))
2570 newtup.t_data->t_ctid = newtup.t_self;
2572 newtup.t_data->t_ctid = *ctid;
2573 *ctid = newtup.t_self;
2575 LockBuffer(dst_buf, BUFFER_LOCK_UNLOCK);
2576 if (dst_buf != old_buf)
2577 LockBuffer(old_buf, BUFFER_LOCK_UNLOCK);
2579 /* Create index entries for the moved tuple */
2580 if (ec->resultRelInfo->ri_NumIndices > 0)
2582 ExecStoreTuple(&newtup, ec->slot, InvalidBuffer, false);
2583 ExecInsertIndexTuples(ec->slot, &(newtup.t_self), ec->estate, true);
2584 ResetPerTupleExprContext(ec->estate);
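/*
 * Illustrative sketch (not part of the original file): how the ctid
 * argument threads a moved chain back together.  Chain members are moved
 * starting with the newest; the first call sees an invalid ctid and makes
 * the moved tuple point at itself, and each subsequent call points the
 * (older) tuple at the member moved just before it.  The helper name is
 * hypothetical.
 */
#ifdef NOT_USED
static void
link_moved_chain_member(HeapTuple newtup, ItemPointer ctid)
{
	if (!ItemPointerIsValid(ctid))
		newtup->t_data->t_ctid = newtup->t_self;	/* chain tail */
	else
		newtup->t_data->t_ctid = *ctid;		/* point at successor */
	*ctid = newtup->t_self;		/* remembered for the next, older member */
}
#endif   /* NOT_USED */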
2589 * move_plain_tuple() -- move one tuple that is not part of a chain
2591 * This routine moves old_tup from old_page to dst_page.
2592 * On entry old_buf and dst_buf are locked exclusively, both locks are
2593 * released before exit.
2595 * Yes, a routine with eight parameters is ugly, but it's still better
2596 * than having these 90 lines of code in repair_frag(), which is already
2597 * too long and almost unreadable.
2600 move_plain_tuple(Relation rel,
2601 Buffer old_buf, Page old_page, HeapTuple old_tup,
2602 Buffer dst_buf, Page dst_page, VacPage dst_vacpage,
2605 TransactionId myXID = GetCurrentTransactionId();
2606 HeapTupleData newtup;
2607 OffsetNumber newoff;
2609 Size tuple_len = old_tup->t_len;
2612 heap_copytuple_with_tuple(old_tup, &newtup);
2615 * register invalidation of source tuple in catcaches.
2617 * (Note: we do not need to register the copied tuple, because we are not
2618 * changing the tuple contents and so there cannot be any need to
2619 * flush negative catcache entries.)
2621 CacheInvalidateHeapTuple(rel, old_tup);
2623 /* NO EREPORT(ERROR) TILL CHANGES ARE LOGGED */
2624 START_CRIT_SECTION();
2627 * Mark new tuple as MOVED_IN by me.
2629 newtup.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
2630 HEAP_XMIN_INVALID |
2631 HEAP_MOVED_OFF);
2632 newtup.t_data->t_infomask |= HEAP_MOVED_IN;
2633 HeapTupleHeaderSetXvac(newtup.t_data, myXID);
2635 /* add tuple to the page */
2636 newoff = PageAddItem(dst_page, (Item) newtup.t_data, tuple_len,
2637 InvalidOffsetNumber, LP_USED);
2638 if (newoff == InvalidOffsetNumber)
2640 elog(PANIC, "failed to add item with len = %lu to page %u (free space %lu, nusd %u, noff %u)",
2641 (unsigned long) tuple_len,
2642 dst_vacpage->blkno, (unsigned long) dst_vacpage->free,
2643 dst_vacpage->offsets_used, dst_vacpage->offsets_free);
2645 newitemid = PageGetItemId(dst_page, newoff);
2646 pfree(newtup.t_data);
2647 newtup.t_datamcxt = NULL;
2648 newtup.t_data = (HeapTupleHeader) PageGetItem(dst_page, newitemid);
2649 ItemPointerSet(&(newtup.t_data->t_ctid), dst_vacpage->blkno, newoff);
2650 newtup.t_self = newtup.t_data->t_ctid;
2653 * Mark old tuple as MOVED_OFF by me.
2655 old_tup->t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
2656 HEAP_XMIN_INVALID |
2657 HEAP_MOVED_IN);
2658 old_tup->t_data->t_infomask |= HEAP_MOVED_OFF;
2659 HeapTupleHeaderSetXvac(old_tup->t_data, myXID);
2662 if (!rel->rd_istemp)
2664 XLogRecPtr recptr = log_heap_move(rel, old_buf, old_tup->t_self,
2665 dst_buf, &newtup);
2667 PageSetLSN(old_page, recptr);
2668 PageSetTLI(old_page, ThisTimeLineID);
2669 PageSetLSN(dst_page, recptr);
2670 PageSetTLI(dst_page, ThisTimeLineID);
2675 * No XLOG record, but still need to flag that XID exists on disk
2677 MyXactMadeTempRelUpdate = true;
2682 dst_vacpage->free = ((PageHeader) dst_page)->pd_upper -
2683 ((PageHeader) dst_page)->pd_lower;
2684 LockBuffer(dst_buf, BUFFER_LOCK_UNLOCK);
2685 LockBuffer(old_buf, BUFFER_LOCK_UNLOCK);
2687 dst_vacpage->offsets_used++;
2689 /* insert index tuples if needed */
2690 if (ec->resultRelInfo->ri_NumIndices > 0)
2692 ExecStoreTuple(&newtup, ec->slot, InvalidBuffer, false);
2693 ExecInsertIndexTuples(ec->slot, &(newtup.t_self), ec->estate, true);
2694 ResetPerTupleExprContext(ec->estate);
2699 * update_hint_bits() -- update hint bits in destination pages
2701 * Scan all the pages that we moved tuples onto and update tuple status bits.
2702 * This is normally not really necessary, but it will save time for future
2703 * transactions examining these tuples.
2705 * This pass guarantees that all HEAP_MOVED_IN tuples are marked as
2706 * XMIN_COMMITTED, so that future tqual tests won't need to check their XVAC.
2708 * BUT NOTICE that this code fails to clear HEAP_MOVED_OFF tuples from
2709 * pages that were move source pages but not move dest pages. The bulk
2710 * of the move source pages will be physically truncated from the relation,
2711 * and the last page remaining in the rel will be fixed separately in
2712 * repair_frag(), so the only cases where a MOVED_OFF tuple won't get its
2713 * hint bits updated are tuples that were moved as part of a chain and were
2714 * on pages that were neither move destinations nor at the end of the rel.
2715 * To completely ensure that no MOVED_OFF tuples remain unmarked, we'd have
2716 * to remember and revisit those pages too.
2718 * Because of this omission, VACUUM FULL FREEZE is not a safe combination;
2719 * it's possible that the VACUUM's own XID remains exposed as something that
2720 * tqual tests would need to check.
2722 * For the non-freeze case, one wonders whether it wouldn't be better to skip
2723 * this work entirely, and let the tuple status updates happen someplace
2724 * that's not holding an exclusive lock on the relation.
2727 update_hint_bits(Relation rel, VacPageList fraged_pages, int num_fraged_pages,
2728 BlockNumber last_move_dest_block, int num_moved)
2730 TransactionId myXID = GetCurrentTransactionId();
2731 int checked_moved = 0;
2735 for (i = 0, curpage = fraged_pages->pagedesc;
2736 i < num_fraged_pages;
2737 i++, curpage++)
2741 OffsetNumber max_offset;
2745 vacuum_delay_point();
2747 if ((*curpage)->blkno > last_move_dest_block)
2748 break; /* no need to scan any further */
2749 if ((*curpage)->offsets_used == 0)
2750 continue; /* this page was never used as a move dest */
2751 buf = ReadBuffer(rel, (*curpage)->blkno);
2752 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2753 page = BufferGetPage(buf);
2754 max_offset = PageGetMaxOffsetNumber(page);
2755 for (off = FirstOffsetNumber;
2756 off <= max_offset;
2757 off = OffsetNumberNext(off))
2759 ItemId itemid = PageGetItemId(page, off);
2760 HeapTupleHeader htup;
2762 if (!ItemIdIsUsed(itemid))
2763 continue;
2764 htup = (HeapTupleHeader) PageGetItem(page, itemid);
2765 if (htup->t_infomask & HEAP_XMIN_COMMITTED)
2766 continue;
2769 * Here we may see either MOVED_OFF or MOVED_IN tuples.
2771 if (!(htup->t_infomask & HEAP_MOVED))
2772 elog(ERROR, "HEAP_MOVED_OFF/HEAP_MOVED_IN was expected");
2773 if (HeapTupleHeaderGetXvac(htup) != myXID)
2774 elog(ERROR, "invalid XVAC in tuple header");
2776 if (htup->t_infomask & HEAP_MOVED_IN)
2778 htup->t_infomask |= HEAP_XMIN_COMMITTED;
2779 htup->t_infomask &= ~HEAP_MOVED;
2780 num_tuples++;
2782 else
2783 htup->t_infomask |= HEAP_XMIN_INVALID;
2785 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
2787 Assert((*curpage)->offsets_used == num_tuples);
2788 checked_moved += num_tuples;
2790 Assert(num_moved == checked_moved);
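/*
 * Illustrative sketch (not part of the original file): the per-tuple
 * finalization performed by update_hint_bits() above, once the moving
 * transaction is known committed.  MOVED_IN tuples become ordinarily
 * visible (XMIN_COMMITTED), while MOVED_OFF tuples are marked dead
 * (XMIN_INVALID).  The helper name is hypothetical.
 */
#ifdef NOT_USED
static void
finalize_moved_tuple(HeapTupleHeader htup)
{
	if (htup->t_infomask & HEAP_MOVED_IN)
	{
		htup->t_infomask |= HEAP_XMIN_COMMITTED;
		htup->t_infomask &= ~HEAP_MOVED;
	}
	else
		htup->t_infomask |= HEAP_XMIN_INVALID;	/* it was MOVED_OFF */
}
#endif   /* NOT_USED */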
2794 * vacuum_heap() -- free dead tuples
2796 * This routine marks dead tuples as unused and truncates the relation
2797 * if there are "empty" end-blocks.
2800 vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages)
2804 BlockNumber relblocks;
2808 nblocks = vacuum_pages->num_pages;
2809 nblocks -= vacuum_pages->empty_end_pages; /* nothing to do with them */
2811 for (i = 0, vacpage = vacuum_pages->pagedesc; i < nblocks; i++, vacpage++)
2813 vacuum_delay_point();
2815 if ((*vacpage)->offsets_free > 0)
2817 buf = ReadBuffer(onerel, (*vacpage)->blkno);
2818 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2819 vacuum_page(onerel, buf, *vacpage);
2820 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
2825 /* Truncate relation if there are some empty end-pages */
2826 Assert(vacrelstats->rel_pages >= vacuum_pages->empty_end_pages);
2827 if (vacuum_pages->empty_end_pages > 0)
2829 relblocks = vacrelstats->rel_pages - vacuum_pages->empty_end_pages;
2830 ereport(elevel,
2831 (errmsg("\"%s\": truncated %u to %u pages",
2832 RelationGetRelationName(onerel),
2833 vacrelstats->rel_pages, relblocks)));
2834 RelationTruncate(onerel, relblocks);
2835 vacrelstats->rel_pages = relblocks; /* set new number of blocks */
2840 * vacuum_page() -- free dead tuples on a page
2841 * and repair its fragmentation.
2844 vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage)
2846 OffsetNumber unused[BLCKSZ / sizeof(OffsetNumber)];
2848 Page page = BufferGetPage(buffer);
2852 /* There shouldn't be any tuples moved onto the page yet! */
2853 Assert(vacpage->offsets_used == 0);
2855 START_CRIT_SECTION();
2857 for (i = 0; i < vacpage->offsets_free; i++)
2859 itemid = PageGetItemId(page, vacpage->offsets[i]);
2860 itemid->lp_flags &= ~LP_USED;
2863 uncnt = PageRepairFragmentation(page, unused);
2866 if (!onerel->rd_istemp)
2870 recptr = log_heap_clean(onerel, buffer, unused, uncnt);
2871 PageSetLSN(page, recptr);
2872 PageSetTLI(page, ThisTimeLineID);
2876 /* No XLOG record, but still need to flag that XID exists on disk */
2877 MyXactMadeTempRelUpdate = true;
2884 * scan_index() -- scan one index relation to update statistics.
2886 * We use this when we have no deletions to do.
2889 scan_index(Relation indrel, double num_tuples)
2891 IndexBulkDeleteResult *stats;
2892 IndexVacuumCleanupInfo vcinfo;
2895 vac_init_rusage(&ru0);
2898 * Even though we're not planning to delete anything, we use the
2899 * ambulkdelete call, because (a) the scan happens within the index AM
2900 * for more speed, and (b) it may want to pass private statistics to
2901 * the amvacuumcleanup call.
2903 stats = index_bulk_delete(indrel, dummy_tid_reaped, NULL);
2905 /* Do post-VACUUM cleanup, even though we deleted nothing */
2906 vcinfo.vacuum_full = true;
2907 vcinfo.message_level = elevel;
2909 stats = index_vacuum_cleanup(indrel, &vcinfo, stats);
2911 if (!stats)
2912 return;
2914 /* now update statistics in pg_class */
2915 vac_update_relstats(RelationGetRelid(indrel),
2916 stats->num_pages, stats->num_index_tuples,
2917 false);
2919 ereport(elevel,
2920 (errmsg("index \"%s\" now contains %.0f row versions in %u pages",
2921 RelationGetRelationName(indrel),
2922 stats->num_index_tuples,
2924 errdetail("%u index pages have been deleted, %u are currently reusable.\n"
2925 "%s.",
2926 stats->pages_deleted, stats->pages_free,
2927 vac_show_rusage(&ru0))));
2930 * Check for tuple count mismatch. If the index is partial, then it's
2931 * OK for it to have fewer tuples than the heap; else we have trouble.
2933 if (stats->num_index_tuples != num_tuples)
2935 if (stats->num_index_tuples > num_tuples ||
2936 !vac_is_partial_index(indrel))
2937 ereport(WARNING,
2938 (errmsg("index \"%s\" contains %.0f row versions, but table contains %.0f row versions",
2939 RelationGetRelationName(indrel),
2940 stats->num_index_tuples, num_tuples),
2941 errhint("Rebuild the index with REINDEX.")));
2948 * vacuum_index() -- vacuum one index relation.
2950 * Vpl is the VacPageList of the heap we're currently vacuuming.
2951 * It's locked. Indrel is an index relation on the vacuumed heap.
2953 * We don't bother to set locks on the index relation here, since
2954 * the parent table is exclusive-locked already.
2956 * Finally, we arrange to update the index relation's statistics in pg_class.
2960 vacuum_index(VacPageList vacpagelist, Relation indrel,
2961 double num_tuples, int keep_tuples)
2963 IndexBulkDeleteResult *stats;
2964 IndexVacuumCleanupInfo vcinfo;
2967 vac_init_rusage(&ru0);
2969 /* Do bulk deletion */
2970 stats = index_bulk_delete(indrel, tid_reaped, (void *) vacpagelist);
2972 /* Do post-VACUUM cleanup */
2973 vcinfo.vacuum_full = true;
2974 vcinfo.message_level = elevel;
2976 stats = index_vacuum_cleanup(indrel, &vcinfo, stats);
2978 if (!stats)
2979 return;
2981 /* now update statistics in pg_class */
2982 vac_update_relstats(RelationGetRelid(indrel),
2983 stats->num_pages, stats->num_index_tuples,
2984 false);
2986 ereport(elevel,
2987 (errmsg("index \"%s\" now contains %.0f row versions in %u pages",
2988 RelationGetRelationName(indrel),
2989 stats->num_index_tuples,
2991 errdetail("%.0f index row versions were removed.\n"
2992 "%u index pages have been deleted, %u are currently reusable.\n"
2993 "%s.",
2994 stats->tuples_removed,
2995 stats->pages_deleted, stats->pages_free,
2996 vac_show_rusage(&ru0))));
2999 * Check for tuple count mismatch. If the index is partial, then it's
3000 * OK for it to have fewer tuples than the heap; else we have trouble.
3002 if (stats->num_index_tuples != num_tuples + keep_tuples)
3004 if (stats->num_index_tuples > num_tuples + keep_tuples ||
3005 !vac_is_partial_index(indrel))
3006 ereport(WARNING,
3007 (errmsg("index \"%s\" contains %.0f row versions, but table contains %.0f row versions",
3008 RelationGetRelationName(indrel),
3009 stats->num_index_tuples, num_tuples + keep_tuples),
3010 errhint("Rebuild the index with REINDEX.")));
3017 * tid_reaped() -- is a particular tid reaped?
3019 * This has the right signature to be an IndexBulkDeleteCallback.
3021 * vacpagelist->VacPage_array is sorted in the right order.
3024 tid_reaped(ItemPointer itemptr, void *state)
3026 VacPageList vacpagelist = (VacPageList) state;
3027 OffsetNumber ioffno;
3028 OffsetNumber *voff;
3029 VacPage vp,
3030 *vpp;
3031 VacPageData vacpage;
3033 vacpage.blkno = ItemPointerGetBlockNumber(itemptr);
3034 ioffno = ItemPointerGetOffsetNumber(itemptr);
3036 vp = &vacpage;
3037 vpp = (VacPage *) vac_bsearch((void *) &vp,
3038 (void *) (vacpagelist->pagedesc),
3039 vacpagelist->num_pages,
3040 sizeof(VacPage),
3041 vac_cmp_blk);
3043 if (vpp == NULL)
3044 return false;
3046 /* ok - we are on a partially or fully reaped page */
3047 vp = *vpp;
3049 if (vp->offsets_free == 0)
3051 /* this is EmptyPage, so claim all tuples on it are reaped!!! */
3052 return true;
3055 voff = (OffsetNumber *) vac_bsearch((void *) &ioffno,
3056 (void *) (vp->offsets),
3058 sizeof(OffsetNumber),
3059 vac_cmp_offno);
3061 if (voff == NULL)
3062 return false;
3064 /* tid is reaped */
3065 return true;
3069 * Dummy version for scan_index.
3072 dummy_tid_reaped(ItemPointer itemptr, void *state)
3074 return false;
3078 * Update the shared Free Space Map with the info we now have about
3079 * free space in the relation, discarding any old info the map may have.
3082 vac_update_fsm(Relation onerel, VacPageList fraged_pages,
3083 BlockNumber rel_pages)
3085 int nPages = fraged_pages->num_pages;
3086 VacPage *pagedesc = fraged_pages->pagedesc;
3088 PageFreeSpaceInfo *pageSpaces;
3093 * We only report pages with free space at least equal to the average
3094 * request size --- this avoids cluttering FSM with uselessly-small
3095 * bits of space. Although FSM would discard pages with little free
3096 * space anyway, it's important to do this prefiltering because (a) it
3097 * reduces the time spent holding the FSM lock in
3098 * RecordRelationFreeSpace, and (b) FSM uses the number of pages
3099 * reported as a statistic for guiding space management. If we didn't
3100 * threshold our reports the same way vacuumlazy.c does, we'd be
3101 * skewing that statistic.
3103 threshold = GetAvgFSMRequestSize(&onerel->rd_node);
3105 pageSpaces = (PageFreeSpaceInfo *)
3106 palloc(nPages * sizeof(PageFreeSpaceInfo));
3109 for (i = 0; i < nPages; i++)
3112 * fraged_pages may contain entries for pages that we later
3113 * decided to truncate from the relation; don't enter them into
3114 * the free space map!
3116 if (pagedesc[i]->blkno >= rel_pages)
3119 if (pagedesc[i]->free >= threshold)
3121 pageSpaces[outPages].blkno = pagedesc[i]->blkno;
3122 pageSpaces[outPages].avail = pagedesc[i]->free;
3127 RecordRelationFreeSpace(&onerel->rd_node, outPages, pageSpaces);
3132 /* Copy a VacPage structure */
3134 copy_vac_page(VacPage vacpage)
3138 /* allocate a VacPageData entry */
3139 newvacpage = (VacPage) palloc(sizeof(VacPageData) +
3140 vacpage->offsets_free * sizeof(OffsetNumber));
3143 if (vacpage->offsets_free > 0)
3144 memcpy(newvacpage->offsets, vacpage->offsets,
3145 vacpage->offsets_free * sizeof(OffsetNumber));
3146 newvacpage->blkno = vacpage->blkno;
3147 newvacpage->free = vacpage->free;
3148 newvacpage->offsets_used = vacpage->offsets_used;
3149 newvacpage->offsets_free = vacpage->offsets_free;
3151 return newvacpage;
3155 * Add a VacPage pointer to a VacPageList.
3157 * As a side effect of the way that scan_heap works,
3158 * higher pages come after lower pages in the array
3159 * (and highest tid on a page is last).
3162 vpage_insert(VacPageList vacpagelist, VacPage vpnew)
3164 #define PG_NPAGEDESC 1024
3166 /* allocate a VacPage entry if needed */
3167 if (vacpagelist->num_pages == 0)
3169 vacpagelist->pagedesc = (VacPage *) palloc(PG_NPAGEDESC * sizeof(VacPage));
3170 vacpagelist->num_allocated_pages = PG_NPAGEDESC;
3172 else if (vacpagelist->num_pages >= vacpagelist->num_allocated_pages)
3174 vacpagelist->num_allocated_pages *= 2;
3175 vacpagelist->pagedesc = (VacPage *) repalloc(vacpagelist->pagedesc, vacpagelist->num_allocated_pages * sizeof(VacPage));
3177 vacpagelist->pagedesc[vacpagelist->num_pages] = vpnew;
3178 (vacpagelist->num_pages)++;
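/*
 * Illustrative sketch (not part of the original file): building a
 * VacPageList.  The list starts zeroed, the first insertion allocates
 * PG_NPAGEDESC slots, and later insertions double the array, so N
 * insertions cost O(N) amortized.  The function name is hypothetical.
 */
#ifdef NOT_USED
static void
vacpagelist_example(void)
{
	VacPageListData pages;
	VacPageData onepage;

	MemSet(&pages, 0, sizeof(pages));	/* num_pages == 0 */

	onepage.blkno = 42;
	onepage.free = 0;
	onepage.offsets_used = 0;
	onepage.offsets_free = 0;
	vpage_insert(&pages, copy_vac_page(&onepage));
}
#endif   /* NOT_USED */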
3182 * vac_bsearch: just like standard C library routine bsearch(),
3183 * except that we first test to see whether the target key is outside
3184 * the range of the table entries. This case is handled relatively slowly
3185 * by the normal binary search algorithm (ie, no faster than any other key)
3186 * but it occurs often enough in VACUUM to be worth optimizing.
3189 vac_bsearch(const void *key, const void *base,
3190 size_t nelem, size_t size,
3191 int (*compar) (const void *, const void *))
3196 if (nelem == 0)
3197 return NULL;
3198 res = compar(key, base);
3199 if (res < 0)
3200 return NULL;
3201 if (res == 0)
3202 return (void *) base;
3203 if (nelem > 1)
3205 last = (const void *) ((const char *) base + (nelem - 1) * size);
3206 res = compar(key, last);
3207 if (res > 0)
3208 return NULL;
3209 if (res == 0)
3210 return (void *) last;
3212 if (nelem <= 2)
3213 return NULL; /* already checked 'em all */
3214 return bsearch(key, base, nelem, size, compar);
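/*
 * Illustrative sketch (not part of the original file): how tid_reaped()
 * above employs vac_bsearch.  pagedesc is sorted by block number, so the
 * array of VacPage pointers is searched with vac_cmp_blk, passing a
 * pointer to a key VacPage.  The helper name is hypothetical.
 */
#ifdef NOT_USED
static VacPage
find_vac_page(VacPageList vacpagelist, BlockNumber blkno)
{
	VacPageData keypage;
	VacPage		key = &keypage;
	VacPage    *vpp;

	keypage.blkno = blkno;
	vpp = (VacPage *) vac_bsearch((void *) &key,
								  (void *) (vacpagelist->pagedesc),
								  vacpagelist->num_pages,
								  sizeof(VacPage),
								  vac_cmp_blk);
	return (vpp != NULL) ? *vpp : NULL;
}
#endif   /* NOT_USED */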
3218 * Comparator routines for use with qsort() and bsearch().
3221 vac_cmp_blk(const void *left, const void *right)
3226 lblk = (*((VacPage *) left))->blkno;
3227 rblk = (*((VacPage *) right))->blkno;
3229 if (lblk < rblk)
3230 return -1;
3231 if (lblk == rblk)
3232 return 0;
3233 return 1;
3237 vac_cmp_offno(const void *left, const void *right)
3239 if (*(OffsetNumber *) left < *(OffsetNumber *) right)
3240 return -1;
3241 if (*(OffsetNumber *) left == *(OffsetNumber *) right)
3242 return 0;
3243 return 1;
3247 vac_cmp_vtlinks(const void *left, const void *right)
3249 if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi <
3250 ((VTupleLink) right)->new_tid.ip_blkid.bi_hi)
3251 return -1;
3252 if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi >
3253 ((VTupleLink) right)->new_tid.ip_blkid.bi_hi)
3254 return 1;
3255 /* bi_hi-es are equal */
3256 if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo <
3257 ((VTupleLink) right)->new_tid.ip_blkid.bi_lo)
3258 return -1;
3259 if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo >
3260 ((VTupleLink) right)->new_tid.ip_blkid.bi_lo)
3261 return 1;
3262 /* bi_lo-es are equal */
3263 if (((VTupleLink) left)->new_tid.ip_posid <
3264 ((VTupleLink) right)->new_tid.ip_posid)
3265 return -1;
3266 if (((VTupleLink) left)->new_tid.ip_posid >
3267 ((VTupleLink) right)->new_tid.ip_posid)
3268 return 1;
3270 return 0;
3274 * Open all the indexes of the given relation, obtaining the specified kind
3275 * of lock on each. Return an array of Relation pointers for the indexes
3276 * into *Irel, and the number of indexes into *nindexes.
3279 vac_open_indexes(Relation relation, LOCKMODE lockmode,
3280 int *nindexes, Relation **Irel)
3283 ListCell *indexoidscan;
3286 indexoidlist = RelationGetIndexList(relation);
3288 *nindexes = list_length(indexoidlist);
3290 if (*nindexes > 0)
3291 *Irel = (Relation *) palloc(*nindexes * sizeof(Relation));
3292 else
3293 *Irel = NULL;
3296 foreach(indexoidscan, indexoidlist)
3298 Oid indexoid = lfirst_oid(indexoidscan);
3301 ind = index_open(indexoid);
3302 (*Irel)[i++] = ind;
3303 LockRelation(ind, lockmode);
3306 list_free(indexoidlist);
3310 * Release the resources acquired by vac_open_indexes. Optionally release
3311 * the locks (say NoLock to keep 'em).
3314 vac_close_indexes(int nindexes, Relation *Irel, LOCKMODE lockmode)
3319 while (nindexes--)
3321 Relation ind = Irel[nindexes];
3323 if (lockmode != NoLock)
3324 UnlockRelation(ind, lockmode);
3325 index_close(ind);
3327 pfree(Irel);
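/*
 * Illustrative sketch (not part of the original file): the usual pairing
 * of vac_open_indexes/vac_close_indexes around per-index work, as the
 * full-VACUUM path does with vacuum_index() or scan_index().  The
 * function name is hypothetical.
 */
#ifdef NOT_USED
static void
visit_all_indexes(Relation onerel)
{
	Relation   *Irel;
	int			nindexes;
	int			i;

	vac_open_indexes(onerel, AccessExclusiveLock, &nindexes, &Irel);
	for (i = 0; i < nindexes; i++)
	{
		/* per-index processing would go here */
	}
	/* NoLock: hold the index locks until transaction end */
	vac_close_indexes(nindexes, Irel, NoLock);
}
#endif   /* NOT_USED */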
3332 * Is an index partial (ie, could it contain fewer tuples than the heap?)
3335 vac_is_partial_index(Relation indrel)
3338 * If the index's AM doesn't support nulls, it's partial for our purposes.
3341 if (!indrel->rd_am->amindexnulls)
3342 return true;
3344 /* Otherwise, look to see if there's a partial-index predicate */
3345 if (!heap_attisnull(indrel->rd_indextuple, Anum_pg_index_indpred))
3346 return true;
3348 return false;
3353 enough_space(VacPage vacpage, Size len)
3355 len = MAXALIGN(len);
3357 if (len > vacpage->free)
3358 return false;
3360 /* if there are free itemid(s) and len <= free_space... */
3361 if (vacpage->offsets_used < vacpage->offsets_free)
3362 return true;
3364 /* noff_used >= noff_free and so we'll have to allocate a new itemid */
3365 if (len + sizeof(ItemIdData) <= vacpage->free)
3366 return true;
3368 return false;
3373 * Initialize usage snapshot.
3376 vac_init_rusage(VacRUsage *ru0)
3380 getrusage(RUSAGE_SELF, &ru0->ru);
3381 gettimeofday(&ru0->tv, &tz);
3385 * Compute elapsed time since ru0 usage snapshot, and format into
3386 * a displayable string. Result is in a static string, which is
3387 * tacky, but no one ever claimed that the Postgres backend is reentrant.
3391 vac_show_rusage(VacRUsage *ru0)
3393 static char result[100];
3394 VacRUsage ru1;
3396 vac_init_rusage(&ru1);
3398 if (ru1.tv.tv_usec < ru0->tv.tv_usec)
3400 ru1.tv.tv_sec--;
3401 ru1.tv.tv_usec += 1000000;
3403 if (ru1.ru.ru_stime.tv_usec < ru0->ru.ru_stime.tv_usec)
3405 ru1.ru.ru_stime.tv_sec--;
3406 ru1.ru.ru_stime.tv_usec += 1000000;
3408 if (ru1.ru.ru_utime.tv_usec < ru0->ru.ru_utime.tv_usec)
3410 ru1.ru.ru_utime.tv_sec--;
3411 ru1.ru.ru_utime.tv_usec += 1000000;
3414 snprintf(result, sizeof(result),
3415 "CPU %d.%02ds/%d.%02du sec elapsed %d.%02d sec.",
3416 (int) (ru1.ru.ru_stime.tv_sec - ru0->ru.ru_stime.tv_sec),
3417 (int) (ru1.ru.ru_stime.tv_usec - ru0->ru.ru_stime.tv_usec) / 10000,
3418 (int) (ru1.ru.ru_utime.tv_sec - ru0->ru.ru_utime.tv_sec),
3419 (int) (ru1.ru.ru_utime.tv_usec - ru0->ru.ru_utime.tv_usec) / 10000,
3420 (int) (ru1.tv.tv_sec - ru0->tv.tv_sec),
3421 (int) (ru1.tv.tv_usec - ru0->tv.tv_usec) / 10000);
3423 return result;
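/*
 * Illustrative sketch (not part of the original file): the manual
 * "borrow" used above when subtracting struct timevals.  If the later
 * microseconds field is smaller, borrow one second (1000000 usec) so
 * both fields of the difference come out nonnegative.  The helper name
 * is hypothetical.
 */
#ifdef NOT_USED
static void
timeval_borrow(struct timeval *later, const struct timeval *earlier)
{
	if (later->tv_usec < earlier->tv_usec)
	{
		later->tv_sec--;
		later->tv_usec += 1000000;
	}
}
#endif   /* NOT_USED */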
3427 * vacuum_delay_point --- check for interrupts and cost-based delay.
3429 * This should be called in each major loop of VACUUM processing,
3430 * typically once per page processed.
3433 vacuum_delay_point(void)
3435 /* Always check for interrupts */
3436 CHECK_FOR_INTERRUPTS();
3438 /* Nap if appropriate */
3439 if (VacuumCostActive && !InterruptPending &&
3440 VacuumCostBalance >= VacuumCostLimit)
3444 msec = VacuumCostDelay * VacuumCostBalance / VacuumCostLimit;
3445 if (msec > VacuumCostDelay * 4)
3446 msec = VacuumCostDelay * 4;
3448 pg_usleep(msec * 1000L);
3450 VacuumCostBalance = 0;
3452 /* Might have gotten an interrupt while sleeping */
3453 CHECK_FOR_INTERRUPTS();
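/*
 * Illustrative worked example (not part of the original file): with, say,
 * vacuum_cost_limit = 200 and vacuum_cost_delay = 10 (msec), an
 * accumulated balance of 300 yields msec = 10 * 300 / 200 = 15; the sleep
 * is clamped to 4 * vacuum_cost_delay = 40 msec no matter how large the
 * balance has grown.  The numbers are only an example.
 */
#ifdef NOT_USED
static int
example_vacuum_delay_msec(void)
{
	int			cost_limit = 200;	/* e.g. VacuumCostLimit */
	int			cost_delay = 10;	/* e.g. VacuumCostDelay, in msec */
	int			balance = 300;		/* e.g. VacuumCostBalance */
	int			msec = cost_delay * balance / cost_limit;	/* 15 */

	if (msec > cost_delay * 4)
		msec = cost_delay * 4;
	return msec;
}
#endif   /* NOT_USED */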