/*-------------------------------------------------------------------------
 * The postgres vacuum cleaner.
 *
 * This file includes the "full" version of VACUUM, as well as control code
 * used by all three of full VACUUM, lazy VACUUM, and ANALYZE.  See
 * vacuumlazy.c and analyze.c for the rest of the code for the latter two.
 *
 * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.233 2002/08/06 02:36:34 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "access/clog.h"
#include "access/genam.h"
#include "access/heapam.h"
#include "access/xlog.h"
#include "catalog/catalog.h"
#include "catalog/catname.h"
#include "catalog/namespace.h"
#include "catalog/pg_database.h"
#include "catalog/pg_index.h"
#include "commands/vacuum.h"
#include "executor/executor.h"
#include "miscadmin.h"
#include "storage/freespace.h"
#include "storage/sinval.h"
#include "storage/smgr.h"
#include "tcop/pquery.h"
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/relcache.h"
#include "utils/syscache.h"
typedef struct VacPageData
{
    BlockNumber blkno;          /* BlockNumber of this Page */
    Size        free;           /* FreeSpace on this Page */
    uint16      offsets_used;   /* Number of OffNums used by vacuum */
    uint16      offsets_free;   /* Number of OffNums free or to be freed */
    OffsetNumber offsets[1];    /* Array of free OffNums */
} VacPageData;

typedef VacPageData *VacPage;

typedef struct VacPageListData
{
    BlockNumber empty_end_pages;        /* Number of "empty" end-pages */
    int         num_pages;              /* Number of pages in pagedesc */
    int         num_allocated_pages;    /* Number of allocated pages in
                                         * pagedesc */
    VacPage    *pagedesc;               /* Descriptions of pages */
} VacPageListData;

typedef VacPageListData *VacPageList;
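/*
 * Note: scan_heap() builds two of these page lists.  vacuum_pages collects
 * pages that have dead tuples or free line pointers and so need compaction
 * and index cleanup; fraged_pages collects pages with enough free space to
 * be re-used as move targets by repair_frag().  A page can appear on both
 * lists, in which case both entries point at the same VacPage copy.
 */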
typedef struct VTupleLinkData
{
    ItemPointerData new_tid;
    ItemPointerData this_tid;
} VTupleLinkData;

typedef VTupleLinkData *VTupleLink;
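/*
 * A VTupleLink records one recently-dead member of an update chain:
 * this_tid is the tuple itself, new_tid is the newer row version that its
 * t_ctid points at.  scan_heap() gathers these and sorts them by new_tid
 * (vac_cmp_vtlinks) so that repair_frag() can binary-search with
 * vac_bsearch for the parent of a given tuple when moving whole chains.
 */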
typedef struct VTupleMoveData
{
    ItemPointerData tid;        /* tuple ID */
    VacPage     vacpage;        /* where to move */
    bool        cleanVpd;       /* clean vacpage before using */
} VTupleMoveData;

typedef VTupleMoveData *VTupleMove;

typedef struct VRelStats
{
    BlockNumber rel_pages;
    double      rel_tuples;
    Size        min_tlen;
    Size        max_tlen;
    bool        hasindex;
    int         num_vtlinks;
    VTupleLink  vtlinks;
} VRelStats;
static MemoryContext vac_context = NULL;

static int  elevel = -1;

static TransactionId OldestXmin;
static TransactionId FreezeLimit;

static TransactionId initialOldestXmin;
static TransactionId initialFreezeLimit;
/* non-export function prototypes */
static List *getrels(const RangeVar *vacrel, const char *stmttype);
static void vac_update_dbstats(Oid dbid,
                   TransactionId vacuumXID,
                   TransactionId frozenXID);
static void vac_truncate_clog(TransactionId vacuumXID,
                  TransactionId frozenXID);
static void vacuum_rel(Oid relid, VacuumStmt *vacstmt, char expected_relkind);
static void full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt);
static void scan_heap(VRelStats *vacrelstats, Relation onerel,
          VacPageList vacuum_pages, VacPageList fraged_pages);
static void repair_frag(VRelStats *vacrelstats, Relation onerel,
            VacPageList vacuum_pages, VacPageList fraged_pages,
            int nindexes, Relation *Irel);
static void vacuum_heap(VRelStats *vacrelstats, Relation onerel,
            VacPageList vacpagelist);
static void vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage);
static void vacuum_index(VacPageList vacpagelist, Relation indrel,
             double num_tuples, int keep_tuples);
static void scan_index(Relation indrel, double num_tuples);
static bool tid_reaped(ItemPointer itemptr, void *state);
static bool dummy_tid_reaped(ItemPointer itemptr, void *state);
static void vac_update_fsm(Relation onerel, VacPageList fraged_pages,
               BlockNumber rel_pages);
static VacPage copy_vac_page(VacPage vacpage);
static void vpage_insert(VacPageList vacpagelist, VacPage vpnew);
static void *vac_bsearch(const void *key, const void *base,
             size_t nelem, size_t size,
             int (*compar) (const void *, const void *));
static int  vac_cmp_blk(const void *left, const void *right);
static int  vac_cmp_offno(const void *left, const void *right);
static int  vac_cmp_vtlinks(const void *left, const void *right);
static bool enough_space(VacPage vacpage, Size len);
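/*
 * Rough call graph, for orientation: vacuum() builds the target list with
 * getrels() and calls vacuum_rel() once per relation (and once more for a
 * relation's TOAST table).  For VACUUM FULL, vacuum_rel() calls
 * full_vacuum_rel(), which runs scan_heap() to build the vacuum_pages and
 * fraged_pages lists, vacuums or scans the indexes, and then runs either
 * repair_frag() (shrink by moving tuples) or vacuum_heap() (compact in
 * place).
 */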
/****************************************************************************
 *                                                                          *
 *          Code common to all flavors of VACUUM and ANALYZE                *
 *                                                                          *
 ****************************************************************************/
/*
 * Primary entry point for VACUUM and ANALYZE commands.
 */
void
vacuum(VacuumStmt *vacstmt)
{
    const char *stmttype = vacstmt->vacuum ? "VACUUM" : "ANALYZE";
    MemoryContext anl_context = NULL;
    List       *vrl,
               *cur;
    if (vacstmt->verbose)
        elevel = INFO;
    else
        elevel = DEBUG1;
    /*
     * We cannot run VACUUM inside a user transaction block; if we were
     * inside a transaction, then our commit- and
     * start-transaction-command calls would not have the intended effect!
     * Furthermore, the forced commit that occurs before truncating the
     * relation's file would have the effect of committing the rest of the
     * user's transaction too, which would certainly not be the desired
     * behavior.
     */
    if (vacstmt->vacuum && IsTransactionBlock())
        elog(ERROR, "%s cannot run inside a BEGIN/END block", stmttype);

    /* Running VACUUM from a function would free the function context */
    if (vacstmt->vacuum && !MemoryContextContains(QueryContext, vacstmt))
        elog(ERROR, "%s cannot be executed from a function", stmttype);

    /*
     * Send info about dead objects to the statistics collector
     */
    pgstat_vacuum_tabstat();

    /*
     * Create special memory context for cross-transaction storage.
     *
     * Since it is a child of QueryContext, it will go away eventually even
     * if we suffer an error; there's no need for special abort cleanup
     * logic.
     */
    vac_context = AllocSetContextCreate(QueryContext,
                                        "Vacuum",
                                        ALLOCSET_DEFAULT_MINSIZE,
                                        ALLOCSET_DEFAULT_INITSIZE,
                                        ALLOCSET_DEFAULT_MAXSIZE);

    /*
     * If we are running only ANALYZE, we don't need per-table transactions,
     * but we still need a memory context with table lifetime.
     */
    if (vacstmt->analyze && !vacstmt->vacuum)
        anl_context = AllocSetContextCreate(QueryContext,
                                            "Analyze",
                                            ALLOCSET_DEFAULT_MINSIZE,
                                            ALLOCSET_DEFAULT_INITSIZE,
                                            ALLOCSET_DEFAULT_MAXSIZE);

    /* Build list of relations to process (note this lives in vac_context) */
    vrl = getrels(vacstmt->relation, stmttype);
    /*
     * Formerly, there was code here to prevent more than one VACUUM from
     * executing concurrently in the same database.  However, there's no
     * good reason to prevent that, and manually removing lockfiles after
     * a vacuum crash was a pain for dbadmins.  So, forget about lockfiles,
     * and just rely on the locks we grab on each target table
     * to ensure that there aren't two VACUUMs running on the same table
     * at the same time.
     */

    /*
     * The strangeness with committing and starting transactions here is due
     * to wanting to run each table's VACUUM as a separate transaction, so
     * that we don't hold locks unnecessarily long.  Also, if we are doing
     * VACUUM ANALYZE, the ANALYZE part runs as a separate transaction from
     * the VACUUM to further reduce locking.
     *
     * vacuum_rel expects to be entered with no transaction active; it will
     * start and commit its own transaction.  But we are called by an SQL
     * command, and so we are executing inside a transaction already.  We
     * commit the transaction started in PostgresMain() here, and start
     * another one before exiting to match the commit waiting for us back in
     * PostgresMain().
     *
     * In the case of an ANALYZE statement (no vacuum, just analyze) it's
     * okay to run the whole thing in the outer transaction, and so we skip
     * transaction start/stop operations.
     */
    if (vacstmt->vacuum)
    {
        if (vacstmt->relation == NULL)
        {
            /*
             * It's a database-wide VACUUM.
             *
             * Compute the initially applicable OldestXmin and FreezeLimit
             * XIDs, so that we can record these values at the end of the
             * VACUUM.  Note that individual tables may well be processed
             * with newer values, but we can guarantee that no (non-shared)
             * relations are processed with older ones.
             *
             * It is okay to record non-shared values in pg_database, even
             * though we may vacuum shared relations with older cutoffs,
             * because only the minimum of the values present in pg_database
             * matters.  We can be sure that shared relations have at some
             * time been vacuumed with cutoffs no worse than the global
             * minimum; for, if there is a backend in some other DB with
             * xmin = OLDXMIN that's determining the cutoff with which we
             * vacuum shared relations, it is not possible for that database
             * to have a cutoff newer than OLDXMIN recorded in pg_database.
             */
            vacuum_set_xid_limits(vacstmt, false,
                                  &initialOldestXmin, &initialFreezeLimit);
        }

        /* matches the StartTransaction in PostgresMain() */
        CommitTransactionCommand();
    }

    /*
     * Loop to process each selected relation.
     */
    foreach(cur, vrl)
    {
        Oid         relid = (Oid) lfirsti(cur);

        if (vacstmt->vacuum)
            vacuum_rel(relid, vacstmt, RELKIND_RELATION);
        if (vacstmt->analyze)
        {
            MemoryContext old_context = NULL;

            /*
             * If we vacuumed, use new transaction for analyze.  Otherwise,
             * we can use the outer transaction, but we still need to call
             * analyze_rel in a memory context that will be cleaned up on
             * return (else we leak memory while processing multiple tables).
             */
            if (vacstmt->vacuum)
                StartTransactionCommand();
            else
                old_context = MemoryContextSwitchTo(anl_context);

            analyze_rel(relid, vacstmt);

            if (vacstmt->vacuum)
                CommitTransactionCommand();
            else
            {
                MemoryContextSwitchTo(old_context);
                MemoryContextResetAndDeleteChildren(anl_context);
            }
        }
    }
    /*
     * Finish up processing.
     */
    if (vacstmt->vacuum)
    {
        /* here, we are not in a transaction */

        /* matches the CommitTransaction in PostgresMain() */
        StartTransactionCommand();

        /*
         * If we did a database-wide VACUUM, update the database's
         * pg_database row with info about the transaction IDs used, and
         * try to truncate pg_clog.
         */
        if (vacstmt->relation == NULL)
        {
            vac_update_dbstats(MyDatabaseId,
                               initialOldestXmin, initialFreezeLimit);
            vac_truncate_clog(initialOldestXmin, initialFreezeLimit);
        }
    }

    /*
     * Clean up working storage --- note we must do this after
     * StartTransactionCommand, else we might be trying to delete the
     * active context!
     */
    MemoryContextDelete(vac_context);
    vac_context = NULL;

    if (anl_context)
        MemoryContextDelete(anl_context);
}
/*
 * Build a list of Oids for each relation to be processed
 *
 * The list is built in vac_context so that it will survive across our
 * per-relation transactions.
 */
static List *
getrels(const RangeVar *vacrel, const char *stmttype)
{
    List       *vrl = NIL;
    MemoryContext oldcontext;

    if (vacrel)
    {
        /* Process specific relation */
        Oid         relid;

        relid = RangeVarGetRelid(vacrel, false);

        /* Make a relation list entry for this guy */
        oldcontext = MemoryContextSwitchTo(vac_context);
        vrl = lappendi(vrl, relid);
        MemoryContextSwitchTo(oldcontext);
    }
    else
    {
        /* Process all plain relations listed in pg_class */
        Relation    pgclass;
        HeapScanDesc scan;
        HeapTuple   tuple;
        ScanKeyData key;

        ScanKeyEntryInitialize(&key, 0x0,
                               Anum_pg_class_relkind,
                               F_CHAREQ,
                               CharGetDatum(RELKIND_RELATION));

        pgclass = heap_openr(RelationRelationName, AccessShareLock);

        scan = heap_beginscan(pgclass, SnapshotNow, 1, &key);

        while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
        {
            /* Make a relation list entry for this guy */
            oldcontext = MemoryContextSwitchTo(vac_context);
            AssertTupleDescHasOid(pgclass->rd_att);
            vrl = lappendi(vrl, HeapTupleGetOid(tuple));
            MemoryContextSwitchTo(oldcontext);
        }

        heap_endscan(scan);
        heap_close(pgclass, AccessShareLock);
    }

    return vrl;
}
/*
 * vacuum_set_xid_limits() -- compute oldest-Xmin and freeze cutoff points
 */
void
vacuum_set_xid_limits(VacuumStmt *vacstmt, bool sharedRel,
                      TransactionId *oldestXmin,
                      TransactionId *freezeLimit)
{
    TransactionId limit;

    *oldestXmin = GetOldestXmin(sharedRel);

    Assert(TransactionIdIsNormal(*oldestXmin));

    if (vacstmt->freeze)
    {
        /* FREEZE option: use oldest Xmin as freeze cutoff too */
        limit = *oldestXmin;
    }
    else
    {
        /*
         * Normal case: freeze cutoff is well in the past, to wit, about
         * halfway to the wrap horizon
         */
        limit = GetCurrentTransactionId() - (MaxTransactionId >> 2);
    }

    /*
     * Be careful not to generate a "permanent" XID
     */
    if (!TransactionIdIsNormal(limit))
        limit = FirstNormalTransactionId;

    /*
     * Ensure sane relationship of limits
     */
    if (TransactionIdFollows(limit, *oldestXmin))
    {
        elog(WARNING, "oldest Xmin is far in the past --- close open transactions soon to avoid wraparound problems");
        limit = *oldestXmin;
    }

    *freezeLimit = limit;
}
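/*
 * A rough feel for the normal-case cutoff above, assuming 32-bit XIDs:
 * MaxTransactionId >> 2 is about one billion, so the freeze limit lands
 * roughly one billion transactions in the past --- about halfway to the
 * two-billion-transaction wraparound horizon.
 */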
/*
 * vac_update_relstats() -- update statistics for one relation
 *
 * Update the whole-relation statistics that are kept in its pg_class
 * row.  There are additional stats that will be updated if we are
 * doing ANALYZE, but we always update these stats.  This routine works
 * for both index and heap relation entries in pg_class.
 *
 * We violate no-overwrite semantics here by storing new values for the
 * statistics columns directly into the pg_class tuple that's already on
 * the page.  The reason for this is that if we updated these tuples in
 * the usual way, vacuuming pg_class itself wouldn't work very well ---
 * by the time we got done with a vacuum cycle, most of the tuples in
 * pg_class would've been obsoleted.  Of course, this only works for
 * fixed-size never-null columns, but these are.
 *
 * This routine is shared by full VACUUM, lazy VACUUM, and stand-alone
 * ANALYZE.
 */
void
vac_update_relstats(Oid relid, BlockNumber num_pages, double num_tuples,
                    bool hasindex)
{
    Relation    rd;
    HeapTupleData rtup;
    HeapTuple   ctup;
    Form_pg_class pgcform;
    Buffer      buffer;

    /*
     * update number of tuples and number of pages in pg_class
     */
    rd = heap_openr(RelationRelationName, RowExclusiveLock);

    ctup = SearchSysCache(RELOID,
                          ObjectIdGetDatum(relid),
                          0, 0, 0);
    if (!HeapTupleIsValid(ctup))
        elog(ERROR, "pg_class entry for relid %u vanished during vacuuming",
             relid);

    /* get the buffer cache tuple */
    rtup.t_self = ctup->t_self;
    ReleaseSysCache(ctup);
    if (!heap_fetch(rd, SnapshotNow, &rtup, &buffer, false, NULL))
        elog(ERROR, "pg_class entry for relid %u vanished during vacuuming",
             relid);

    /* overwrite the existing statistics in the tuple */
    pgcform = (Form_pg_class) GETSTRUCT(&rtup);
    pgcform->relpages = (int32) num_pages;
    pgcform->reltuples = num_tuples;
    pgcform->relhasindex = hasindex;

    /*
     * If we have discovered that there are no indexes, then there's no
     * primary key either.  This could be done more thoroughly...
     */
    if (!hasindex)
        pgcform->relhaspkey = false;

    /*
     * Invalidate the tuple in the catcaches; this also arranges to flush
     * the relation's relcache entry.  (If we fail to commit for some reason,
     * no flush will occur, but no great harm is done since there are no
     * noncritical state updates here.)
     */
    CacheInvalidateHeapTuple(rd, &rtup);

    /* Write the buffer */
    WriteBuffer(buffer);

    heap_close(rd, RowExclusiveLock);
}
/*
 * vac_update_dbstats() -- update statistics for one database
 *
 * Update the whole-database statistics that are kept in its pg_database
 * row.
 *
 * We violate no-overwrite semantics here by storing new values for the
 * statistics columns directly into the tuple that's already on the page.
 * As with vac_update_relstats, this avoids leaving dead tuples behind
 * after a VACUUM, which is good since GetRawDatabaseInfo
 * can get confused by finding dead tuples in pg_database.
 *
 * This routine is shared by full and lazy VACUUM.  Note that it is only
 * applied after a database-wide VACUUM operation.
 */
static void
vac_update_dbstats(Oid dbid,
                   TransactionId vacuumXID,
                   TransactionId frozenXID)
{
    Relation    relation;
    ScanKeyData entry[1];
    HeapScanDesc scan;
    HeapTuple   tuple;
    Form_pg_database dbform;

    relation = heap_openr(DatabaseRelationName, RowExclusiveLock);

    /* Must use a heap scan, since there's no syscache for pg_database */
    ScanKeyEntryInitialize(&entry[0], 0x0,
                           ObjectIdAttributeNumber, F_OIDEQ,
                           ObjectIdGetDatum(dbid));

    scan = heap_beginscan(relation, SnapshotNow, 1, entry);

    tuple = heap_getnext(scan, ForwardScanDirection);

    if (!HeapTupleIsValid(tuple))
        elog(ERROR, "database %u does not exist", dbid);

    dbform = (Form_pg_database) GETSTRUCT(tuple);

    /* overwrite the existing statistics in the tuple */
    dbform->datvacuumxid = vacuumXID;
    dbform->datfrozenxid = frozenXID;

    /* invalidate the tuple in the cache and write the buffer */
    CacheInvalidateHeapTuple(relation, tuple);
    WriteNoReleaseBuffer(scan->rs_cbuf);

    heap_endscan(scan);

    heap_close(relation, RowExclusiveLock);
}
/*
 * vac_truncate_clog() -- attempt to truncate the commit log
 *
 * Scan pg_database to determine the system-wide oldest datvacuumxid,
 * and use it to truncate the transaction commit log (pg_clog).
 * Also generate a warning if the system-wide oldest datfrozenxid
 * seems to be in danger of wrapping around.
 *
 * The passed XIDs are simply the ones I just wrote into my pg_database
 * entry.  They're used to initialize the "min" calculations.
 *
 * This routine is shared by full and lazy VACUUM.  Note that it is only
 * applied after a database-wide VACUUM operation.
 */
static void
vac_truncate_clog(TransactionId vacuumXID, TransactionId frozenXID)
{
    TransactionId myXID;
    Relation    relation;
    HeapScanDesc scan;
    HeapTuple   tuple;
    int32       age;
    bool        vacuumAlreadyWrapped = false;
    bool        frozenAlreadyWrapped = false;

    myXID = GetCurrentTransactionId();

    relation = heap_openr(DatabaseRelationName, AccessShareLock);

    scan = heap_beginscan(relation, SnapshotNow, 0, NULL);

    while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
    {
        Form_pg_database dbform = (Form_pg_database) GETSTRUCT(tuple);

        /* Ignore non-connectable databases (eg, template0) */
        /* It's assumed that these have been frozen correctly */
        if (!dbform->datallowconn)
            continue;

        if (TransactionIdIsNormal(dbform->datvacuumxid))
        {
            if (TransactionIdPrecedes(myXID, dbform->datvacuumxid))
                vacuumAlreadyWrapped = true;
            else if (TransactionIdPrecedes(dbform->datvacuumxid, vacuumXID))
                vacuumXID = dbform->datvacuumxid;
        }
        if (TransactionIdIsNormal(dbform->datfrozenxid))
        {
            if (TransactionIdPrecedes(myXID, dbform->datfrozenxid))
                frozenAlreadyWrapped = true;
            else if (TransactionIdPrecedes(dbform->datfrozenxid, frozenXID))
                frozenXID = dbform->datfrozenxid;
        }
    }

    heap_endscan(scan);

    heap_close(relation, AccessShareLock);

    /*
     * Do not truncate CLOG if we seem to have suffered wraparound already;
     * the computed minimum XID might be bogus.
     */
    if (vacuumAlreadyWrapped)
    {
        elog(WARNING, "Some databases have not been vacuumed in over 2 billion transactions."
             "\n\tYou may have already suffered transaction-wraparound data loss.");
        return;
    }

    /* Truncate CLOG to the oldest vacuumxid */
    TruncateCLOG(vacuumXID);

    /* Give warning about impending wraparound problems */
    if (frozenAlreadyWrapped)
    {
        elog(WARNING, "Some databases have not been vacuumed in over 1 billion transactions."
             "\n\tBetter vacuum them soon, or you may have a wraparound failure.");
    }
    else
    {
        age = (int32) (myXID - frozenXID);
        if (age > (int32) ((MaxTransactionId >> 3) * 3))
            elog(WARNING, "Some databases have not been vacuumed in %d transactions."
                 "\n\tBetter vacuum them within %d transactions,"
                 "\n\tor you may have a wraparound failure.",
                 age, (int32) (MaxTransactionId >> 1) - age);
    }
}
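/*
 * Rough numbers for the age check above, again assuming 32-bit XIDs:
 * (MaxTransactionId >> 3) * 3 is about 1.6 billion, so the warning fires
 * once the oldest datfrozenxid is roughly 1.6 billion transactions old,
 * and (MaxTransactionId >> 1) - age is the remaining headroom before the
 * two-billion-transaction wrap horizon.
 */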
/****************************************************************************
 *                                                                          *
 *                  Code common to both flavors of VACUUM                   *
 *                                                                          *
 ****************************************************************************/
/*
 * vacuum_rel() -- vacuum one heap relation
 *
 * Doing one heap at a time incurs extra overhead, since we need to
 * check that the heap exists again just before we vacuum it.  The
 * reason that we do this is so that vacuuming can be spread across
 * many small transactions.  Otherwise, two-phase locking would require
 * us to lock the entire database during one pass of the vacuum cleaner.
 *
 * At entry and exit, we are not inside a transaction.
 */
static void
vacuum_rel(Oid relid, VacuumStmt *vacstmt, char expected_relkind)
{
    LOCKMODE    lmode;
    Relation    onerel;
    LockRelId   onerelid;
    Oid         toast_relid;

    /* Begin a transaction for vacuuming this relation */
    StartTransactionCommand();

    /*
     * Check for user-requested abort.  Note we want this to be inside a
     * transaction, so xact.c doesn't issue useless WARNING.
     */
    CHECK_FOR_INTERRUPTS();

    /*
     * Race condition -- if the pg_class tuple has gone away since the
     * last time we saw it, we don't need to vacuum it.
     */
    if (!SearchSysCacheExists(RELOID,
                              ObjectIdGetDatum(relid),
                              0, 0, 0))
    {
        CommitTransactionCommand();
        return;
    }

    /*
     * Determine the type of lock we want --- hard exclusive lock for a
     * FULL vacuum, but just ShareUpdateExclusiveLock for concurrent
     * vacuum.  Either way, we can be sure that no other backend is
     * vacuuming the same table.
     */
    lmode = vacstmt->full ? AccessExclusiveLock : ShareUpdateExclusiveLock;

    /*
     * Open the class, get an appropriate lock on it, and check
     * permissions.
     *
     * We allow the user to vacuum a table if he is superuser, the table
     * owner, or the database owner (but in the latter case, only if it's
     * not a shared relation).  pg_class_ownercheck includes the superuser
     * case.
     *
     * Note we choose to treat permissions failure as a WARNING and keep
     * trying to vacuum the rest of the DB --- is this appropriate?
     */
    onerel = relation_open(relid, lmode);

    if (!(pg_class_ownercheck(RelationGetRelid(onerel), GetUserId()) ||
          (is_dbadmin(MyDatabaseId) && !onerel->rd_rel->relisshared)))
    {
        elog(WARNING, "Skipping \"%s\" --- only table or database owner can VACUUM it",
             RelationGetRelationName(onerel));
        relation_close(onerel, lmode);
        CommitTransactionCommand();
        return;
    }

    /*
     * Check that it's a plain table; we used to do this in getrels() but
     * seems safer to check after we've locked the relation.
     */
    if (onerel->rd_rel->relkind != expected_relkind)
    {
        elog(WARNING, "Skipping \"%s\" --- can not process indexes, views or special system tables",
             RelationGetRelationName(onerel));
        relation_close(onerel, lmode);
        CommitTransactionCommand();
        return;
    }

    /*
     * Get a session-level lock too.  This will protect our access to the
     * relation across multiple transactions, so that we can vacuum the
     * relation's TOAST table (if any) secure in the knowledge that no one
     * is deleting the parent relation.
     *
     * NOTE: this cannot block, even if someone else is waiting for access,
     * because the lock manager knows that both lock requests are from the
     * same backend.
     */
    onerelid = onerel->rd_lockInfo.lockRelId;
    LockRelationForSession(&onerelid, lmode);

    /*
     * Remember the relation's TOAST relation for later vacuuming.
     */
    toast_relid = onerel->rd_rel->reltoastrelid;

    /*
     * Do the actual work --- either FULL or "lazy" vacuum
     */
    if (vacstmt->full)
        full_vacuum_rel(onerel, vacstmt);
    else
        lazy_vacuum_rel(onerel, vacstmt);

    /* all done with this class, but hold lock until commit */
    relation_close(onerel, NoLock);

    /*
     * Complete the transaction and free all temporary memory used.
     */
    CommitTransactionCommand();

    /*
     * If the relation has a secondary toast rel, vacuum that too while we
     * still hold the session lock on the master table.  Note however that
     * "analyze" will not get done on the toast table.  This is good,
     * because the toaster always uses hardcoded index access and
     * statistics are totally unimportant for toast relations.
     */
    if (toast_relid != InvalidOid)
        vacuum_rel(toast_relid, vacstmt, RELKIND_TOASTVALUE);

    /*
     * Now release the session-level lock on the master table.
     */
    UnlockRelationForSession(&onerelid, lmode);
}
/****************************************************************************
 *                                                                          *
 *                       Code for VACUUM FULL (only)                        *
 *                                                                          *
 ****************************************************************************/
/*
 * full_vacuum_rel() -- perform FULL VACUUM for one heap relation
 *
 * This routine vacuums a single heap, cleans out its indexes, and
 * updates its num_pages and num_tuples statistics.
 *
 * At entry, we have already established a transaction and opened
 * and locked the relation.
 */
static void
full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
{
    VacPageListData vacuum_pages;       /* List of pages to vacuum and/or
                                         * clean indexes */
    VacPageListData fraged_pages;       /* List of pages with space enough
                                         * for re-using */
    Relation   *Irel;
    int         nindexes,
                i;
    VRelStats  *vacrelstats;
    bool        reindex = false;

    if (IsIgnoringSystemIndexes() &&
        IsSystemRelation(onerel))
        reindex = true;

    vacuum_set_xid_limits(vacstmt, onerel->rd_rel->relisshared,
                          &OldestXmin, &FreezeLimit);

    /*
     * Set up statistics-gathering machinery.
     */
    vacrelstats = (VRelStats *) palloc(sizeof(VRelStats));
    vacrelstats->rel_pages = 0;
    vacrelstats->rel_tuples = 0;
    vacrelstats->hasindex = false;

    vacuum_pages.num_pages = fraged_pages.num_pages = 0;
    scan_heap(vacrelstats, onerel, &vacuum_pages, &fraged_pages);

    /* Now open all indexes of the relation */
    vac_open_indexes(onerel, &nindexes, &Irel);
    if (!Irel)
        reindex = false;
    else if (!RelationGetForm(onerel)->relhasindex)
        reindex = true;
    if (nindexes > 0)
        vacrelstats->hasindex = true;

#ifdef NOT_USED

    /*
     * reindex in VACUUM is dangerous under WAL. ifdef out until it
     * becomes safe.
     */
    if (reindex)
    {
        vac_close_indexes(nindexes, Irel);
        Irel = (Relation *) NULL;
        activate_indexes_of_a_table(RelationGetRelid(onerel), false);
    }
#endif   /* NOT_USED */

    /* Clean/scan index relation(s) */
    if (Irel != (Relation *) NULL)
    {
        if (vacuum_pages.num_pages > 0)
        {
            for (i = 0; i < nindexes; i++)
                vacuum_index(&vacuum_pages, Irel[i],
                             vacrelstats->rel_tuples, 0);
        }
        else
        {
            /* just scan indexes to update statistics */
            for (i = 0; i < nindexes; i++)
                scan_index(Irel[i], vacrelstats->rel_tuples);
        }
    }

    if (fraged_pages.num_pages > 0)
    {
        /* Try to shrink heap */
        repair_frag(vacrelstats, onerel, &vacuum_pages, &fraged_pages,
                    nindexes, Irel);
        vac_close_indexes(nindexes, Irel);
    }
    else
    {
        vac_close_indexes(nindexes, Irel);
        if (vacuum_pages.num_pages > 0)
        {
            /* Clean pages from vacuum_pages list */
            vacuum_heap(vacrelstats, onerel, &vacuum_pages);
        }
        else
        {
            /*
             * Flush dirty pages out to disk.  We must do this even if we
             * didn't do anything else, because we want to ensure that all
             * tuples have correct on-row commit status on disk (see
             * bufmgr.c's comments for FlushRelationBuffers()).
             */
            i = FlushRelationBuffers(onerel, vacrelstats->rel_pages);
            if (i < 0)
                elog(ERROR, "VACUUM (full_vacuum_rel): FlushRelationBuffers returned %d",
                     i);
        }
    }

#ifdef NOT_USED
    if (reindex)
        activate_indexes_of_a_table(RelationGetRelid(onerel), true);
#endif   /* NOT_USED */

    /* update shared free space map with final free space info */
    vac_update_fsm(onerel, &fraged_pages, vacrelstats->rel_pages);

    /* update statistics in pg_class */
    vac_update_relstats(RelationGetRelid(onerel), vacrelstats->rel_pages,
                        vacrelstats->rel_tuples, vacrelstats->hasindex);
}
/*
 * scan_heap() -- scan an open heap relation
 *
 * This routine sets commit status bits, constructs vacuum_pages (list
 * of pages we need to compact free space on and/or clean indexes of
 * deleted tuples), constructs fraged_pages (list of pages with free
 * space that tuples could be moved into), and calculates statistics
 * on the number of live tuples in the heap.
 */
static void
scan_heap(VRelStats *vacrelstats, Relation onerel,
          VacPageList vacuum_pages, VacPageList fraged_pages)
{
    BlockNumber nblocks,
                blkno;
    char       *relname;
    VacPage     vacpage,
                vacpagecopy;
    BlockNumber empty_pages,
                new_pages,
                changed_pages,
                empty_end_pages;
    double      num_tuples,
                tups_vacuumed,
                nkeep,
                nunused;
    double      free_size,
                usable_free_size;
    Size        min_tlen = MaxTupleSize;
    Size        max_tlen = 0;
    int         i;
    bool        do_shrinking = true;
    VTupleLink  vtlinks = (VTupleLink) palloc(100 * sizeof(VTupleLinkData));
    int         num_vtlinks = 0;
    int         free_vtlinks = 100;
    VacRUsage   ru0;

    vac_init_rusage(&ru0);

    relname = RelationGetRelationName(onerel);
    elog(elevel, "--Relation %s.%s--",
         get_namespace_name(RelationGetNamespace(onerel)),
         relname);

    empty_pages = new_pages = changed_pages = empty_end_pages = 0;
    num_tuples = tups_vacuumed = nkeep = nunused = 0;
    free_size = 0;

    nblocks = RelationGetNumberOfBlocks(onerel);

    /*
     * We initially create each VacPage item in a maximal-sized workspace,
     * then copy the workspace into a just-large-enough copy.
     */
    vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber));
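    /*
     * For a rough sense of scale (assuming the default 8K BLCKSZ):
     * MaxOffsetNumber is around 2048 and OffsetNumbers are 2 bytes, so
     * this workspace is roughly 4KB plus the VacPageData header.
     */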
    for (blkno = 0; blkno < nblocks; blkno++)
    {
        Page        page,
                    tempPage = NULL;
        Buffer      buf;
        OffsetNumber offnum,
                    maxoff;
        bool        pgchanged,
                    do_reap,
                    do_frag,
                    notup;

        CHECK_FOR_INTERRUPTS();

        buf = ReadBuffer(onerel, blkno);
        page = BufferGetPage(buf);

        vacpage->blkno = blkno;
        vacpage->offsets_used = 0;
        vacpage->offsets_free = 0;

        if (PageIsNew(page))
        {
            elog(WARNING, "Rel %s: Uninitialized page %u - fixing",
                 relname, blkno);
            PageInit(page, BufferGetPageSize(buf), 0);
            vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
            free_size += (vacpage->free - sizeof(ItemIdData));
            new_pages++;
            empty_end_pages++;
            vacpagecopy = copy_vac_page(vacpage);
            vpage_insert(vacuum_pages, vacpagecopy);
            vpage_insert(fraged_pages, vacpagecopy);
            WriteBuffer(buf);
            continue;
        }

        if (PageIsEmpty(page))
        {
            vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
            free_size += (vacpage->free - sizeof(ItemIdData));
            empty_pages++;
            empty_end_pages++;
            vacpagecopy = copy_vac_page(vacpage);
            vpage_insert(vacuum_pages, vacpagecopy);
            vpage_insert(fraged_pages, vacpagecopy);
            ReleaseBuffer(buf);
            continue;
        }

        pgchanged = false;
        notup = true;
        maxoff = PageGetMaxOffsetNumber(page);
        for (offnum = FirstOffsetNumber;
             offnum <= maxoff;
             offnum = OffsetNumberNext(offnum))
        {
            ItemId      itemid;
            HeapTupleData tuple;
            uint16      sv_infomask;
            bool        tupgone = false;

            itemid = PageGetItemId(page, offnum);

            /*
             * Collect un-used items too - it's possible to have indexes
             * pointing here after crash.
             */
            if (!ItemIdIsUsed(itemid))
            {
                vacpage->offsets[vacpage->offsets_free++] = offnum;
                nunused += 1;
                continue;
            }

            tuple.t_datamcxt = NULL;
            tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
            tuple.t_len = ItemIdGetLength(itemid);
            ItemPointerSet(&(tuple.t_self), blkno, offnum);

            sv_infomask = tuple.t_data->t_infomask;

            switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin))
            {
                case HEAPTUPLE_DEAD:
                    tupgone = true;     /* we can delete the tuple */
                    break;
                case HEAPTUPLE_LIVE:

                    /*
                     * Tuple is good.  Consider whether to replace its
                     * xmin value with FrozenTransactionId.
                     */
                    if (TransactionIdIsNormal(HeapTupleHeaderGetXmin(tuple.t_data)) &&
                        TransactionIdPrecedes(HeapTupleHeaderGetXmin(tuple.t_data),
                                              FreezeLimit))
                    {
                        HeapTupleHeaderSetXmin(tuple.t_data, FrozenTransactionId);
                        /* infomask should be okay already */
                        Assert(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED);
                        pgchanged = true;
                    }
                    break;
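                    /*
                     * Note: freezing replaces a sufficiently old, committed
                     * xmin with FrozenTransactionId, which is treated as
                     * "known committed" forever, so the tuple stays visible
                     * even after the XID space wraps around.
                     */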
                case HEAPTUPLE_RECENTLY_DEAD:

                    /*
                     * If tuple is recently deleted then we must not
                     * remove it from relation.
                     */
                    nkeep += 1;

                    /*
                     * If we are doing shrinking and this tuple is an
                     * updated one, then remember it to construct
                     * updated-tuple dependencies.
                     */
                    if (do_shrinking &&
                        !(ItemPointerEquals(&(tuple.t_self),
                                            &(tuple.t_data->t_ctid))))
                    {
                        if (free_vtlinks == 0)
                        {
                            free_vtlinks = 1000;
                            vtlinks = (VTupleLink) repalloc(vtlinks,
                                                            (free_vtlinks + num_vtlinks) *
                                                            sizeof(VTupleLinkData));
                        }
                        vtlinks[num_vtlinks].new_tid = tuple.t_data->t_ctid;
                        vtlinks[num_vtlinks].this_tid = tuple.t_self;
                        num_vtlinks++;
                        free_vtlinks--;
                    }
                    break;
                case HEAPTUPLE_INSERT_IN_PROGRESS:

                    /*
                     * This should not happen, since we hold exclusive
                     * lock on the relation; shouldn't we raise an error?
                     */
                    elog(WARNING, "Rel %s: TID %u/%u: InsertTransactionInProgress %u - can't shrink relation",
                         relname, blkno, offnum, HeapTupleHeaderGetXmin(tuple.t_data));
                    do_shrinking = false;
                    break;
                case HEAPTUPLE_DELETE_IN_PROGRESS:

                    /*
                     * This should not happen, since we hold exclusive
                     * lock on the relation; shouldn't we raise an error?
                     */
                    elog(WARNING, "Rel %s: TID %u/%u: DeleteTransactionInProgress %u - can't shrink relation",
                         relname, blkno, offnum, HeapTupleHeaderGetXmax(tuple.t_data));
                    do_shrinking = false;
                    break;
                default:
                    elog(ERROR, "Unexpected HeapTupleSatisfiesVacuum result");
                    break;
            }

            /* check for hint-bit update by HeapTupleSatisfiesVacuum */
            if (sv_infomask != tuple.t_data->t_infomask)
                pgchanged = true;

            if (onerel->rd_rel->relhasoids &&
                !OidIsValid(HeapTupleGetOid(&tuple)))
                elog(WARNING, "Rel %s: TID %u/%u: OID IS INVALID. TUPGONE %d.",
                     relname, blkno, offnum, (int) tupgone);

            if (tupgone)
            {
                ItemId      lpp;

                /*
                 * Here we are building a temporary copy of the page with
                 * dead tuples removed.  Below we will apply
                 * PageRepairFragmentation to the copy, so that we can
                 * determine how much space will be available after
                 * removal of dead tuples.  But note we are NOT changing
                 * the real page yet...
                 */
                if (tempPage == (Page) NULL)
                {
                    Size        pageSize;

                    pageSize = PageGetPageSize(page);
                    tempPage = (Page) palloc(pageSize);
                    memcpy(tempPage, page, pageSize);
                }

                /* mark it unused on the temp page */
                lpp = PageGetItemId(tempPage, offnum);
                lpp->lp_flags &= ~LP_USED;

                vacpage->offsets[vacpage->offsets_free++] = offnum;
                tups_vacuumed += 1;
            }
            else
            {
                num_tuples += 1;
                notup = false;
                if (tuple.t_len < min_tlen)
                    min_tlen = tuple.t_len;
                if (tuple.t_len > max_tlen)
                    max_tlen = tuple.t_len;
            }
        }                       /* scan along page */

        if (tempPage != (Page) NULL)
        {
            /* Some tuples are removable; figure free space after removal */
            PageRepairFragmentation(tempPage, NULL);
            vacpage->free = ((PageHeader) tempPage)->pd_upper - ((PageHeader) tempPage)->pd_lower;
            pfree(tempPage);
            do_reap = true;
        }
        else
        {
            /* Just use current available space */
            vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
            /* Need to reap the page if it has ~LP_USED line pointers */
            do_reap = (vacpage->offsets_free > 0);
        }

        free_size += vacpage->free;

        /*
         * Add the page to fraged_pages if it has a useful amount of free
         * space.  "Useful" means enough for a minimal-sized tuple.  But we
         * don't know that accurately near the start of the relation, so
         * add pages unconditionally if they have >= BLCKSZ/10 free space.
         */
        do_frag = (vacpage->free >= min_tlen || vacpage->free >= BLCKSZ / 10);

        if (do_reap || do_frag)
        {
            vacpagecopy = copy_vac_page(vacpage);
            if (do_reap)
                vpage_insert(vacuum_pages, vacpagecopy);
            if (do_frag)
                vpage_insert(fraged_pages, vacpagecopy);
        }
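        /*
         * For example, with the default 8K BLCKSZ the unconditional
         * threshold above is about 819 bytes of free space.
         */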
        if (notup)
            empty_end_pages++;
        else
            empty_end_pages = 0;

        if (pgchanged)
        {
            WriteBuffer(buf);
            changed_pages++;
        }
        else
            ReleaseBuffer(buf);
    }                           /* scan along relation */

    pfree(vacpage);

    /* save stats in the rel list for use later */
    vacrelstats->rel_tuples = num_tuples;
    vacrelstats->rel_pages = nblocks;
    if (num_tuples == 0)
        min_tlen = max_tlen = 0;
    vacrelstats->min_tlen = min_tlen;
    vacrelstats->max_tlen = max_tlen;

    vacuum_pages->empty_end_pages = empty_end_pages;
    fraged_pages->empty_end_pages = empty_end_pages;

    /*
     * Clear the fraged_pages list if we found we couldn't shrink.  Else,
     * remove any "empty" end-pages from the list, and compute usable free
     * space = free space in remaining pages.
     */
    if (do_shrinking)
    {
        Assert((BlockNumber) fraged_pages->num_pages >= empty_end_pages);
        fraged_pages->num_pages -= empty_end_pages;
        usable_free_size = 0;
        for (i = 0; i < fraged_pages->num_pages; i++)
            usable_free_size += fraged_pages->pagedesc[i]->free;
    }
    else
    {
        fraged_pages->num_pages = 0;
        usable_free_size = 0;
    }

    if (usable_free_size > 0 && num_vtlinks > 0)
    {
        qsort((char *) vtlinks, num_vtlinks, sizeof(VTupleLinkData),
              vac_cmp_vtlinks);
        vacrelstats->vtlinks = vtlinks;
        vacrelstats->num_vtlinks = num_vtlinks;
    }
    else
    {
        vacrelstats->vtlinks = NULL;
        vacrelstats->num_vtlinks = 0;
        pfree(vtlinks);
    }
    elog(elevel, "Pages %u: Changed %u, reaped %u, Empty %u, New %u; \
Tup %.0f: Vac %.0f, Keep/VTL %.0f/%u, UnUsed %.0f, MinLen %lu, MaxLen %lu; \
Re-using: Free/Avail. Space %.0f/%.0f; EndEmpty/Avail. Pages %u/%u.\n\t%s",
         nblocks, changed_pages, vacuum_pages->num_pages, empty_pages,
         new_pages, num_tuples, tups_vacuumed,
         nkeep, vacrelstats->num_vtlinks,
         nunused, (unsigned long) min_tlen, (unsigned long) max_tlen,
         free_size, usable_free_size,
         empty_end_pages, fraged_pages->num_pages,
         vac_show_rusage(&ru0));
}
/*
 * repair_frag() -- try to repair relation's fragmentation
 *
 * This routine marks dead tuples as unused and tries to re-use dead space
 * by moving tuples (and inserting indexes if needed).  It constructs
 * Nvacpagelist, a list of freed pages (from moved tuples), and cleans
 * their indexes after committing the current transaction (in hack-manner
 * - without losing locks and freeing memory!).  It truncates the relation
 * if some end-blocks have gone away.
 */
static void
repair_frag(VRelStats *vacrelstats, Relation onerel,
            VacPageList vacuum_pages, VacPageList fraged_pages,
            int nindexes, Relation *Irel)
{
    TransactionId myXID;
    CommandId   myCID;
    Buffer      buf,
                cur_buffer;
    BlockNumber nblocks,
                blkno;
    BlockNumber last_move_dest_block = 0,
                last_vacuum_block;
    Page        page,
                ToPage = NULL;
    OffsetNumber offnum,
                maxoff,
                newoff,
                max_offset;
    ItemId      itemid,
                newitemid;
    HeapTupleData tuple,
                newtup;
    TupleDesc   tupdesc;
    ResultRelInfo *resultRelInfo;
    EState     *estate;
    TupleTable  tupleTable;
    TupleTableSlot *slot;
    VacPageListData Nvacpagelist;
    VacPage     cur_page = NULL,
                last_vacuum_page,
                vacpage,
               *curpage;
    int         cur_item = 0;
    int         i;
    Size        tuple_len;
    int         num_moved,
                num_fraged_pages,
                vacuumed_pages;
    int         checked_moved,
                num_tuples,
                keep_tuples = 0;
    bool        isempty,
                dowrite,
                chain_tuple_moved;
    VacRUsage   ru0;

    vac_init_rusage(&ru0);

    myXID = GetCurrentTransactionId();
    myCID = GetCurrentCommandId();

    tupdesc = RelationGetDescr(onerel);

    /*
     * We need a ResultRelInfo and an EState so we can use the regular
     * executor's index-entry-making machinery.
     */
    resultRelInfo = makeNode(ResultRelInfo);
    resultRelInfo->ri_RangeTableIndex = 1;      /* dummy */
    resultRelInfo->ri_RelationDesc = onerel;
    resultRelInfo->ri_TrigDesc = NULL;  /* we don't fire triggers */

    ExecOpenIndices(resultRelInfo);

    estate = CreateExecutorState();
    estate->es_result_relations = resultRelInfo;
    estate->es_num_result_relations = 1;
    estate->es_result_relation_info = resultRelInfo;

    /* Set up a dummy tuple table too */
    tupleTable = ExecCreateTupleTable(1);
    slot = ExecAllocTableSlot(tupleTable);
    ExecSetSlotDescriptor(slot, tupdesc, false);

    Nvacpagelist.num_pages = 0;
    num_fraged_pages = fraged_pages->num_pages;
    Assert((BlockNumber) vacuum_pages->num_pages >= vacuum_pages->empty_end_pages);
    vacuumed_pages = vacuum_pages->num_pages - vacuum_pages->empty_end_pages;
    if (vacuumed_pages > 0)
    {
        /* get last reaped page from vacuum_pages */
        last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1];
        last_vacuum_block = last_vacuum_page->blkno;
    }
    else
    {
        last_vacuum_page = NULL;
        last_vacuum_block = InvalidBlockNumber;
    }
    cur_buffer = InvalidBuffer;
    num_moved = 0;

    vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber));
    vacpage->offsets_used = vacpage->offsets_free = 0;
    /*
     * Scan pages backwards from the last nonempty page, trying to move
     * tuples down to lower pages.  Quit when we reach a page that we have
     * moved any tuples onto, or the first page if we haven't moved
     * anything, or when we find a page we cannot completely empty (this
     * last condition is handled by "break" statements within the loop).
     *
     * NB: this code depends on the vacuum_pages and fraged_pages lists being
     * in order by blkno.
     */
    nblocks = vacrelstats->rel_pages;
    for (blkno = nblocks - vacuum_pages->empty_end_pages - 1;
         blkno > last_move_dest_block;
         blkno--)
    {
        CHECK_FOR_INTERRUPTS();

        /*
         * Forget fraged_pages pages at or after this one; they're no
         * longer useful as move targets, since we only want to move down.
         * Note that since we stop the outer loop at last_move_dest_block,
         * pages removed here cannot have had anything moved onto them
         * already.
         *
         * Also note that we don't change the stored fraged_pages list, only
         * our local variable num_fraged_pages; so the forgotten pages are
         * still available to be loaded into the free space map later.
         */
        while (num_fraged_pages > 0 &&
               fraged_pages->pagedesc[num_fraged_pages - 1]->blkno >= blkno)
        {
            Assert(fraged_pages->pagedesc[num_fraged_pages - 1]->offsets_used == 0);
            --num_fraged_pages;
        }

        /*
         * Process this page of relation.
         */
        buf = ReadBuffer(onerel, blkno);
        page = BufferGetPage(buf);

        vacpage->offsets_free = 0;

        isempty = PageIsEmpty(page);

        dowrite = false;

        /* Is the page in the vacuum_pages list? */
        if (blkno == last_vacuum_block)
        {
            if (last_vacuum_page->offsets_free > 0)
            {
                /* there are dead tuples on this page - clean them */
                Assert(!isempty);
                LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
                vacuum_page(onerel, buf, last_vacuum_page);
                LockBuffer(buf, BUFFER_LOCK_UNLOCK);
                dowrite = true;
            }
            else
                Assert(isempty);
            --vacuumed_pages;
            if (vacuumed_pages > 0)
            {
                /* get prev reaped page from vacuum_pages */
                last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1];
                last_vacuum_block = last_vacuum_page->blkno;
            }
            else
            {
                last_vacuum_page = NULL;
                last_vacuum_block = InvalidBlockNumber;
            }
        }

        chain_tuple_moved = false;      /* no chain tuples have been moved
                                         * off this page yet */
        vacpage->blkno = blkno;
        maxoff = PageGetMaxOffsetNumber(page);
        for (offnum = FirstOffsetNumber;
             offnum <= maxoff;
             offnum = OffsetNumberNext(offnum))
        {
            itemid = PageGetItemId(page, offnum);

            if (!ItemIdIsUsed(itemid))
                continue;

            tuple.t_datamcxt = NULL;
            tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
            tuple_len = tuple.t_len = ItemIdGetLength(itemid);
            ItemPointerSet(&(tuple.t_self), blkno, offnum);
            if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
            {
                if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
                    elog(ERROR, "HEAP_MOVED_IN was not expected");

                /*
                 * If this (chain) tuple has already been moved by me, then
                 * I have to check whether it is in vacpage or not - i.e.
                 * whether it was moved while cleaning this page or some
                 * previous one.
                 */
                if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
                {
                    if (HeapTupleHeaderGetXvac(tuple.t_data) != myXID)
                        elog(ERROR, "Invalid XVAC in tuple header");
                    if (keep_tuples == 0)
                        continue;
                    if (chain_tuple_moved)      /* some chains were moved
                                                 * while cleaning this page */
                    {
                        Assert(vacpage->offsets_free > 0);
                        for (i = 0; i < vacpage->offsets_free; i++)
                        {
                            if (vacpage->offsets[i] == offnum)
                                break;
                        }
                        if (i >= vacpage->offsets_free)     /* not found */
                        {
                            vacpage->offsets[vacpage->offsets_free++] = offnum;
                            keep_tuples--;
                        }
                    }
                    else
                    {
                        vacpage->offsets[vacpage->offsets_free++] = offnum;
                        keep_tuples--;
                    }
                    continue;
                }
                else
                    elog(ERROR, "HEAP_MOVED_OFF was expected");
            }
            /*
             * If this tuple is in a chain of tuples created in updates
             * by "recent" transactions, then we have to move the whole
             * chain of tuples to other places.
             */
            if ((tuple.t_data->t_infomask & HEAP_UPDATED &&
                 !TransactionIdPrecedes(HeapTupleHeaderGetXmin(tuple.t_data),
                                        OldestXmin)) ||
                (!(tuple.t_data->t_infomask & HEAP_XMAX_INVALID) &&
                 !(ItemPointerEquals(&(tuple.t_self),
                                     &(tuple.t_data->t_ctid)))))
            {
                Buffer      Cbuf = buf;
                Page        Cpage;
                ItemId      Citemid;
                ItemPointerData Ctid;
                HeapTupleData tp = tuple;
                Size        tlen = tuple_len;
                VTupleMove  vtmove = (VTupleMove)
                palloc(100 * sizeof(VTupleMoveData));
                int         num_vtmove = 0;
                int         free_vtmove = 100;
                VacPage     to_vacpage = NULL;
                int         to_item = 0;
                int         ti;
                bool        freeCbuf = false;

                if (vacrelstats->vtlinks == NULL)
                    elog(ERROR, "No one parent tuple was found");
                if (cur_buffer != InvalidBuffer)
                {
                    WriteBuffer(cur_buffer);
                    cur_buffer = InvalidBuffer;
                }

                /*
                 * If this tuple is at the beginning or in the middle of
                 * the chain, then we have to move to the end of the chain.
                 */
                while (!(tp.t_data->t_infomask & HEAP_XMAX_INVALID) &&
                       !(ItemPointerEquals(&(tp.t_self),
                                           &(tp.t_data->t_ctid))))
                {
                    Ctid = tp.t_data->t_ctid;
                    if (freeCbuf)
                        ReleaseBuffer(Cbuf);
                    freeCbuf = true;
                    Cbuf = ReadBuffer(onerel,
                                      ItemPointerGetBlockNumber(&Ctid));
                    Cpage = BufferGetPage(Cbuf);
                    Citemid = PageGetItemId(Cpage,
                                            ItemPointerGetOffsetNumber(&Ctid));
                    if (!ItemIdIsUsed(Citemid))
                    {
                        /*
                         * This means that in the middle of the chain there
                         * was a tuple updated by an older (than OldestXmin)
                         * xaction, and this tuple has already been deleted
                         * by me.  Actually, the upper part of the chain
                         * should be removed, and it seems that this should
                         * be handled in scan_heap(), but it's not
                         * implemented at the moment, so we just stop
                         * shrinking here.
                         */
                        ReleaseBuffer(Cbuf);
                        freeCbuf = false;
                        elog(WARNING, "Child itemid in update-chain marked as unused - can't continue repair_frag");
                        break;
                    }
                    tp.t_datamcxt = NULL;
                    tp.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
                    tp.t_self = Ctid;
                    tlen = tp.t_len = ItemIdGetLength(Citemid);
                }
                /* first, can the chain be moved? */
                for (;;)
                {
                    if (to_vacpage == NULL ||
                        !enough_space(to_vacpage, tlen))
                    {
                        for (i = 0; i < num_fraged_pages; i++)
                        {
                            if (enough_space(fraged_pages->pagedesc[i], tlen))
                                break;
                        }

                        if (i == num_fraged_pages)
                        {
                            /* can't move item anywhere */
                            for (i = 0; i < num_vtmove; i++)
                            {
                                Assert(vtmove[i].vacpage->offsets_used > 0);
                                (vtmove[i].vacpage->offsets_used)--;
                            }
                            num_vtmove = 0;
                            break;
                        }
                        to_item = i;
                        to_vacpage = fraged_pages->pagedesc[to_item];
                    }
                    to_vacpage->free -= MAXALIGN(tlen);
                    if (to_vacpage->offsets_used >= to_vacpage->offsets_free)
                        to_vacpage->free -= MAXALIGN(sizeof(ItemIdData));
                    (to_vacpage->offsets_used)++;
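                    /*
                     * Roughly: we tentatively charge to_vacpage for the
                     * moved tuple (MAXALIGN(tlen)) plus, when no freed line
                     * pointer is available for re-use, a new ItemIdData
                     * slot --- mirroring the space accounting that
                     * enough_space() performs.
                     */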
                    if (free_vtmove == 0)
                    {
                        free_vtmove = 1000;
                        vtmove = (VTupleMove) repalloc(vtmove,
                                                       (free_vtmove + num_vtmove) *
                                                       sizeof(VTupleMoveData));
                    }
                    vtmove[num_vtmove].tid = tp.t_self;
                    vtmove[num_vtmove].vacpage = to_vacpage;
                    if (to_vacpage->offsets_used == 1)
                        vtmove[num_vtmove].cleanVpd = true;
                    else
                        vtmove[num_vtmove].cleanVpd = false;
                    free_vtmove--;
                    num_vtmove++;

                    /* at the beginning of the chain? */
                    if (!(tp.t_data->t_infomask & HEAP_UPDATED) ||
                        TransactionIdPrecedes(HeapTupleHeaderGetXmin(tp.t_data),
                                              OldestXmin))
                        break;
                    /* Well, try to find the tuple with the old row version */
                    {
                        Buffer      Pbuf;
                        Page        Ppage;
                        ItemId      Pitemid;
                        HeapTupleData Ptp;
                        VTupleLinkData vtld,
                                   *vtlp;

                        vtld.new_tid = tp.t_self;
                        vtlp = (VTupleLink)
                            vac_bsearch((void *) &vtld,
                                        (void *) (vacrelstats->vtlinks),
                                        vacrelstats->num_vtlinks,
                                        sizeof(VTupleLinkData),
                                        vac_cmp_vtlinks);
                        if (vtlp == NULL)
                            elog(ERROR, "Parent tuple was not found");
                        tp.t_self = vtlp->this_tid;
                        Pbuf = ReadBuffer(onerel,
                                          ItemPointerGetBlockNumber(&(tp.t_self)));
                        Ppage = BufferGetPage(Pbuf);
                        Pitemid = PageGetItemId(Ppage,
                                                ItemPointerGetOffsetNumber(&(tp.t_self)));
                        if (!ItemIdIsUsed(Pitemid))
                            elog(ERROR, "Parent itemid marked as unused");
                        Ptp.t_datamcxt = NULL;
                        Ptp.t_data = (HeapTupleHeader) PageGetItem(Ppage, Pitemid);
                        Assert(ItemPointerEquals(&(vtld.new_tid),
                                                 &(Ptp.t_data->t_ctid)));
                        /*
                         * Read above about cases when !ItemIdIsUsed(Citemid)
                         * (child item is removed)... Because at the moment
                         * we don't remove the useless part of an
                         * update-chain, it's possible to get a too-old
                         * parent row here.  As in the case which caused
                         * this problem, we stop shrinking here.  I could
                         * try to find the real parent row, but I don't want
                         * to, because the real solution will be implemented
                         * anyway, later, and we are too close to the 6.5
                         * release. -
                         */
                        if (!(TransactionIdEquals(HeapTupleHeaderGetXmax(Ptp.t_data),
                                                  HeapTupleHeaderGetXmin(tp.t_data))))
                        {
                            if (freeCbuf)
                                ReleaseBuffer(Cbuf);
                            freeCbuf = false;
                            ReleaseBuffer(Pbuf);
                            for (i = 0; i < num_vtmove; i++)
                            {
                                Assert(vtmove[i].vacpage->offsets_used > 0);
                                (vtmove[i].vacpage->offsets_used)--;
                            }
                            num_vtmove = 0;
                            elog(WARNING, "Too old parent tuple found - can't continue repair_frag");
                            break;
                        }
#ifdef NOT_USED                 /* I'm not sure that this will work
                                 * properly... */

                        /*
                         * If this tuple is an updated version of a row and
                         * it was created by the same transaction, then no
                         * one is interested in this tuple - mark it as
                         * removed.
                         */
                        if (Ptp.t_data->t_infomask & HEAP_UPDATED &&
                            TransactionIdEquals(HeapTupleHeaderGetXmin(Ptp.t_data),
                                                HeapTupleHeaderGetXmax(Ptp.t_data)))
                        {
                            Ptp.t_data->t_infomask &=
                                ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_IN);
                            Ptp.t_data->t_infomask |= HEAP_MOVED_OFF;
                            HeapTupleHeaderSetXvac(Ptp.t_data, myXID);
                        }
#endif   /* NOT_USED */
                        tp.t_datamcxt = Ptp.t_datamcxt;
                        tp.t_data = Ptp.t_data;
                        tlen = tp.t_len = ItemIdGetLength(Pitemid);
                        if (freeCbuf)
                            ReleaseBuffer(Cbuf);
                        Cbuf = Pbuf;
                        freeCbuf = true;
                    }           /* end of parent-lookup block */
                }               /* end of chain-collection loop */

                if (freeCbuf)
                    ReleaseBuffer(Cbuf);
                freeCbuf = false;

                if (num_vtmove == 0)    /* chain can't be moved */
                {
                    pfree(vtmove);
                    break;      /* out of walk-along-page loop */
                }
                ItemPointerSetInvalid(&Ctid);
                for (ti = 0; ti < num_vtmove; ti++)
                {
                    VacPage     destvacpage = vtmove[ti].vacpage;

                    /* Get page to move from */
                    tuple.t_self = vtmove[ti].tid;
                    Cbuf = ReadBuffer(onerel,
                                      ItemPointerGetBlockNumber(&(tuple.t_self)));

                    /* Get page to move to */
                    cur_buffer = ReadBuffer(onerel, destvacpage->blkno);

                    LockBuffer(cur_buffer, BUFFER_LOCK_EXCLUSIVE);
                    if (cur_buffer != Cbuf)
                        LockBuffer(Cbuf, BUFFER_LOCK_EXCLUSIVE);

                    ToPage = BufferGetPage(cur_buffer);
                    Cpage = BufferGetPage(Cbuf);

                    Citemid = PageGetItemId(Cpage,
                                            ItemPointerGetOffsetNumber(&(tuple.t_self)));
                    tuple.t_datamcxt = NULL;
                    tuple.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
                    tuple_len = tuple.t_len = ItemIdGetLength(Citemid);

                    /*
                     * make a copy of the source tuple, and then mark the
                     * source tuple MOVED_OFF.
                     */
                    heap_copytuple_with_tuple(&tuple, &newtup);

                    /*
                     * register invalidation of source tuple in catcaches.
                     */
                    CacheInvalidateHeapTuple(onerel, &tuple);

                    /* NO ELOG(ERROR) TILL CHANGES ARE LOGGED */
                    START_CRIT_SECTION();

                    tuple.t_data->t_infomask &=
                        ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_IN);
                    tuple.t_data->t_infomask |= HEAP_MOVED_OFF;
                    HeapTupleHeaderSetXvac(tuple.t_data, myXID);
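                    /*
                     * Note: if we crash before committing, the tuple is
                     * still marked MOVED_OFF with our never-committed XID
                     * in xvac, so later visibility checks treat the move as
                     * void and the original tuple remains valid; the
                     * MOVED_IN copy is likewise ignored.
                     */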
                    /*
                     * If this page was not used before - clean it.
                     *
                     * NOTE: a nasty bug used to lurk here.  It is possible
                     * for the source and destination pages to be the same
                     * (since this tuple-chain member can be on a page
                     * lower than the one we're currently processing in
                     * the outer loop).  If that's true, then after
                     * vacuum_page() the source tuple will have been
                     * moved, and tuple.t_data will be pointing at
                     * garbage.  Therefore we must do everything that uses
                     * tuple.t_data BEFORE this step!!
                     *
                     * This path is different from the other callers of
                     * vacuum_page, because we have already incremented
                     * the vacpage's offsets_used field to account for the
                     * tuple(s) we expect to move onto the page.  Therefore
                     * vacuum_page's check for offsets_used == 0 is wrong.
                     * But since that's a good debugging check for all
                     * other callers, we work around it here rather than
                     * remove it.
                     */
                    if (!PageIsEmpty(ToPage) && vtmove[ti].cleanVpd)
                    {
                        int         sv_offsets_used = destvacpage->offsets_used;

                        destvacpage->offsets_used = 0;
                        vacuum_page(onerel, cur_buffer, destvacpage);
                        destvacpage->offsets_used = sv_offsets_used;
                    }
                    /*
                     * Update the state of the copied tuple, and store it
                     * on the destination page.
                     */
                    newtup.t_data->t_infomask &=
                        ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_OFF);
                    newtup.t_data->t_infomask |= HEAP_MOVED_IN;
                    HeapTupleHeaderSetXvac(newtup.t_data, myXID);
                    newoff = PageAddItem(ToPage, (Item) newtup.t_data, tuple_len,
                                         InvalidOffsetNumber, LP_USED);
                    if (newoff == InvalidOffsetNumber)
                    {
                        elog(PANIC, "moving chain: failed to add item with len = %lu to page %u",
                             (unsigned long) tuple_len, destvacpage->blkno);
                    }
                    newitemid = PageGetItemId(ToPage, newoff);
                    pfree(newtup.t_data);
                    newtup.t_datamcxt = NULL;
                    newtup.t_data = (HeapTupleHeader) PageGetItem(ToPage, newitemid);
                    ItemPointerSet(&(newtup.t_self), destvacpage->blkno, newoff);

                    /* XLOG stuff */
                    if (!onerel->rd_istemp)
                    {
                        XLogRecPtr  recptr =
                        log_heap_move(onerel, Cbuf, tuple.t_self,
                                      cur_buffer, &newtup);

                        if (Cbuf != cur_buffer)
                        {
                            PageSetLSN(Cpage, recptr);
                            PageSetSUI(Cpage, ThisStartUpID);
                        }
                        PageSetLSN(ToPage, recptr);
                        PageSetSUI(ToPage, ThisStartUpID);
                    }
                    else
                    {
                        /*
                         * No XLOG record, but still need to flag that XID
                         * exists on disk
                         */
                        MyXactMadeTempRelUpdate = true;
                    }

                    END_CRIT_SECTION();
                    if (destvacpage->blkno > last_move_dest_block)
                        last_move_dest_block = destvacpage->blkno;

                    /*
                     * Set the new tuple's t_ctid pointing to itself for the
                     * last tuple in the chain, and to the next tuple in the
                     * chain otherwise.
                     */
                    if (!ItemPointerIsValid(&Ctid))
                        newtup.t_data->t_ctid = newtup.t_self;
                    else
                        newtup.t_data->t_ctid = Ctid;
                    Ctid = newtup.t_self;
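                    /*
                     * (The vtmove list was built from the end of the chain
                     * backwards, so on each iteration Ctid still holds the
                     * TID of the chain member moved just before this one.)
                     */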
                    num_moved++;

                    /*
                     * Remember that we moved tuple from the current page
                     * (corresponding index tuple will be cleaned).
                     */
                    if (Cbuf == buf)
                        vacpage->offsets[vacpage->offsets_free++] =
                            ItemPointerGetOffsetNumber(&(tuple.t_self));
                    else
                        keep_tuples++;

                    LockBuffer(cur_buffer, BUFFER_LOCK_UNLOCK);
                    if (cur_buffer != Cbuf)
                        LockBuffer(Cbuf, BUFFER_LOCK_UNLOCK);

                    /* Create index entries for the moved tuple */
                    if (resultRelInfo->ri_NumIndices > 0)
                    {
                        ExecStoreTuple(&newtup, slot, InvalidBuffer, false);
                        ExecInsertIndexTuples(slot, &(newtup.t_self),
                                              estate, true);
                    }

                    WriteBuffer(cur_buffer);
                    WriteBuffer(Cbuf);
                }               /* end of move-the-chain loop */

                cur_buffer = InvalidBuffer;
                pfree(vtmove);
                chain_tuple_moved = true;

                continue;
            }                   /* end of move-tuple-chain path */
            /* try to find new page for this tuple */
            if (cur_buffer == InvalidBuffer ||
                !enough_space(cur_page, tuple_len))
            {
                if (cur_buffer != InvalidBuffer)
                {
                    WriteBuffer(cur_buffer);
                    cur_buffer = InvalidBuffer;
                }
                for (i = 0; i < num_fraged_pages; i++)
                {
                    if (enough_space(fraged_pages->pagedesc[i], tuple_len))
                        break;
                }
                if (i == num_fraged_pages)
                    break;      /* can't move item anywhere */
                cur_item = i;
                cur_page = fraged_pages->pagedesc[cur_item];
                cur_buffer = ReadBuffer(onerel, cur_page->blkno);
                LockBuffer(cur_buffer, BUFFER_LOCK_EXCLUSIVE);
                ToPage = BufferGetPage(cur_buffer);
                /* if this page was not used before - clean it */
                if (!PageIsEmpty(ToPage) && cur_page->offsets_used == 0)
                    vacuum_page(onerel, cur_buffer, cur_page);
            }
            else
                LockBuffer(cur_buffer, BUFFER_LOCK_EXCLUSIVE);

            LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

            /* copy tuple */
            heap_copytuple_with_tuple(&tuple, &newtup);

            /*
             * register invalidation of source tuple in catcaches.
             *
             * (Note: we do not need to register the copied tuple, because we
             * are not changing the tuple contents and so there cannot be
             * any need to flush negative catcache entries.)
             */
            CacheInvalidateHeapTuple(onerel, &tuple);

            /* NO ELOG(ERROR) TILL CHANGES ARE LOGGED */
            START_CRIT_SECTION();
            /*
             * Mark new tuple as moved_in by vacuum and store vacuum XID
             * in its header.
             */
            newtup.t_data->t_infomask &=
                ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_OFF);
            newtup.t_data->t_infomask |= HEAP_MOVED_IN;
            HeapTupleHeaderSetXvac(newtup.t_data, myXID);

            /* add tuple to the page */
            newoff = PageAddItem(ToPage, (Item) newtup.t_data, tuple_len,
                                 InvalidOffsetNumber, LP_USED);
            if (newoff == InvalidOffsetNumber)
            {
                elog(PANIC, "failed to add item with len = %lu to page %u (free space %lu, nusd %u, noff %u)",
                     (unsigned long) tuple_len,
                     cur_page->blkno, (unsigned long) cur_page->free,
                     cur_page->offsets_used, cur_page->offsets_free);
            }
            newitemid = PageGetItemId(ToPage, newoff);
            pfree(newtup.t_data);
            newtup.t_datamcxt = NULL;
            newtup.t_data = (HeapTupleHeader) PageGetItem(ToPage, newitemid);
            ItemPointerSet(&(newtup.t_data->t_ctid), cur_page->blkno, newoff);
            newtup.t_self = newtup.t_data->t_ctid;

            /*
             * Mark old tuple as moved_off by vacuum and store vacuum XID
             * in its header.
             */
            tuple.t_data->t_infomask &=
                ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_IN);
            tuple.t_data->t_infomask |= HEAP_MOVED_OFF;
            HeapTupleHeaderSetXvac(tuple.t_data, myXID);

            /* XLOG stuff */
            if (!onerel->rd_istemp)
            {
                XLogRecPtr  recptr =
                log_heap_move(onerel, buf, tuple.t_self,
                              cur_buffer, &newtup);

                PageSetLSN(page, recptr);
                PageSetSUI(page, ThisStartUpID);
                PageSetLSN(ToPage, recptr);
                PageSetSUI(ToPage, ThisStartUpID);
            }
            else
            {
                /*
                 * No XLOG record, but still need to flag that XID exists
                 * on disk
                 */
                MyXactMadeTempRelUpdate = true;
            }

            END_CRIT_SECTION();
            cur_page->offsets_used++;
            num_moved++;
            cur_page->free = ((PageHeader) ToPage)->pd_upper - ((PageHeader) ToPage)->pd_lower;
            if (cur_page->blkno > last_move_dest_block)
                last_move_dest_block = cur_page->blkno;

            vacpage->offsets[vacpage->offsets_free++] = offnum;

            LockBuffer(cur_buffer, BUFFER_LOCK_UNLOCK);
            LockBuffer(buf, BUFFER_LOCK_UNLOCK);

            /* insert index tuples if needed */
            if (resultRelInfo->ri_NumIndices > 0)
            {
                ExecStoreTuple(&newtup, slot, InvalidBuffer, false);
                ExecInsertIndexTuples(slot, &(newtup.t_self), estate, true);
            }
        }                       /* walk along page */
        if (offnum < maxoff && keep_tuples > 0)
        {
            OffsetNumber off;

            for (off = OffsetNumberNext(offnum);
                 off <= maxoff;
                 off = OffsetNumberNext(off))
            {
                itemid = PageGetItemId(page, off);
                if (!ItemIdIsUsed(itemid))
                    continue;
                tuple.t_datamcxt = NULL;
                tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
                if (tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED)
                    continue;
                if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
                    elog(ERROR, "HEAP_MOVED_IN was not expected (2)");
                if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
                {
                    if (HeapTupleHeaderGetXvac(tuple.t_data) != myXID)
                        elog(ERROR, "Invalid XVAC in tuple header (4)");
                    if (chain_tuple_moved)      /* some chains were moved
                                                 * while cleaning this page */
                    {
                        Assert(vacpage->offsets_free > 0);
                        for (i = 0; i < vacpage->offsets_free; i++)
                        {
                            if (vacpage->offsets[i] == off)
                                break;
                        }
                        if (i >= vacpage->offsets_free)     /* not found */
                        {
                            vacpage->offsets[vacpage->offsets_free++] = off;
                            Assert(keep_tuples > 0);
                            keep_tuples--;
                        }
                    }
                    else
                    {
                        vacpage->offsets[vacpage->offsets_free++] = off;
                        Assert(keep_tuples > 0);
                        keep_tuples--;
                    }
                }
                else
                    elog(ERROR, "HEAP_MOVED_OFF was expected (2)");
            }
        }
        if (vacpage->offsets_free > 0)  /* some tuples were moved */
        {
            if (chain_tuple_moved)      /* else - they are ordered */
                qsort((char *) (vacpage->offsets), vacpage->offsets_free,
                      sizeof(OffsetNumber), vac_cmp_offno);
            vpage_insert(&Nvacpagelist, copy_vac_page(vacpage));
            WriteBuffer(buf);
        }
        else if (dowrite)
            WriteBuffer(buf);
        else
            ReleaseBuffer(buf);

        if (offnum <= maxoff)
            break;              /* some item(s) left */

    }                           /* walk along relation */

    blkno++;                    /* new number of blocks */

    if (cur_buffer != InvalidBuffer)
    {
        Assert(num_moved > 0);
        WriteBuffer(cur_buffer);
    }

    if (num_moved > 0)
    {
        /*
         * We have to commit our tuple movings before we truncate the
         * relation.  Ideally we should do Commit/StartTransactionCommand
         * here, relying on the session-level table lock to protect our
         * exclusive access to the relation.  However, that would require
         * a lot of extra code to close and re-open the relation, indexes,
         * etc.  For now, a quick hack: record status of current
         * transaction as committed, and continue.
         */
        RecordTransactionCommit();
    }
	/*
	 * We are not going to move any more tuples across pages, but we still
	 * need to apply vacuum_page to compact free space in the remaining
	 * pages in vacuum_pages list.  Note that some of these pages may also
	 * be in the fraged_pages list, and may have had tuples moved onto
	 * them; if so, we already did vacuum_page and needn't do it again.
	 */
	for (i = 0, curpage = vacuum_pages->pagedesc;
		 i < vacuumed_pages;
		 i++, curpage++)
	{
		CHECK_FOR_INTERRUPTS();
		Assert((*curpage)->blkno < blkno);
		if ((*curpage)->offsets_used == 0)
		{
			/* this page was not used as a move target, so must clean it */
			buf = ReadBuffer(onerel, (*curpage)->blkno);
			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
			page = BufferGetPage(buf);
			if (!PageIsEmpty(page))
				vacuum_page(onerel, buf, *curpage);
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			WriteBuffer(buf);
		}
	}
	/*
	 * Now scan all the pages that we moved tuples onto and update tuple
	 * status bits.  This is not really necessary, but will save time for
	 * future transactions examining these tuples.
	 *
	 * XXX WARNING: this code fails to clear HEAP_MOVED_OFF tuples from
	 * pages that were move source pages but not move dest pages.  One
	 * also wonders whether it wouldn't be better to skip this step and
	 * let the tuple status updates happen someplace that's not holding an
	 * exclusive lock on the relation.
	 */
	checked_moved = 0;
	for (i = 0, curpage = fraged_pages->pagedesc;
		 i < num_fraged_pages;
		 i++, curpage++)
	{
		CHECK_FOR_INTERRUPTS();
		Assert((*curpage)->blkno < blkno);
		if ((*curpage)->blkno > last_move_dest_block)
			break;				/* no need to scan any further */
		if ((*curpage)->offsets_used == 0)
			continue;			/* this page was never used as a move dest */
		buf = ReadBuffer(onerel, (*curpage)->blkno);
		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
		page = BufferGetPage(buf);
		num_tuples = 0;
		max_offset = PageGetMaxOffsetNumber(page);
		for (newoff = FirstOffsetNumber;
			 newoff <= max_offset;
			 newoff = OffsetNumberNext(newoff))
		{
			itemid = PageGetItemId(page, newoff);
			if (!ItemIdIsUsed(itemid))
				continue;
			tuple.t_datamcxt = NULL;
			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
			{
				if (!(tuple.t_data->t_infomask & HEAP_MOVED))
					elog(ERROR, "HEAP_MOVED_OFF/HEAP_MOVED_IN was expected");
				if (HeapTupleHeaderGetXvac(tuple.t_data) != myXID)
					elog(ERROR, "Invalid XVAC in tuple header (2)");
				if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
				{
					tuple.t_data->t_infomask |= HEAP_XMIN_COMMITTED;
					tuple.t_data->t_infomask &= ~HEAP_MOVED;
					num_tuples++;
				}
				else
					tuple.t_data->t_infomask |= HEAP_XMIN_INVALID;
			}
		}
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		WriteBuffer(buf);
		Assert((*curpage)->offsets_used == num_tuples);
		checked_moved += num_tuples;
	}
	Assert(num_moved == checked_moved);
	elog(elevel, "Rel %s: Pages: %u --> %u; Tuple(s) moved: %u.\n\t%s",
		 RelationGetRelationName(onerel),
		 nblocks, blkno, num_moved,
		 vac_show_rusage(&ru0));

	/*
	 * Reflect the motion of system tuples to catalog cache here.
	 */
	CommandCounterIncrement();
	if (Nvacpagelist.num_pages > 0)
	{
		/* vacuum indexes again if needed */
		if (Irel != (Relation *) NULL)
		{
			VacPage    *vpleft,
					   *vpright,
						vpsave;

			/* re-sort Nvacpagelist.pagedesc */
			for (vpleft = Nvacpagelist.pagedesc,
				 vpright = Nvacpagelist.pagedesc + Nvacpagelist.num_pages - 1;
				 vpleft < vpright; vpleft++, vpright--)
			{
				vpsave = *vpleft;
				*vpleft = *vpright;
				*vpright = vpsave;
			}

			Assert(keep_tuples >= 0);
			for (i = 0; i < nindexes; i++)
				vacuum_index(&Nvacpagelist, Irel[i],
							 vacrelstats->rel_tuples, keep_tuples);
		}
		/* clean moved tuples from last page in Nvacpagelist list */
		if (vacpage->blkno == (blkno - 1) &&
			vacpage->offsets_free > 0)
		{
			OffsetNumber unbuf[BLCKSZ / sizeof(OffsetNumber)];
			OffsetNumber *unused = unbuf;
			int			uncnt;

			buf = ReadBuffer(onerel, vacpage->blkno);
			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
			page = BufferGetPage(buf);
			num_tuples = 0;
			maxoff = PageGetMaxOffsetNumber(page);
			for (offnum = FirstOffsetNumber;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				itemid = PageGetItemId(page, offnum);
				if (!ItemIdIsUsed(itemid))
					continue;
				tuple.t_datamcxt = NULL;
				tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);

				if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
				{
					if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
					{
						if (HeapTupleHeaderGetXvac(tuple.t_data) != myXID)
							elog(ERROR, "Invalid XVAC in tuple header (3)");
						itemid->lp_flags &= ~LP_USED;
						num_tuples++;
					}
					else
						elog(ERROR, "HEAP_MOVED_OFF was expected (3)");
				}
			}
			Assert(vacpage->offsets_free == num_tuples);
			START_CRIT_SECTION();

			uncnt = PageRepairFragmentation(page, unused);

			/* XLOG stuff */
			if (!onerel->rd_istemp)
			{
				XLogRecPtr	recptr;

				recptr = log_heap_clean(onerel, buf, (char *) unused,
							  (char *) (&(unused[uncnt])) - (char *) unused);
				PageSetLSN(page, recptr);
				PageSetSUI(page, ThisStartUpID);
			}
			else
			{
				/*
				 * No XLOG record, but still need to flag that XID exists
				 * on disk
				 */
				MyXactMadeTempRelUpdate = true;
			}

			END_CRIT_SECTION();

			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			WriteBuffer(buf);
		}

		/* now - free the new list of reaped pages */
		curpage = Nvacpagelist.pagedesc;
		for (i = 0; i < Nvacpagelist.num_pages; i++, curpage++)
			pfree(*curpage);
		pfree(Nvacpagelist.pagedesc);
	}
	/*
	 * Flush dirty pages out to disk.  We do this unconditionally, even if
	 * we don't need to truncate, because we want to ensure that all
	 * tuples have correct on-row commit status on disk (see bufmgr.c's
	 * comments for FlushRelationBuffers()).
	 */
	i = FlushRelationBuffers(onerel, blkno);
	if (i < 0)
		elog(ERROR, "VACUUM (repair_frag): FlushRelationBuffers returned %d",
			 i);

	/* truncate relation, if needed */
	if (blkno < nblocks)
	{
		blkno = smgrtruncate(DEFAULT_SMGR, onerel, blkno);
		onerel->rd_nblocks = blkno;		/* update relcache immediately */
		onerel->rd_targblock = InvalidBlockNumber;
		vacrelstats->rel_pages = blkno; /* set new number of blocks */
	}
	/* clean up */
	if (vacrelstats->vtlinks != NULL)
		pfree(vacrelstats->vtlinks);

	ExecDropTupleTable(tupleTable, true);

	ExecCloseIndices(resultRelInfo);
}
/*
 *	vacuum_heap() -- free dead tuples
 *
 *		This routine marks dead tuples as unused and truncates relation
 *		if there are "empty" end-blocks.
 */
static void
vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages)
{
	Buffer		buf;
	VacPage    *vacpage;
	BlockNumber relblocks;
	int			nblocks;
	int			i;

	nblocks = vacuum_pages->num_pages;
	nblocks -= vacuum_pages->empty_end_pages;	/* nothing to do with them */

	for (i = 0, vacpage = vacuum_pages->pagedesc; i < nblocks; i++, vacpage++)
	{
		CHECK_FOR_INTERRUPTS();
		if ((*vacpage)->offsets_free > 0)
		{
			buf = ReadBuffer(onerel, (*vacpage)->blkno);
			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
			vacuum_page(onerel, buf, *vacpage);
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			WriteBuffer(buf);
		}
	}
	/*
	 * Flush dirty pages out to disk.  We do this unconditionally, even if
	 * we don't need to truncate, because we want to ensure that all
	 * tuples have correct on-row commit status on disk (see bufmgr.c's
	 * comments for FlushRelationBuffers()).
	 */
	Assert(vacrelstats->rel_pages >= vacuum_pages->empty_end_pages);
	relblocks = vacrelstats->rel_pages - vacuum_pages->empty_end_pages;

	i = FlushRelationBuffers(onerel, relblocks);
	if (i < 0)
		elog(ERROR, "VACUUM (vacuum_heap): FlushRelationBuffers returned %d",
			 i);

	/* truncate relation if there are some empty end-pages */
	if (vacuum_pages->empty_end_pages > 0)
	{
		elog(elevel, "Rel %s: Pages: %u --> %u.",
			 RelationGetRelationName(onerel),
			 vacrelstats->rel_pages, relblocks);
		relblocks = smgrtruncate(DEFAULT_SMGR, onerel, relblocks);
		onerel->rd_nblocks = relblocks; /* update relcache immediately */
		onerel->rd_targblock = InvalidBlockNumber;
		vacrelstats->rel_pages = relblocks;		/* set new number of pages */
	}
}
/*
 *	vacuum_page() -- free dead tuples on a page
 *					 and repair its fragmentation.
 */
static void
vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage)
{
	OffsetNumber unbuf[BLCKSZ / sizeof(OffsetNumber)];
	OffsetNumber *unused = unbuf;
	int			uncnt;
	Page		page = BufferGetPage(buffer);
	ItemId		itemid;
	int			i;

	/* There shouldn't be any tuples moved onto the page yet! */
	Assert(vacpage->offsets_used == 0);

	START_CRIT_SECTION();

	for (i = 0; i < vacpage->offsets_free; i++)
	{
		itemid = PageGetItemId(page, vacpage->offsets[i]);
		itemid->lp_flags &= ~LP_USED;
	}

	uncnt = PageRepairFragmentation(page, unused);

	/* XLOG stuff */
	if (!onerel->rd_istemp)
	{
		XLogRecPtr	recptr;

		recptr = log_heap_clean(onerel, buffer, (char *) unused,
							  (char *) (&(unused[uncnt])) - (char *) unused);
		PageSetLSN(page, recptr);
		PageSetSUI(page, ThisStartUpID);
	}
	else
	{
		/* No XLOG record, but still need to flag that XID exists on disk */
		MyXactMadeTempRelUpdate = true;
	}

	END_CRIT_SECTION();
}
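
/*
 * For reference, a sketch (illustrative only, not part of the original
 * file) of the WAL convention that vacuum_page() and repair_frag() both
 * follow: every page change made inside the critical section is either
 * covered by an XLOG record whose LSN is put back on the page, or, for
 * temp relations, at least flags MyXactMadeTempRelUpdate so the XID is
 * known to have effects on disk.  modify_page() and log_the_change()
 * are hypothetical stand-ins.
 */
#if 0
	START_CRIT_SECTION();
	modify_page(page);			/* hypothetical page edit */
	if (!rel->rd_istemp)
	{
		XLogRecPtr	recptr = log_the_change(rel, buf);	/* hypothetical */

		PageSetLSN(page, recptr);
		PageSetSUI(page, ThisStartUpID);
	}
	else
		MyXactMadeTempRelUpdate = true;
	END_CRIT_SECTION();
#endif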
/*
 *	scan_index() -- scan one index relation to update statistics.
 *
 * We use this when we have no deletions to do.
 */
static void
scan_index(Relation indrel, double num_tuples)
{
	IndexBulkDeleteResult *stats;
	VacRUsage	ru0;

	vac_init_rusage(&ru0);

	/*
	 * Even though we're not planning to delete anything, use the
	 * ambulkdelete call, so that the scan happens within the index AM for
	 * more speed.
	 */
	stats = index_bulk_delete(indrel, dummy_tid_reaped, NULL);

	if (!stats)
		return;

	/* now update statistics in pg_class */
	vac_update_relstats(RelationGetRelid(indrel),
						stats->num_pages, stats->num_index_tuples,
						false);

	elog(elevel, "Index %s: Pages %u; Tuples %.0f.\n\t%s",
		 RelationGetRelationName(indrel),
		 stats->num_pages, stats->num_index_tuples,
		 vac_show_rusage(&ru0));

	/*
	 * Check for tuple count mismatch.  If the index is partial, then it's
	 * OK for it to have fewer tuples than the heap; else we got trouble.
	 */
	if (stats->num_index_tuples != num_tuples)
	{
		if (stats->num_index_tuples > num_tuples ||
			!vac_is_partial_index(indrel))
			elog(WARNING, "Index %s: NUMBER OF INDEX TUPLES (%.0f) IS NOT THE SAME AS HEAP (%.0f)."
				 "\n\tRecreate the index.",
				 RelationGetRelationName(indrel),
				 stats->num_index_tuples, num_tuples);
	}
}
/*
 *	vacuum_index() -- vacuum one index relation.
 *
 *		Vpl is the VacPageList of the heap we're currently vacuuming.
 *		It's locked.  Indrel is an index relation on the vacuumed heap.
 *
 *		We don't bother to set locks on the index relation here, since
 *		the parent table is exclusive-locked already.
 *
 *		Finally, we arrange to update the index relation's statistics in
 *		pg_class.
 */
static void
vacuum_index(VacPageList vacpagelist, Relation indrel,
			 double num_tuples, int keep_tuples)
{
	IndexBulkDeleteResult *stats;
	VacRUsage	ru0;

	vac_init_rusage(&ru0);

	/* Do bulk deletion */
	stats = index_bulk_delete(indrel, tid_reaped, (void *) vacpagelist);

	if (!stats)
		return;

	/* now update statistics in pg_class */
	vac_update_relstats(RelationGetRelid(indrel),
						stats->num_pages, stats->num_index_tuples,
						false);

	elog(elevel, "Index %s: Pages %u; Tuples %.0f: Deleted %.0f.\n\t%s",
		 RelationGetRelationName(indrel), stats->num_pages,
		 stats->num_index_tuples - keep_tuples, stats->tuples_removed,
		 vac_show_rusage(&ru0));

	/*
	 * Check for tuple count mismatch.  If the index is partial, then it's
	 * OK for it to have fewer tuples than the heap; else we got trouble.
	 */
	if (stats->num_index_tuples != num_tuples + keep_tuples)
	{
		if (stats->num_index_tuples > num_tuples + keep_tuples ||
			!vac_is_partial_index(indrel))
			elog(WARNING, "Index %s: NUMBER OF INDEX TUPLES (%.0f) IS NOT THE SAME AS HEAP (%.0f)."
				 "\n\tRecreate the index.",
				 RelationGetRelationName(indrel),
				 stats->num_index_tuples, num_tuples);
	}
}
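
/*
 * For reference: an IndexBulkDeleteCallback just answers "should index
 * entries pointing at this heap TID be removed?".  A minimal hypothetical
 * callback (illustrative only, not part of the original file) that reaps
 * a single known TID:
 */
#if 0
static bool
one_tid_reaped(ItemPointer itemptr, void *state)
{
	return ItemPointerEquals(itemptr, (ItemPointer) state);
}
/* usage: stats = index_bulk_delete(indrel, one_tid_reaped, &some_tid); */
#endif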
/*
 *	tid_reaped() -- is a particular tid reaped?
 *
 *		This has the right signature to be an IndexBulkDeleteCallback.
 *
 *		vacpagelist->VacPage_array is sorted in the right order.
 */
static bool
tid_reaped(ItemPointer itemptr, void *state)
{
	VacPageList vacpagelist = (VacPageList) state;
	OffsetNumber ioffno;
	OffsetNumber *voff;
	VacPage		vp,
			   *vpp;
	VacPageData vacpage;

	vacpage.blkno = ItemPointerGetBlockNumber(itemptr);
	ioffno = ItemPointerGetOffsetNumber(itemptr);

	vp = &vacpage;
	vpp = (VacPage *) vac_bsearch((void *) &vp,
								  (void *) (vacpagelist->pagedesc),
								  vacpagelist->num_pages,
								  sizeof(VacPage),
								  vac_cmp_blk);

	if (vpp == NULL)
		return false;

	/* ok - we are on a partially or fully reaped page */
	vp = *vpp;

	if (vp->offsets_free == 0)
	{
		/* this is EmptyPage, so claim all tuples on it are reaped!!! */
		return true;
	}

	voff = (OffsetNumber *) vac_bsearch((void *) &ioffno,
										(void *) (vp->offsets),
										vp->offsets_free,
										sizeof(OffsetNumber),
										vac_cmp_offno);

	if (voff == NULL)
		return false;

	/* tid is reaped */
	return true;
}
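
/*
 * Illustrative only (not part of the original file): how a TID decomposes
 * for the two probes above.  The block number selects a VacPage via
 * vac_cmp_blk; the offset is then sought in that page's sorted offsets[]
 * array via vac_cmp_offno.
 */
#if 0
	ItemPointerData tid;

	ItemPointerSet(&tid, 42, 7);	/* block 42, line pointer 7 */
	/* tid_reaped(&tid, vacuum_pages) is true iff block 42 has a VacPage
	 * entry and either offsets_free == 0 (whole page reaped) or offset 7
	 * appears in that page's offsets[] array. */
#endif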
/*
 *	Dummy version for scan_index.
 */
static bool
dummy_tid_reaped(ItemPointer itemptr, void *state)
{
	return false;
}
/*
 * Update the shared Free Space Map with the info we now have about
 * free space in the relation, discarding any old info the map may have.
 */
static void
vac_update_fsm(Relation onerel, VacPageList fraged_pages,
			   BlockNumber rel_pages)
{
	int			nPages = fraged_pages->num_pages;
	int			i;
	BlockNumber *pages;
	Size	   *spaceAvail;

	/* +1 to avoid palloc(0) */
	pages = (BlockNumber *) palloc((nPages + 1) * sizeof(BlockNumber));
	spaceAvail = (Size *) palloc((nPages + 1) * sizeof(Size));

	for (i = 0; i < nPages; i++)
	{
		pages[i] = fraged_pages->pagedesc[i]->blkno;
		spaceAvail[i] = fraged_pages->pagedesc[i]->free;

		/*
		 * fraged_pages may contain entries for pages that we later
		 * decided to truncate from the relation; don't enter them into
		 * the free space map!
		 */
		if (pages[i] >= rel_pages)
		{
			nPages = i;
			break;
		}
	}

	MultiRecordFreeSpace(&onerel->rd_node,
						 0, MaxBlockNumber,
						 nPages, pages, spaceAvail);

	pfree(pages);
	pfree(spaceAvail);
}
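
/*
 * Illustrative only (hypothetical numbers): with rel_pages = 100 and
 * fraged_pages listing blocks 17, 60, 120, the loop above stops at 120
 * and nPages drops to 2, so the FSM learns about blocks 17 and 60 only;
 * block 120 no longer exists after truncation.
 */
#if 0
	BlockNumber example_pages[] = {17, 60, 120};	/* 120 >= rel_pages */
	/* after the loop: nPages == 2 */
#endif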
/* Copy a VacPage structure */
static VacPage
copy_vac_page(VacPage vacpage)
{
	VacPage		newvacpage;

	/* allocate a VacPageData entry */
	newvacpage = (VacPage) palloc(sizeof(VacPageData) +
							   vacpage->offsets_free * sizeof(OffsetNumber));

	/* fill it in */
	if (vacpage->offsets_free > 0)
		memcpy(newvacpage->offsets, vacpage->offsets,
			   vacpage->offsets_free * sizeof(OffsetNumber));
	newvacpage->blkno = vacpage->blkno;
	newvacpage->free = vacpage->free;
	newvacpage->offsets_used = vacpage->offsets_used;
	newvacpage->offsets_free = vacpage->offsets_free;

	return newvacpage;
}
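
/*
 * Note on the allocation above: since VacPageData declares offsets[1],
 * adding offsets_free full elements over-allocates by one OffsetNumber.
 * That is harmless; an exact-size variant (hypothetical alternative, not
 * what this file does) would use offsetof:
 */
#if 0
#include <stddef.h>

	newvacpage = (VacPage) palloc(offsetof(VacPageData, offsets) +
							  vacpage->offsets_free * sizeof(OffsetNumber));
#endif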
/*
 * Add a VacPage pointer to a VacPageList.
 *
 *		As a side effect of the way that scan_heap works,
 *		higher pages come after lower pages in the array
 *		(and highest tid on a page is last).
 */
static void
vpage_insert(VacPageList vacpagelist, VacPage vpnew)
{
#define PG_NPAGEDESC 1024

	/* allocate a VacPage entry if needed */
	if (vacpagelist->num_pages == 0)
	{
		vacpagelist->pagedesc = (VacPage *) palloc(PG_NPAGEDESC * sizeof(VacPage));
		vacpagelist->num_allocated_pages = PG_NPAGEDESC;
	}
	else if (vacpagelist->num_pages >= vacpagelist->num_allocated_pages)
	{
		vacpagelist->num_allocated_pages *= 2;
		vacpagelist->pagedesc = (VacPage *) repalloc(vacpagelist->pagedesc,
					vacpagelist->num_allocated_pages * sizeof(VacPage));
	}
	vacpagelist->pagedesc[vacpagelist->num_pages] = vpnew;
	(vacpagelist->num_pages)++;
}
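
/*
 * Illustrative usage (hypothetical list, not part of the original file):
 * the array starts at PG_NPAGEDESC slots and doubles thereafter, so n
 * inserts cost only O(log n) repallocs.
 */
#if 0
	VacPageListData mylist = {0, 0, 0, NULL};

	vpage_insert(&mylist, copy_vac_page(vacpage));
	/* mylist.num_allocated_pages is now PG_NPAGEDESC */
#endif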
/*
 * vac_bsearch: just like standard C library routine bsearch(),
 * except that we first test to see whether the target key is outside
 * the range of the table entries.  This case is handled relatively slowly
 * by the normal binary search algorithm (ie, no faster than any other key)
 * but it occurs often enough in VACUUM to be worth optimizing.
 */
static void *
vac_bsearch(const void *key, const void *base,
			size_t nelem, size_t size,
			int (*compar) (const void *, const void *))
{
	int			res;
	const void *last;

	if (nelem == 0)
		return NULL;
	res = compar(key, base);
	if (res < 0)
		return NULL;
	if (res == 0)
		return (void *) base;
	if (nelem > 1)
	{
		last = (const void *) ((const char *) base + (nelem - 1) * size);
		res = compar(key, last);
		if (res > 0)
			return NULL;
		if (res == 0)
			return (void *) last;
	}
	if (nelem <= 2)
		return NULL;			/* already checked 'em all */
	return bsearch(key, base, nelem, size, compar);
}
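
/*
 * Illustrative usage (not part of the original file): probing a sorted
 * OffsetNumber array.  Keys equal to the first or last element return
 * before bsearch() is ever called, which is the common case in VACUUM.
 */
#if 0
	OffsetNumber offs[] = {1, 4, 9, 16};
	OffsetNumber key = 16;
	OffsetNumber *hit = (OffsetNumber *)
		vac_bsearch(&key, offs, 4, sizeof(OffsetNumber), vac_cmp_offno);

	/* hit == &offs[3], found by the cheap last-element test */
#endif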
/*
 * Comparator routines for use with qsort() and bsearch().
 */
static int
vac_cmp_blk(const void *left, const void *right)
{
	BlockNumber lblk,
				rblk;

	lblk = (*((VacPage *) left))->blkno;
	rblk = (*((VacPage *) right))->blkno;

	if (lblk < rblk)
		return -1;
	if (lblk == rblk)
		return 0;
	return 1;
}

static int
vac_cmp_offno(const void *left, const void *right)
{
	if (*(OffsetNumber *) left < *(OffsetNumber *) right)
		return -1;
	if (*(OffsetNumber *) left == *(OffsetNumber *) right)
		return 0;
	return 1;
}
static int
vac_cmp_vtlinks(const void *left, const void *right)
{
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi <
		((VTupleLink) right)->new_tid.ip_blkid.bi_hi)
		return -1;
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi >
		((VTupleLink) right)->new_tid.ip_blkid.bi_hi)
		return 1;
	/* bi_hi-es are equal */
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo <
		((VTupleLink) right)->new_tid.ip_blkid.bi_lo)
		return -1;
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo >
		((VTupleLink) right)->new_tid.ip_blkid.bi_lo)
		return 1;
	/* bi_lo-es are equal */
	if (((VTupleLink) left)->new_tid.ip_posid <
		((VTupleLink) right)->new_tid.ip_posid)
		return -1;
	if (((VTupleLink) left)->new_tid.ip_posid >
		((VTupleLink) right)->new_tid.ip_posid)
		return 1;
	return 0;
}
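
/*
 * The cascade above is simply a lexicographic comparison of
 * (bi_hi, bi_lo, ip_posid), i.e. of the whole TID.  An equivalent
 * hypothetical formulation (illustrative only) using the BlockId
 * accessor macros:
 */
#if 0
static int
vac_cmp_vtlinks_alt(const void *left, const void *right)
{
	BlockNumber lblk = BlockIdGetBlockNumber(&((VTupleLink) left)->new_tid.ip_blkid);
	BlockNumber rblk = BlockIdGetBlockNumber(&((VTupleLink) right)->new_tid.ip_blkid);
	OffsetNumber lpos = ((VTupleLink) left)->new_tid.ip_posid;
	OffsetNumber rpos = ((VTupleLink) right)->new_tid.ip_posid;

	if (lblk != rblk)
		return (lblk < rblk) ? -1 : 1;
	if (lpos != rpos)
		return (lpos < rpos) ? -1 : 1;
	return 0;
}
#endif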
void
vac_open_indexes(Relation relation, int *nindexes, Relation **Irel)
{
	List	   *indexoidlist,
			   *indexoidscan;
	int			i;

	indexoidlist = RelationGetIndexList(relation);

	*nindexes = length(indexoidlist);

	if (*nindexes > 0)
		*Irel = (Relation *) palloc(*nindexes * sizeof(Relation));
	else
		*Irel = NULL;

	i = 0;
	foreach(indexoidscan, indexoidlist)
	{
		Oid			indexoid = lfirsti(indexoidscan);

		(*Irel)[i] = index_open(indexoid);
		i++;
	}

	freeList(indexoidlist);
}

void
vac_close_indexes(int nindexes, Relation *Irel)
{
	if (Irel == (Relation *) NULL)
		return;

	while (nindexes--)
		index_close(Irel[nindexes]);
	pfree(Irel);
}
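
/*
 * Typical pairing, as used elsewhere in this file: open every index on an
 * already-locked heap, operate, then close.  Sketch only; "rel" is a
 * hypothetical heap relation.
 */
#if 0
	Relation   *Irel;
	int			nindexes;

	vac_open_indexes(rel, &nindexes, &Irel);
	/* ... work with Irel[0 .. nindexes-1] ... */
	vac_close_indexes(nindexes, Irel);
#endif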
/*
 * Is an index partial (ie, could it contain fewer tuples than the heap?)
 */
bool
vac_is_partial_index(Relation indrel)
{
	/*
	 * If the index's AM doesn't support nulls, it's partial for our
	 * purposes.
	 */
	if (!indrel->rd_am->amindexnulls)
		return true;

	/* Otherwise, look to see if there's a partial-index predicate */
	return (VARSIZE(&indrel->rd_index->indpred) > VARHDRSZ);
}
static bool
enough_space(VacPage vacpage, Size len)
{
	len = MAXALIGN(len);

	if (len > vacpage->free)
		return false;

	/* if there are free itemid(s) and len <= free_space... */
	if (vacpage->offsets_used < vacpage->offsets_free)
		return true;

	/* noff_used >= noff_free and so we'll have to allocate new itemid */
	if (len + sizeof(ItemIdData) <= vacpage->free)
		return true;

	return false;
}
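
/*
 * Worked example (assuming 8-byte MAXALIGN and a 4-byte ItemIdData, which
 * are typical but platform-dependent): for len = 60, MAXALIGN(len) = 64.
 * With no reusable line pointers the tuple needs 64 + 4 = 68 free bytes;
 * with a spare pointer, 64 bytes suffice.
 */
#if 0
	vacpage->free = 66;
	vacpage->offsets_used = 3;
	vacpage->offsets_free = 3;		/* no spare itemids */
	/* enough_space(vacpage, 60) is false: 64 + sizeof(ItemIdData) > 66 */
#endif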
/*
 * Initialize usage snapshot.
 */
void
vac_init_rusage(VacRUsage *ru0)
{
	struct timezone tz;

	getrusage(RUSAGE_SELF, &ru0->ru);
	gettimeofday(&ru0->tv, &tz);
}
/*
 * Compute elapsed time since ru0 usage snapshot, and format into
 * a displayable string.  Result is in a static string, which is
 * tacky, but no one ever claimed that the Postgres backend is
 * threadable...
 */
const char *
vac_show_rusage(VacRUsage *ru0)
{
	static char result[100];
	VacRUsage	ru1;

	vac_init_rusage(&ru1);

	if (ru1.tv.tv_usec < ru0->tv.tv_usec)
	{
		ru1.tv.tv_sec--;
		ru1.tv.tv_usec += 1000000;
	}
	if (ru1.ru.ru_stime.tv_usec < ru0->ru.ru_stime.tv_usec)
	{
		ru1.ru.ru_stime.tv_sec--;
		ru1.ru.ru_stime.tv_usec += 1000000;
	}
	if (ru1.ru.ru_utime.tv_usec < ru0->ru.ru_utime.tv_usec)
	{
		ru1.ru.ru_utime.tv_sec--;
		ru1.ru.ru_utime.tv_usec += 1000000;
	}

	snprintf(result, sizeof(result),
			 "CPU %d.%02ds/%d.%02du sec elapsed %d.%02d sec.",
			 (int) (ru1.ru.ru_stime.tv_sec - ru0->ru.ru_stime.tv_sec),
			 (int) (ru1.ru.ru_stime.tv_usec - ru0->ru.ru_stime.tv_usec) / 10000,
			 (int) (ru1.ru.ru_utime.tv_sec - ru0->ru.ru_utime.tv_sec),
			 (int) (ru1.ru.ru_utime.tv_usec - ru0->ru.ru_utime.tv_usec) / 10000,
			 (int) (ru1.tv.tv_sec - ru0->tv.tv_sec),
			 (int) (ru1.tv.tv_usec - ru0->tv.tv_usec) / 10000);

	return result;
}
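
/*
 * Worked example of the borrow logic above (hypothetical snapshots, not
 * part of the original file): with ru0->tv = {3, 900000} and
 * ru1.tv = {5, 200000}, 200000 < 900000 triggers the borrow, giving
 * ru1.tv = {4, 1200000}; the subtraction then yields 1 sec and 300000
 * usec, and usec / 10000 keeps two decimal digits: "elapsed 1.30 sec".
 */
#if 0
	ru0->tv.tv_sec = 3;
	ru0->tv.tv_usec = 900000;
	ru1.tv.tv_sec = 5;
	ru1.tv.tv_usec = 200000;
#endif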