/*-------------------------------------------------------------------------
 *
 * vacuum.c
 *	  The postgres vacuum cleaner.
 *
 * This file includes the "full" version of VACUUM, as well as control code
 * used by all three of full VACUUM, lazy VACUUM, and ANALYZE.  See
 * vacuumlazy.c and analyze.c for the rest of the code for the latter two.
 *
 *
 * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.221 2002/04/02 01:03:05 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/clog.h"
#include "access/genam.h"
#include "access/heapam.h"
#include "access/xlog.h"
#include "catalog/catalog.h"
#include "catalog/catname.h"
#include "catalog/namespace.h"
#include "catalog/pg_database.h"
#include "catalog/pg_index.h"
#include "commands/vacuum.h"
#include "executor/executor.h"
#include "miscadmin.h"
#include "storage/freespace.h"
#include "storage/sinval.h"
#include "storage/smgr.h"
#include "tcop/pquery.h"
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/relcache.h"
#include "utils/syscache.h"
typedef struct VacPageData
{
    BlockNumber blkno;          /* BlockNumber of this Page */
    Size        free;           /* FreeSpace on this Page */
    uint16      offsets_used;   /* Number of OffNums used by vacuum */
    uint16      offsets_free;   /* Number of OffNums free or to be freed */
    OffsetNumber offsets[1];    /* Array of free OffNums */
} VacPageData;

typedef VacPageData *VacPage;

typedef struct VacPageListData
{
    BlockNumber empty_end_pages;    /* Number of "empty" end-pages */
    int         num_pages;          /* Number of pages in pagedesc */
    int         num_allocated_pages;    /* Number of allocated pages in
                                         * pagedesc */
    VacPage    *pagedesc;           /* Descriptions of pages */
} VacPageListData;

typedef VacPageListData *VacPageList;
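#ifdef NOT_USED
/*
 * Illustrative sketch only (not part of the vacuum code path): a VacPage
 * ends in a flexible array, so it is allocated with its offsets[] member
 * sized explicitly, just as scan_heap() and repair_frag() do below with
 * MaxOffsetNumber entries.  The helper name here is made up for the
 * example.
 */
static VacPage
example_alloc_vacpage(BlockNumber blkno, int max_offsets)
{
    VacPage     vp;

    vp = (VacPage) palloc(sizeof(VacPageData) +
                          max_offsets * sizeof(OffsetNumber));
    vp->blkno = blkno;
    vp->free = 0;
    vp->offsets_used = 0;
    vp->offsets_free = 0;       /* entries of offsets[] are filled in later */
    return vp;
}
#endif   /* NOT_USED */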
typedef struct VTupleLinkData
{
    ItemPointerData new_tid;
    ItemPointerData this_tid;
} VTupleLinkData;

typedef VTupleLinkData *VTupleLink;
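#ifdef NOT_USED
/*
 * Illustrative sketch only: the vtlinks array built by scan_heap() is
 * sorted by new_tid (block number first, then offset) so that
 * repair_frag() can binary-search it with vac_bsearch() to find the
 * parent of a given tuple version.  The real comparator is
 * vac_cmp_vtlinks(), later in this file; a minimal comparator with that
 * ordering could look like this (the name is made up for the example):
 */
static int
example_cmp_vtlinks(const void *left, const void *right)
{
    ItemPointer lhs = &((VTupleLink) left)->new_tid;
    ItemPointer rhs = &((VTupleLink) right)->new_tid;

    if (ItemPointerGetBlockNumber(lhs) != ItemPointerGetBlockNumber(rhs))
        return (ItemPointerGetBlockNumber(lhs) <
                ItemPointerGetBlockNumber(rhs)) ? -1 : 1;
    if (ItemPointerGetOffsetNumber(lhs) != ItemPointerGetOffsetNumber(rhs))
        return (ItemPointerGetOffsetNumber(lhs) <
                ItemPointerGetOffsetNumber(rhs)) ? -1 : 1;
    return 0;
}
#endif   /* NOT_USED */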
typedef struct VTupleMoveData
{
    ItemPointerData tid;        /* tuple ID */
    VacPage     vacpage;        /* where to move */
    bool        cleanVpd;       /* clean vacpage before using */
} VTupleMoveData;

typedef VTupleMoveData *VTupleMove;
typedef struct VRelStats
{
    BlockNumber rel_pages;
    double      rel_tuples;
    Size        min_tlen;
    Size        max_tlen;
    bool        hasindex;
    int         num_vtlinks;
    VTupleLink  vtlinks;
} VRelStats;
static MemoryContext vac_context = NULL;

static int  elevel = -1;

static TransactionId OldestXmin;
static TransactionId FreezeLimit;

static TransactionId initialOldestXmin;
static TransactionId initialFreezeLimit;
/* non-export function prototypes */
static void vacuum_init(VacuumStmt *vacstmt);
static void vacuum_shutdown(VacuumStmt *vacstmt);
static List *getrels(const RangeVar *vacrel, const char *stmttype);
static void vac_update_dbstats(Oid dbid,
                               TransactionId vacuumXID,
                               TransactionId frozenXID);
static void vac_truncate_clog(TransactionId vacuumXID,
                              TransactionId frozenXID);
static void vacuum_rel(Oid relid, VacuumStmt *vacstmt, char expected_relkind);
static void full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt);
static void scan_heap(VRelStats *vacrelstats, Relation onerel,
                      VacPageList vacuum_pages, VacPageList fraged_pages);
static void repair_frag(VRelStats *vacrelstats, Relation onerel,
                        VacPageList vacuum_pages, VacPageList fraged_pages,
                        int nindexes, Relation *Irel);
static void vacuum_heap(VRelStats *vacrelstats, Relation onerel,
                        VacPageList vacpagelist);
static void vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage);
static void vacuum_index(VacPageList vacpagelist, Relation indrel,
                         double num_tuples, int keep_tuples);
static void scan_index(Relation indrel, double num_tuples);
static bool tid_reaped(ItemPointer itemptr, void *state);
static bool dummy_tid_reaped(ItemPointer itemptr, void *state);
static void vac_update_fsm(Relation onerel, VacPageList fraged_pages,
                           BlockNumber rel_pages);
static VacPage copy_vac_page(VacPage vacpage);
static void vpage_insert(VacPageList vacpagelist, VacPage vpnew);
static void *vac_bsearch(const void *key, const void *base,
                         size_t nelem, size_t size,
                         int (*compar) (const void *, const void *));
static int  vac_cmp_blk(const void *left, const void *right);
static int  vac_cmp_offno(const void *left, const void *right);
static int  vac_cmp_vtlinks(const void *left, const void *right);
static bool enough_space(VacPage vacpage, Size len);
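#ifdef NOT_USED
/*
 * Illustrative sketch only: the space test used when choosing a move
 * target is that a tuple fits on a page if its MAXALIGN'd length fits in
 * the page's recorded free space, allowing for a new line pointer unless
 * a free one can be recycled.  The real check is enough_space(), defined
 * later in this file; this example (with a made-up name) is a plausible
 * reading of that logic, not a copy of it.
 */
static bool
example_enough_space(VacPage vacpage, Size len)
{
    len = MAXALIGN(len);

    if (len > vacpage->free)
        return false;

    /* if there are free itemid(s), one can be recycled at no extra cost */
    if (vacpage->offsets_used < vacpage->offsets_free)
        return true;

    /* otherwise a new line pointer must be allocated as well */
    return (len + sizeof(ItemIdData) <= vacpage->free);
}
#endif   /* NOT_USED */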
/****************************************************************************
 *                                                                          *
 *          Code common to all flavors of VACUUM and ANALYZE                *
 *                                                                          *
 ****************************************************************************
 */
/*
 * Primary entry point for VACUUM and ANALYZE commands.
 */
void
vacuum(VacuumStmt *vacstmt)
{
    const char *stmttype = vacstmt->vacuum ? "VACUUM" : "ANALYZE";
    List       *vrl,
               *cur;

    if (vacstmt->verbose)
        elevel = INFO;
    else
        elevel = DEBUG1;

    /*
     * We cannot run VACUUM inside a user transaction block; if we were
     * inside a transaction, then our commit- and
     * start-transaction-command calls would not have the intended effect!
     * Furthermore, the forced commit that occurs before truncating the
     * relation's file would have the effect of committing the rest of the
     * user's transaction too, which would certainly not be the desired
     * behavior.
     */
    if (IsTransactionBlock())
        elog(ERROR, "%s cannot run inside a BEGIN/END block", stmttype);

    /*
     * Send info about dead objects to the statistics collector
     */
    pgstat_vacuum_tabstat();

    /*
     * Create special memory context for cross-transaction storage.
     *
     * Since it is a child of QueryContext, it will go away eventually even
     * if we suffer an error; there's no need for special abort cleanup
     * logic.
     */
    vac_context = AllocSetContextCreate(QueryContext,
                                        "Vacuum",
                                        ALLOCSET_DEFAULT_MINSIZE,
                                        ALLOCSET_DEFAULT_INITSIZE,
                                        ALLOCSET_DEFAULT_MAXSIZE);

    /* Build list of relations to process (note this lives in vac_context) */
    vrl = getrels(vacstmt->relation, stmttype);

    /*
     * Start up the vacuum cleaner.
     */
    vacuum_init(vacstmt);

    /*
     * Process each selected relation.  We are careful to process each
     * relation in a separate transaction in order to avoid holding too
     * many locks at one time.  Also, if we are doing VACUUM ANALYZE, the
     * ANALYZE part runs as a separate transaction from the VACUUM to
     * further reduce locking.
     */
    foreach(cur, vrl)
    {
        Oid         relid = (Oid) lfirsti(cur);

        if (vacstmt->vacuum)
            vacuum_rel(relid, vacstmt, RELKIND_RELATION);
        if (vacstmt->analyze)
            analyze_rel(relid, vacstmt);
    }

    /* clean up */
    vacuum_shutdown(vacstmt);
}
/*
 * vacuum_init(), vacuum_shutdown() -- start up and shut down the vacuum cleaner.
 *
 * Formerly, there was code here to prevent more than one VACUUM from
 * executing concurrently in the same database.  However, there's no
 * good reason to prevent that, and manually removing lockfiles after
 * a vacuum crash was a pain for dbadmins.  So, forget about lockfiles,
 * and just rely on the locks we grab on each target table
 * to ensure that there aren't two VACUUMs running on the same table
 * at the same time.
 *
 * The strangeness with committing and starting transactions in the
 * init and shutdown routines is due to the fact that the vacuum cleaner
 * is invoked via an SQL command, and so is already executing inside
 * a transaction.  We need to leave ourselves in a predictable state
 * on entry and exit to the vacuum cleaner.  We commit the transaction
 * started in PostgresMain() inside vacuum_init(), and start one in
 * vacuum_shutdown() to match the commit waiting for us back in
 * PostgresMain().
 */
static void
vacuum_init(VacuumStmt *vacstmt)
{
    if (vacstmt->vacuum && vacstmt->relation == NULL)
    {
        /*
         * Compute the initially applicable OldestXmin and FreezeLimit
         * XIDs, so that we can record these values at the end of the
         * VACUUM.  Note that individual tables may well be processed with
         * newer values, but we can guarantee that no (non-shared)
         * relations are processed with older ones.
         *
         * It is okay to record non-shared values in pg_database, even though
         * we may vacuum shared relations with older cutoffs, because only
         * the minimum of the values present in pg_database matters.  We
         * can be sure that shared relations have at some time been
         * vacuumed with cutoffs no worse than the global minimum; for, if
         * there is a backend in some other DB with xmin = OLDXMIN that's
         * determining the cutoff with which we vacuum shared relations,
         * it is not possible for that database to have a cutoff newer
         * than OLDXMIN recorded in pg_database.
         */
        vacuum_set_xid_limits(vacstmt, false,
                              &initialOldestXmin, &initialFreezeLimit);
    }

    /* matches the StartTransaction in PostgresMain() */
    CommitTransactionCommand();
}
static void
vacuum_shutdown(VacuumStmt *vacstmt)
{
    /* on entry, we are not in a transaction */

    /* matches the CommitTransaction in PostgresMain() */
    StartTransactionCommand();

    /*
     * If we did a database-wide VACUUM, update the database's pg_database
     * row with info about the transaction IDs used, and try to truncate
     * pg_clog.
     */
    if (vacstmt->vacuum && vacstmt->relation == NULL)
    {
        vac_update_dbstats(MyDatabaseId,
                           initialOldestXmin, initialFreezeLimit);
        vac_truncate_clog(initialOldestXmin, initialFreezeLimit);
    }

    /*
     * Clean up working storage --- note we must do this after
     * StartTransactionCommand, else we might be trying to delete the
     * active context!
     */
    MemoryContextDelete(vac_context);
    vac_context = NULL;
}
/*
 * Build a list of Oids for each relation to be processed
 *
 * The list is built in vac_context so that it will survive across our
 * per-relation transactions.
 */
static List *
getrels(const RangeVar *vacrel, const char *stmttype)
{
    List       *vrl = NIL;
    MemoryContext oldcontext;

    if (vacrel)
    {
        /* Process specific relation */
        Oid         relid;

        relid = RangeVarGetRelid(vacrel, false);

        /* Make a relation list entry for this guy */
        oldcontext = MemoryContextSwitchTo(vac_context);
        vrl = lappendi(vrl, relid);
        MemoryContextSwitchTo(oldcontext);
    }
    else
    {
        /* Process all plain relations listed in pg_class */
        Relation    pgclass;
        HeapScanDesc scan;
        HeapTuple   tuple;
        ScanKeyData key;

        ScanKeyEntryInitialize(&key, 0x0,
                               Anum_pg_class_relkind,
                               F_CHAREQ,
                               CharGetDatum(RELKIND_RELATION));

        pgclass = heap_openr(RelationRelationName, AccessShareLock);

        scan = heap_beginscan(pgclass, false, SnapshotNow, 1, &key);

        while (HeapTupleIsValid(tuple = heap_getnext(scan, 0)))
        {
            /* Make a relation list entry for this guy */
            oldcontext = MemoryContextSwitchTo(vac_context);
            vrl = lappendi(vrl, tuple->t_data->t_oid);
            MemoryContextSwitchTo(oldcontext);
        }

        heap_endscan(scan);
        heap_close(pgclass, AccessShareLock);
    }

    return vrl;
}
/*
 * vacuum_set_xid_limits() -- compute oldest-Xmin and freeze cutoff points
 */
void
vacuum_set_xid_limits(VacuumStmt *vacstmt, bool sharedRel,
                      TransactionId *oldestXmin,
                      TransactionId *freezeLimit)
{
    TransactionId limit;

    *oldestXmin = GetOldestXmin(sharedRel);

    Assert(TransactionIdIsNormal(*oldestXmin));

    if (vacstmt->freeze)
    {
        /* FREEZE option: use oldest Xmin as freeze cutoff too */
        limit = *oldestXmin;
    }
    else
    {
        /*
         * Normal case: freeze cutoff is well in the past, to wit, about
         * halfway to the wrap horizon
         */
        limit = GetCurrentTransactionId() - (MaxTransactionId >> 2);
    }

    /*
     * Be careful not to generate a "permanent" XID
     */
    if (!TransactionIdIsNormal(limit))
        limit = FirstNormalTransactionId;

    /*
     * Ensure sane relationship of limits
     */
    if (TransactionIdFollows(limit, *oldestXmin))
    {
        elog(WARNING, "oldest Xmin is far in the past --- close open transactions soon to avoid wraparound problems");
        limit = *oldestXmin;
    }

    *freezeLimit = limit;
}
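/*
 * Note on the XID arithmetic above: transaction IDs are compared
 * circularly (modulo 2^32), so the unsigned subtraction
 * GetCurrentTransactionId() - (MaxTransactionId >> 2) simply wraps around
 * early in a database's life.  For example, a current XID of 1000 yields
 * a limit of roughly 3.2 billion, which is still "in the past" under
 * circular comparison.  The TransactionIdIsNormal() guard exists because
 * the wrapped result can land exactly on one of the reserved XIDs below
 * FirstNormalTransactionId (Invalid, Bootstrap, Frozen).
 */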
/*
 * vac_update_relstats() -- update statistics for one relation
 *
 * Update the whole-relation statistics that are kept in its pg_class
 * row.  There are additional stats that will be updated if we are
 * doing ANALYZE, but we always update these stats.  This routine works
 * for both index and heap relation entries in pg_class.
 *
 * We violate no-overwrite semantics here by storing new values for the
 * statistics columns directly into the pg_class tuple that's already on
 * the page.  The reason for this is that if we updated these tuples in
 * the usual way, vacuuming pg_class itself wouldn't work very well ---
 * by the time we got done with a vacuum cycle, most of the tuples in
 * pg_class would've been obsoleted.  Of course, this only works for
 * fixed-size never-null columns, but these are.
 *
 * This routine is shared by full VACUUM, lazy VACUUM, and stand-alone
 * ANALYZE.
 */
void
vac_update_relstats(Oid relid, BlockNumber num_pages, double num_tuples,
                    bool hasindex)
{
    Relation    rd;
    HeapTupleData rtup;
    HeapTuple   ctup;
    Form_pg_class pgcform;
    Buffer      buffer;

    /*
     * update number of tuples and number of pages in pg_class
     */
    rd = heap_openr(RelationRelationName, RowExclusiveLock);

    ctup = SearchSysCache(RELOID,
                          ObjectIdGetDatum(relid),
                          0, 0, 0);
    if (!HeapTupleIsValid(ctup))
        elog(ERROR, "pg_class entry for relid %u vanished during vacuuming",
             relid);

    /* get the buffer cache tuple */
    rtup.t_self = ctup->t_self;
    ReleaseSysCache(ctup);
    heap_fetch(rd, SnapshotNow, &rtup, &buffer, NULL);

    /* overwrite the existing statistics in the tuple */
    pgcform = (Form_pg_class) GETSTRUCT(&rtup);
    pgcform->relpages = (int32) num_pages;
    pgcform->reltuples = num_tuples;
    pgcform->relhasindex = hasindex;

    /*
     * If we have discovered that there are no indexes, then there's no
     * primary key either.  This could be done more thoroughly...
     */
    if (!hasindex)
        pgcform->relhaspkey = false;

    /*
     * Invalidate the tuple in the catcaches; this also arranges to flush
     * the relation's relcache entry.  (If we fail to commit for some reason,
     * no flush will occur, but no great harm is done since there are no
     * noncritical state updates here.)
     */
    CacheInvalidateHeapTuple(rd, &rtup);

    /* Write the buffer */
    WriteBuffer(buffer);

    heap_close(rd, RowExclusiveLock);
}
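/*
 * For example, full_vacuum_rel() below finishes with
 *		vac_update_relstats(RelationGetRelid(onerel), vacrelstats->rel_pages,
 *							vacrelstats->rel_tuples, vacrelstats->hasindex);
 * to push the page and tuple counts gathered by scan_heap() into pg_class.
 */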
/*
 * vac_update_dbstats() -- update statistics for one database
 *
 * Update the whole-database statistics that are kept in its pg_database
 * row.
 *
 * We violate no-overwrite semantics here by storing new values for the
 * statistics columns directly into the tuple that's already on the page.
 * As with vac_update_relstats, this avoids leaving dead tuples behind
 * after a VACUUM; which is good since GetRawDatabaseInfo
 * can get confused by finding dead tuples in pg_database.
 *
 * This routine is shared by full and lazy VACUUM.  Note that it is only
 * applied after a database-wide VACUUM operation.
 */
static void
vac_update_dbstats(Oid dbid,
                   TransactionId vacuumXID,
                   TransactionId frozenXID)
{
    Relation    relation;
    ScanKeyData entry[1];
    HeapScanDesc scan;
    HeapTuple   tuple;
    Form_pg_database dbform;

    relation = heap_openr(DatabaseRelationName, RowExclusiveLock);

    /* Must use a heap scan, since there's no syscache for pg_database */
    ScanKeyEntryInitialize(&entry[0], 0x0,
                           ObjectIdAttributeNumber, F_OIDEQ,
                           ObjectIdGetDatum(dbid));

    scan = heap_beginscan(relation, 0, SnapshotNow, 1, entry);

    tuple = heap_getnext(scan, 0);

    if (!HeapTupleIsValid(tuple))
        elog(ERROR, "database %u does not exist", dbid);

    dbform = (Form_pg_database) GETSTRUCT(tuple);

    /* overwrite the existing statistics in the tuple */
    dbform->datvacuumxid = vacuumXID;
    dbform->datfrozenxid = frozenXID;

    /* invalidate the tuple in the cache and write the buffer */
    CacheInvalidateHeapTuple(relation, tuple);
    WriteNoReleaseBuffer(scan->rs_cbuf);

    heap_endscan(scan);

    heap_close(relation, RowExclusiveLock);
}
/*
 * vac_truncate_clog() -- attempt to truncate the commit log
 *
 * Scan pg_database to determine the system-wide oldest datvacuumxid,
 * and use it to truncate the transaction commit log (pg_clog).
 * Also generate a warning if the system-wide oldest datfrozenxid
 * seems to be in danger of wrapping around.
 *
 * The passed XIDs are simply the ones I just wrote into my pg_database
 * entry.  They're used to initialize the "min" calculations.
 *
 * This routine is shared by full and lazy VACUUM.  Note that it is only
 * applied after a database-wide VACUUM operation.
 */
static void
vac_truncate_clog(TransactionId vacuumXID, TransactionId frozenXID)
{
    Relation    relation;
    HeapScanDesc scan;
    HeapTuple   tuple;
    int32       age;

    relation = heap_openr(DatabaseRelationName, AccessShareLock);

    scan = heap_beginscan(relation, 0, SnapshotNow, 0, NULL);

    while (HeapTupleIsValid(tuple = heap_getnext(scan, 0)))
    {
        Form_pg_database dbform = (Form_pg_database) GETSTRUCT(tuple);

        /* Ignore non-connectable databases (eg, template0) */
        /* It's assumed that these have been frozen correctly */
        if (!dbform->datallowconn)
            continue;

        if (TransactionIdIsNormal(dbform->datvacuumxid) &&
            TransactionIdPrecedes(dbform->datvacuumxid, vacuumXID))
            vacuumXID = dbform->datvacuumxid;
        if (TransactionIdIsNormal(dbform->datfrozenxid) &&
            TransactionIdPrecedes(dbform->datfrozenxid, frozenXID))
            frozenXID = dbform->datfrozenxid;
    }

    heap_endscan(scan);

    heap_close(relation, AccessShareLock);

    /* Truncate CLOG to the oldest vacuumxid */
    TruncateCLOG(vacuumXID);

    /* Give warning about impending wraparound problems */
    age = (int32) (GetCurrentTransactionId() - frozenXID);
    if (age > (int32) ((MaxTransactionId >> 3) * 3))
        elog(WARNING, "Some databases have not been vacuumed in %d transactions."
             "\n\tBetter vacuum them within %d transactions,"
             "\n\tor you may have a wraparound failure.",
             age, (int32) (MaxTransactionId >> 1) - age);
}
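/*
 * For reference: with 32-bit XIDs, MaxTransactionId is about 4.29e9, so
 * the warning above fires once the oldest datfrozenxid is more than
 * (MaxTransactionId >> 3) * 3, roughly 1.6 billion transactions old ---
 * three-quarters of the 2^31 circular-comparison horizon.  The suggested
 * deadline, (MaxTransactionId >> 1) - age, is the headroom remaining
 * before comparisons against the oldest frozen XID would wrap around.
 */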
/****************************************************************************
 *                                                                          *
 *          Code common to both flavors of VACUUM                           *
 *                                                                          *
 ****************************************************************************
 */
/*
 *	vacuum_rel() -- vacuum one heap relation
 *
 *		Doing one heap at a time incurs extra overhead, since we need to
 *		check that the heap exists again just before we vacuum it.  The
 *		reason that we do this is so that vacuuming can be spread across
 *		many small transactions.  Otherwise, two-phase locking would require
 *		us to lock the entire database during one pass of the vacuum cleaner.
 *
 *		At entry and exit, we are not inside a transaction.
 */
static void
vacuum_rel(Oid relid, VacuumStmt *vacstmt, char expected_relkind)
{
    LOCKMODE    lmode;
    Relation    onerel;
    LockRelId   onerelid;
    Oid         toast_relid;

    /* Begin a transaction for vacuuming this relation */
    StartTransactionCommand();

    /*
     * Check for user-requested abort.  Note we want this to be inside a
     * transaction, so xact.c doesn't issue useless WARNING.
     */
    CHECK_FOR_INTERRUPTS();

    /*
     * Race condition -- if the pg_class tuple has gone away since the
     * last time we saw it, we don't need to vacuum it.
     */
    if (!SearchSysCacheExists(RELOID,
                              ObjectIdGetDatum(relid),
                              0, 0, 0))
    {
        CommitTransactionCommand();
        return;
    }

    /*
     * Determine the type of lock we want --- hard exclusive lock for a
     * FULL vacuum, but just ShareUpdateExclusiveLock for concurrent
     * vacuum.  Either way, we can be sure that no other backend is
     * vacuuming the same table.
     */
    lmode = vacstmt->full ? AccessExclusiveLock : ShareUpdateExclusiveLock;

    /*
     * Open the class, get an appropriate lock on it, and check
     * permissions.
     *
     * We allow the user to vacuum a table if he is superuser, the table
     * owner, or the database owner (but in the latter case, only if it's
     * not a shared relation).  pg_class_ownercheck includes the superuser case.
     *
     * Note we choose to treat permissions failure as a WARNING and keep
     * trying to vacuum the rest of the DB --- is this appropriate?
     */
    onerel = relation_open(relid, lmode);

    if (!(pg_class_ownercheck(RelationGetRelid(onerel), GetUserId()) ||
          (is_dbadmin(MyDatabaseId) && !onerel->rd_rel->relisshared)))
    {
        elog(WARNING, "Skipping \"%s\" --- only table or database owner can VACUUM it",
             RelationGetRelationName(onerel));
        relation_close(onerel, lmode);
        CommitTransactionCommand();
        return;
    }

    /*
     * Check that it's a plain table; we used to do this in getrels() but
     * seems safer to check after we've locked the relation.
     */
    if (onerel->rd_rel->relkind != expected_relkind)
    {
        elog(WARNING, "Skipping \"%s\" --- cannot process indexes, views or special system tables",
             RelationGetRelationName(onerel));
        relation_close(onerel, lmode);
        CommitTransactionCommand();
        return;
    }

    /*
     * Get a session-level lock too.  This will protect our access to the
     * relation across multiple transactions, so that we can vacuum the
     * relation's TOAST table (if any) secure in the knowledge that no one
     * is deleting the parent relation.
     *
     * NOTE: this cannot block, even if someone else is waiting for access,
     * because the lock manager knows that both lock requests are from the
     * same process.
     */
    onerelid = onerel->rd_lockInfo.lockRelId;
    LockRelationForSession(&onerelid, lmode);

    /*
     * Remember the relation's TOAST relation for later
     */
    toast_relid = onerel->rd_rel->reltoastrelid;

    /*
     * Do the actual work --- either FULL or "lazy" vacuum
     */
    if (vacstmt->full)
        full_vacuum_rel(onerel, vacstmt);
    else
        lazy_vacuum_rel(onerel, vacstmt);

    /* all done with this class, but hold lock until commit */
    relation_close(onerel, NoLock);

    /*
     * Complete the transaction and free all temporary memory used.
     */
    CommitTransactionCommand();

    /*
     * If the relation has a secondary toast rel, vacuum that too while we
     * still hold the session lock on the master table.  Note however that
     * "analyze" will not get done on the toast table.  This is good,
     * because the toaster always uses hardcoded index access and
     * statistics are totally unimportant for toast relations.
     */
    if (toast_relid != InvalidOid)
        vacuum_rel(toast_relid, vacstmt, RELKIND_TOASTVALUE);

    /*
     * Now release the session-level lock on the master table.
     */
    UnlockRelationForSession(&onerelid, lmode);
}
/****************************************************************************
 *                                                                          *
 *          Code for VACUUM FULL (only)                                     *
 *                                                                          *
 ****************************************************************************
 */
/*
 *	full_vacuum_rel() -- perform FULL VACUUM for one heap relation
 *
 *		This routine vacuums a single heap, cleans out its indexes, and
 *		updates its num_pages and num_tuples statistics.
 *
 *		At entry, we have already established a transaction and opened
 *		and locked the relation.
 */
static void
full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
{
    VacPageListData vacuum_pages;       /* List of pages to vacuum and/or
                                         * clean indexes */
    VacPageListData fraged_pages;       /* List of pages with space enough
                                         * for re-use */
    Relation   *Irel;
    int         nindexes,
                i;
    VRelStats  *vacrelstats;
    bool        reindex = false;

    if (IsIgnoringSystemIndexes() &&
        IsSystemRelationName(RelationGetRelationName(onerel)))
        reindex = true;

    vacuum_set_xid_limits(vacstmt, onerel->rd_rel->relisshared,
                          &OldestXmin, &FreezeLimit);

    /*
     * Set up statistics-gathering machinery.
     */
    vacrelstats = (VRelStats *) palloc(sizeof(VRelStats));
    vacrelstats->rel_pages = 0;
    vacrelstats->rel_tuples = 0;
    vacrelstats->hasindex = false;

    /* scan the heap */
    vacuum_pages.num_pages = fraged_pages.num_pages = 0;
    scan_heap(vacrelstats, onerel, &vacuum_pages, &fraged_pages);

    /* Now open all indexes of the relation */
    vac_open_indexes(onerel, &nindexes, &Irel);
    if (!Irel)
        reindex = false;
    else if (!RelationGetForm(onerel)->relhasindex)
        reindex = true;
    if (nindexes > 0)
        vacrelstats->hasindex = true;

#ifdef NOT_USED

    /*
     * reindex in VACUUM is dangerous under WAL. ifdef out until it
     * becomes safe.
     */
    if (reindex)
    {
        vac_close_indexes(nindexes, Irel);
        Irel = (Relation *) NULL;
        activate_indexes_of_a_table(RelationGetRelid(onerel), false);
    }
#endif   /* NOT_USED */

    /* Clean/scan index relation(s) */
    if (Irel != (Relation *) NULL)
    {
        if (vacuum_pages.num_pages > 0)
        {
            for (i = 0; i < nindexes; i++)
                vacuum_index(&vacuum_pages, Irel[i],
                             vacrelstats->rel_tuples, 0);
        }
        else
        {
            /* just scan indexes to update statistic */
            for (i = 0; i < nindexes; i++)
                scan_index(Irel[i], vacrelstats->rel_tuples);
        }
    }

    if (fraged_pages.num_pages > 0)
    {
        /* Try to shrink heap */
        repair_frag(vacrelstats, onerel, &vacuum_pages, &fraged_pages,
                    nindexes, Irel);
        vac_close_indexes(nindexes, Irel);
    }
    else
    {
        vac_close_indexes(nindexes, Irel);
        if (vacuum_pages.num_pages > 0)
        {
            /* Clean pages from vacuum_pages list */
            vacuum_heap(vacrelstats, onerel, &vacuum_pages);
        }
        else
        {
            /*
             * Flush dirty pages out to disk.  We must do this even if we
             * didn't do anything else, because we want to ensure that all
             * tuples have correct on-row commit status on disk (see
             * bufmgr.c's comments for FlushRelationBuffers()).
             */
            i = FlushRelationBuffers(onerel, vacrelstats->rel_pages);
            if (i < 0)
                elog(ERROR, "VACUUM (full_vacuum_rel): FlushRelationBuffers returned %d",
                     i);
        }
    }

#ifdef NOT_USED
    if (reindex)
        activate_indexes_of_a_table(RelationGetRelid(onerel), true);
#endif   /* NOT_USED */

    /* update shared free space map with final free space info */
    vac_update_fsm(onerel, &fraged_pages, vacrelstats->rel_pages);

    /* update statistics in pg_class */
    vac_update_relstats(RelationGetRelid(onerel), vacrelstats->rel_pages,
                        vacrelstats->rel_tuples, vacrelstats->hasindex);
}
/*
 *	scan_heap() -- scan an open heap relation
 *
 *		This routine sets commit status bits, constructs vacuum_pages (list
 *		of pages we need to compact free space on and/or clean indexes of
 *		deleted tuples), constructs fraged_pages (list of pages with free
 *		space that tuples could be moved into), and calculates statistics
 *		on the number of live tuples in the heap.
 */
static void
scan_heap(VRelStats *vacrelstats, Relation onerel,
          VacPageList vacuum_pages, VacPageList fraged_pages)
{
    BlockNumber nblocks,
                blkno;
    ItemId      itemid;
    Buffer      buf;
    HeapTupleData tuple;
    OffsetNumber offnum,
                maxoff;
    bool        pgchanged,
                tupgone,
                notup;
    char       *relname;
    VacPage     vacpage,
                vacpagecopy;
    BlockNumber empty_pages,
                new_pages,
                changed_pages,
                empty_end_pages;
    double      free_size,
                usable_free_size;
    Size        min_tlen = MaxTupleSize;
    Size        max_tlen = 0;
    int         i;
    bool        do_shrinking = true;
    VTupleLink  vtlinks = (VTupleLink) palloc(100 * sizeof(VTupleLinkData));
    int         num_vtlinks = 0;
    int         free_vtlinks = 100;
    VacRUsage   ru0;
    double      num_tuples,
                tups_vacuumed,
                nkeep,
                nunused;

    vac_init_rusage(&ru0);

    relname = RelationGetRelationName(onerel);
    elog(elevel, "--Relation %s.%s--",
         get_namespace_name(RelationGetNamespace(onerel)),
         relname);

    empty_pages = new_pages = changed_pages = empty_end_pages = 0;
    num_tuples = tups_vacuumed = nkeep = nunused = 0;
    free_size = 0;

    nblocks = RelationGetNumberOfBlocks(onerel);

    /*
     * We initially create each VacPage item in a maximal-sized workspace,
     * then copy the workspace into a just-large-enough copy.
     */
    vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber));

    for (blkno = 0; blkno < nblocks; blkno++)
    {
        Page        page,
                    tempPage = NULL;
        bool        do_reap,
                    do_frag;

        CHECK_FOR_INTERRUPTS();

        buf = ReadBuffer(onerel, blkno);
        page = BufferGetPage(buf);

        vacpage->blkno = blkno;
        vacpage->offsets_used = 0;
        vacpage->offsets_free = 0;

        if (PageIsNew(page))
        {
            elog(WARNING, "Rel %s: Uninitialized page %u - fixing",
                 relname, blkno);
            PageInit(page, BufferGetPageSize(buf), 0);
            vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
            free_size += (vacpage->free - sizeof(ItemIdData));
            new_pages++;
            empty_end_pages++;
            vacpagecopy = copy_vac_page(vacpage);
            vpage_insert(vacuum_pages, vacpagecopy);
            vpage_insert(fraged_pages, vacpagecopy);
            WriteBuffer(buf);
            continue;
        }

        if (PageIsEmpty(page))
        {
            vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
            free_size += (vacpage->free - sizeof(ItemIdData));
            empty_pages++;
            empty_end_pages++;
            vacpagecopy = copy_vac_page(vacpage);
            vpage_insert(vacuum_pages, vacpagecopy);
            vpage_insert(fraged_pages, vacpagecopy);
            ReleaseBuffer(buf);
            continue;
        }

        pgchanged = false;
        notup = true;
        maxoff = PageGetMaxOffsetNumber(page);
        for (offnum = FirstOffsetNumber;
             offnum <= maxoff;
             offnum = OffsetNumberNext(offnum))
        {
            uint16      sv_infomask;

            itemid = PageGetItemId(page, offnum);

            /*
             * Collect unused items too - it's possible to have indexes
             * pointing here after crash.
             */
            if (!ItemIdIsUsed(itemid))
            {
                vacpage->offsets[vacpage->offsets_free++] = offnum;
                nunused += 1;
                continue;
            }

            tuple.t_datamcxt = NULL;
            tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
            tuple.t_len = ItemIdGetLength(itemid);
            ItemPointerSet(&(tuple.t_self), blkno, offnum);

            tupgone = false;
            sv_infomask = tuple.t_data->t_infomask;

            switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin))
            {
                case HEAPTUPLE_DEAD:
                    tupgone = true;     /* we can delete the tuple */
                    break;
                case HEAPTUPLE_LIVE:

                    /*
                     * Tuple is good.  Consider whether to replace its
                     * xmin value with FrozenTransactionId.
                     */
                    if (TransactionIdIsNormal(tuple.t_data->t_xmin) &&
                        TransactionIdPrecedes(tuple.t_data->t_xmin,
                                              FreezeLimit))
                    {
                        tuple.t_data->t_xmin = FrozenTransactionId;
                        /* infomask should be okay already */
                        Assert(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED);
                        pgchanged = true;
                    }
                    break;
                case HEAPTUPLE_RECENTLY_DEAD:

                    /*
                     * If tuple is recently deleted then we must not
                     * remove it from relation.
                     */
                    nkeep += 1;

                    /*
                     * If we are doing shrinking and this tuple is an
                     * updated one, then remember it so we can construct
                     * the update-chain dependencies.
                     */
                    if (do_shrinking &&
                        !(ItemPointerEquals(&(tuple.t_self),
                                            &(tuple.t_data->t_ctid))))
                    {
                        if (free_vtlinks == 0)
                        {
                            free_vtlinks = 1000;
                            vtlinks = (VTupleLink) repalloc(vtlinks,
                                           (free_vtlinks + num_vtlinks) *
                                                 sizeof(VTupleLinkData));
                        }
                        vtlinks[num_vtlinks].new_tid = tuple.t_data->t_ctid;
                        vtlinks[num_vtlinks].this_tid = tuple.t_self;
                        free_vtlinks--;
                        num_vtlinks++;
                    }
                    break;
                case HEAPTUPLE_INSERT_IN_PROGRESS:

                    /*
                     * This should not happen, since we hold exclusive
                     * lock on the relation; shouldn't we raise an error?
                     */
                    elog(WARNING, "Rel %s: TID %u/%u: InsertTransactionInProgress %u - can't shrink relation",
                         relname, blkno, offnum, tuple.t_data->t_xmin);
                    do_shrinking = false;
                    break;
                case HEAPTUPLE_DELETE_IN_PROGRESS:

                    /*
                     * This should not happen, since we hold exclusive
                     * lock on the relation; shouldn't we raise an error?
                     */
                    elog(WARNING, "Rel %s: TID %u/%u: DeleteTransactionInProgress %u - can't shrink relation",
                         relname, blkno, offnum, tuple.t_data->t_xmax);
                    do_shrinking = false;
                    break;
                default:
                    elog(ERROR, "Unexpected HeapTupleSatisfiesVacuum result");
                    break;
            }

            /* check for hint-bit update by HeapTupleSatisfiesVacuum */
            if (sv_infomask != tuple.t_data->t_infomask)
                pgchanged = true;

            /*
             * Other checks...
             */
            if (!OidIsValid(tuple.t_data->t_oid) &&
                onerel->rd_rel->relhasoids)
                elog(WARNING, "Rel %s: TID %u/%u: OID IS INVALID. TUPGONE %d.",
                     relname, blkno, offnum, (int) tupgone);

            if (tupgone)
            {
                ItemId      lpp;

                /*
                 * Here we are building a temporary copy of the page with
                 * dead tuples removed.  Below we will apply
                 * PageRepairFragmentation to the copy, so that we can
                 * determine how much space will be available after
                 * removal of dead tuples.  But note we are NOT changing
                 * the real page yet...
                 */
                if (tempPage == (Page) NULL)
                {
                    Size        pageSize;

                    pageSize = PageGetPageSize(page);
                    tempPage = (Page) palloc(pageSize);
                    memcpy(tempPage, page, pageSize);
                }

                /* mark it unused on the temp page */
                lpp = PageGetItemId(tempPage, offnum);
                lpp->lp_flags &= ~LP_USED;

                vacpage->offsets[vacpage->offsets_free++] = offnum;
                tups_vacuumed += 1;
            }
            else
            {
                num_tuples += 1;
                notup = false;
                if (tuple.t_len < min_tlen)
                    min_tlen = tuple.t_len;
                if (tuple.t_len > max_tlen)
                    max_tlen = tuple.t_len;
            }
        }                       /* scan along page */

        if (tempPage != (Page) NULL)
        {
            /* Some tuples are removable; figure free space after removal */
            PageRepairFragmentation(tempPage, NULL);
            vacpage->free = ((PageHeader) tempPage)->pd_upper - ((PageHeader) tempPage)->pd_lower;
            pfree(tempPage);
            do_reap = true;
        }
        else
        {
            /* Just use current available space */
            vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
            /* Need to reap the page if it has ~LP_USED line pointers */
            do_reap = (vacpage->offsets_free > 0);
        }

        free_size += vacpage->free;

        /*
         * Add the page to fraged_pages if it has a useful amount of free
         * space.  "Useful" means enough for a minimal-sized tuple.  But we
         * don't know that accurately near the start of the relation, so
         * add pages unconditionally if they have >= BLCKSZ/10 free space.
         */
        do_frag = (vacpage->free >= min_tlen || vacpage->free >= BLCKSZ / 10);
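        /*
         * (With the default BLCKSZ of 8192, the unconditional threshold
         * above works out to 819 bytes of free space.)
         */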
        if (do_reap || do_frag)
        {
            vacpagecopy = copy_vac_page(vacpage);
            if (do_reap)
                vpage_insert(vacuum_pages, vacpagecopy);
            if (do_frag)
                vpage_insert(fraged_pages, vacpagecopy);
        }

        if (notup)
            empty_end_pages++;
        else
            empty_end_pages = 0;

        if (pgchanged)
        {
            WriteBuffer(buf);
            changed_pages++;
        }
        else
            ReleaseBuffer(buf);
    }                           /* scan along relation */

    pfree(vacpage);

    /* save stats in the rel list for use later */
    vacrelstats->rel_tuples = num_tuples;
    vacrelstats->rel_pages = nblocks;
    if (num_tuples == 0)
        min_tlen = max_tlen = 0;
    vacrelstats->min_tlen = min_tlen;
    vacrelstats->max_tlen = max_tlen;

    vacuum_pages->empty_end_pages = empty_end_pages;
    fraged_pages->empty_end_pages = empty_end_pages;

    /*
     * Clear the fraged_pages list if we found we couldn't shrink.  Else,
     * remove any "empty" end-pages from the list, and compute usable free
     * space = free space in remaining pages.
     */
    if (do_shrinking)
    {
        Assert((BlockNumber) fraged_pages->num_pages >= empty_end_pages);
        fraged_pages->num_pages -= empty_end_pages;
        usable_free_size = 0;
        for (i = 0; i < fraged_pages->num_pages; i++)
            usable_free_size += fraged_pages->pagedesc[i]->free;
    }
    else
    {
        fraged_pages->num_pages = 0;
        usable_free_size = 0;
    }

    if (usable_free_size > 0 && num_vtlinks > 0)
    {
        qsort((char *) vtlinks, num_vtlinks, sizeof(VTupleLinkData),
              vac_cmp_vtlinks);
        vacrelstats->vtlinks = vtlinks;
        vacrelstats->num_vtlinks = num_vtlinks;
    }
    else
    {
        vacrelstats->vtlinks = NULL;
        vacrelstats->num_vtlinks = 0;
        pfree(vtlinks);
    }

    elog(elevel, "Pages %u: Changed %u, reaped %u, Empty %u, New %u; \
Tup %.0f: Vac %.0f, Keep/VTL %.0f/%u, UnUsed %.0f, MinLen %lu, MaxLen %lu; \
Re-using: Free/Avail. Space %.0f/%.0f; EndEmpty/Avail. Pages %u/%u.\n\t%s",
         nblocks, changed_pages, vacuum_pages->num_pages, empty_pages,
         new_pages, num_tuples, tups_vacuumed,
         nkeep, vacrelstats->num_vtlinks,
         nunused, (unsigned long) min_tlen, (unsigned long) max_tlen,
         free_size, usable_free_size,
         empty_end_pages, fraged_pages->num_pages,
         vac_show_rusage(&ru0));
}
/*
 *	repair_frag() -- try to repair relation's fragmentation
 *
 *		This routine marks dead tuples as unused and tries to re-use dead
 *		space by moving tuples (and inserting index entries if needed).  It
 *		constructs Nvacpagelist, a list of freed pages (pages whose tuples
 *		were all moved away), and cleans their index entries after
 *		committing the current transaction (in a hackish manner - without
 *		losing the locks we hold or freeing our memory!).  It truncates the
 *		relation if the move has emptied its end-blocks.
 */
static void
repair_frag(VRelStats *vacrelstats, Relation onerel,
            VacPageList vacuum_pages, VacPageList fraged_pages,
            int nindexes, Relation *Irel)
{
    TransactionId myXID;
    CommandId   myCID;
    Buffer      buf,
                cur_buffer;
    BlockNumber nblocks,
                blkno;
    BlockNumber last_move_dest_block = 0,
                last_vacuum_block;
    Page        page,
                ToPage = NULL;
    OffsetNumber offnum,
                maxoff,
                newoff,
                max_offset;
    ItemId      itemid,
                newitemid;
    HeapTupleData tuple,
                newtup;
    TupleDesc   tupdesc;
    ResultRelInfo *resultRelInfo;
    EState     *estate;
    TupleTable  tupleTable;
    TupleTableSlot *slot;
    VacPageListData Nvacpagelist;
    VacPage     cur_page = NULL,
                last_vacuum_page,
                vacpage,
               *curpage;
    int         cur_item = 0;
    int         i;
    Size        tuple_len;
    int         num_moved,
                num_fraged_pages,
                vacuumed_pages;
    int         checked_moved,
                num_tuples,
                keep_tuples = 0;
    bool        isempty,
                dowrite,
                chain_tuple_moved;
    VacRUsage   ru0;

    vac_init_rusage(&ru0);

    myXID = GetCurrentTransactionId();
    myCID = GetCurrentCommandId();

    tupdesc = RelationGetDescr(onerel);

    /*
     * We need a ResultRelInfo and an EState so we can use the regular
     * executor's index-entry-making machinery.
     */
    resultRelInfo = makeNode(ResultRelInfo);
    resultRelInfo->ri_RangeTableIndex = 1;      /* dummy */
    resultRelInfo->ri_RelationDesc = onerel;
    resultRelInfo->ri_TrigDesc = NULL;  /* we don't fire triggers */

    ExecOpenIndices(resultRelInfo);

    estate = CreateExecutorState();
    estate->es_result_relations = resultRelInfo;
    estate->es_num_result_relations = 1;
    estate->es_result_relation_info = resultRelInfo;

    /* Set up a dummy tuple table too */
    tupleTable = ExecCreateTupleTable(1);
    slot = ExecAllocTableSlot(tupleTable);
    ExecSetSlotDescriptor(slot, tupdesc, false);
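    /*
     * The slot set up above is reused for every tuple we move: after a
     * tuple is copied to its destination page, ExecStoreTuple() puts the
     * copy in this slot and ExecInsertIndexTuples() makes its index
     * entries (see below).
     */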
    Nvacpagelist.num_pages = 0;
    num_fraged_pages = fraged_pages->num_pages;
    Assert((BlockNumber) vacuum_pages->num_pages >= vacuum_pages->empty_end_pages);
    vacuumed_pages = vacuum_pages->num_pages - vacuum_pages->empty_end_pages;
    if (vacuumed_pages > 0)
    {
        /* get last reaped page from vacuum_pages */
        last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1];
        last_vacuum_block = last_vacuum_page->blkno;
    }
    else
    {
        last_vacuum_page = NULL;
        last_vacuum_block = InvalidBlockNumber;
    }
    cur_buffer = InvalidBuffer;
    num_moved = 0;

    vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber));
    vacpage->offsets_used = vacpage->offsets_free = 0;

    /*
     * Scan pages backwards from the last nonempty page, trying to move
     * tuples down to lower pages.  Quit when we reach a page that we have
     * moved any tuples onto, or the first page if we haven't moved
     * anything, or when we find a page we cannot completely empty (this
     * last condition is handled by "break" statements within the loop).
     *
     * NB: this code depends on the vacuum_pages and fraged_pages lists being
     * in order by blkno.
     */
    nblocks = vacrelstats->rel_pages;
    for (blkno = nblocks - vacuum_pages->empty_end_pages - 1;
         blkno > last_move_dest_block;
         blkno--)
    {
        CHECK_FOR_INTERRUPTS();

        /*
         * Forget fraged_pages pages at or after this one; they're no
         * longer useful as move targets, since we only want to move down.
         * Note that since we stop the outer loop at last_move_dest_block,
         * pages removed here cannot have had anything moved onto them
         * already.
         *
         * Also note that we don't change the stored fraged_pages list, only
         * our local variable num_fraged_pages; so the forgotten pages are
         * still available to be loaded into the free space map later.
         */
        while (num_fraged_pages > 0 &&
            fraged_pages->pagedesc[num_fraged_pages - 1]->blkno >= blkno)
        {
            Assert(fraged_pages->pagedesc[num_fraged_pages - 1]->offsets_used == 0);
            --num_fraged_pages;
        }

        /*
         * Process this page of relation.
         */
        buf = ReadBuffer(onerel, blkno);
        page = BufferGetPage(buf);

        vacpage->offsets_free = 0;

        isempty = PageIsEmpty(page);

        dowrite = false;

        /* Is the page in the vacuum_pages list? */
        if (blkno == last_vacuum_block)
        {
            if (last_vacuum_page->offsets_free > 0)
            {
                /* there are dead tuples on this page - clean them */
                Assert(!isempty);
                LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
                vacuum_page(onerel, buf, last_vacuum_page);
                LockBuffer(buf, BUFFER_LOCK_UNLOCK);
                dowrite = true;
            }
            else
                Assert(isempty);
            --vacuumed_pages;
            if (vacuumed_pages > 0)
            {
                /* get prev reaped page from vacuum_pages */
                last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1];
                last_vacuum_block = last_vacuum_page->blkno;
            }
            else
            {
                last_vacuum_page = NULL;
                last_vacuum_block = InvalidBlockNumber;
            }
            if (isempty)
            {
                ReleaseBuffer(buf);
                continue;
            }
        }
        else
            Assert(!isempty);
        chain_tuple_moved = false;      /* no chain tuple has been moved
                                         * off this page, yet */
        vacpage->blkno = blkno;
        maxoff = PageGetMaxOffsetNumber(page);
        for (offnum = FirstOffsetNumber;
             offnum <= maxoff;
             offnum = OffsetNumberNext(offnum))
        {
            itemid = PageGetItemId(page, offnum);

            if (!ItemIdIsUsed(itemid))
                continue;

            tuple.t_datamcxt = NULL;
            tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
            tuple_len = tuple.t_len = ItemIdGetLength(itemid);
            ItemPointerSet(&(tuple.t_self), blkno, offnum);

            if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
            {
                if ((TransactionId) tuple.t_data->t_cmin != myXID)
                    elog(ERROR, "Invalid XID in t_cmin");
                if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
                    elog(ERROR, "HEAP_MOVED_IN was not expected");

                /*
                 * If this (chain) tuple has already been moved by me,
                 * then I have to check whether it is in vacpage or not -
                 * i.e. whether it was moved while cleaning this page or
                 * some previous one.
                 */
                if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
                {
                    if (keep_tuples == 0)
                        continue;
                    if (chain_tuple_moved)
                    {
                        /* some chain tuples were moved while cleaning
                         * this page */
                        Assert(vacpage->offsets_free > 0);
                        for (i = 0; i < vacpage->offsets_free; i++)
                        {
                            if (vacpage->offsets[i] == offnum)
                                break;
                        }
                        if (i >= vacpage->offsets_free)         /* not found */
                        {
                            vacpage->offsets[vacpage->offsets_free++] = offnum;
                            keep_tuples--;
                        }
                    }
                    else
                    {
                        vacpage->offsets[vacpage->offsets_free++] = offnum;
                        keep_tuples--;
                    }
                    continue;
                }
                elog(ERROR, "HEAP_MOVED_OFF was expected");
            }
            /*
             * If this tuple is in a chain of tuples created in updates
             * by "recent" transactions, then we have to move the whole
             * chain of tuples to other places.
             */
            if ((tuple.t_data->t_infomask & HEAP_UPDATED &&
             !TransactionIdPrecedes(tuple.t_data->t_xmin, OldestXmin)) ||
                (!(tuple.t_data->t_infomask & HEAP_XMAX_INVALID) &&
                 !(ItemPointerEquals(&(tuple.t_self),
                                     &(tuple.t_data->t_ctid)))))
            {
                Buffer      Cbuf = buf;
                Page        Cpage;
                ItemId      Citemid;
                ItemPointerData Ctid;
                HeapTupleData tp = tuple;
                Size        tlen = tuple_len;
                VTupleMove  vtmove = (VTupleMove)
                palloc(100 * sizeof(VTupleMoveData));
                int         num_vtmove = 0;
                int         free_vtmove = 100;
                VacPage     to_vacpage = NULL;
                int         to_item = 0;
                bool        freeCbuf = false;
                int         ti;

                if (vacrelstats->vtlinks == NULL)
                    elog(ERROR, "No parent tuple was found");
                if (cur_buffer != InvalidBuffer)
                {
                    WriteBuffer(cur_buffer);
                    cur_buffer = InvalidBuffer;
                }

                /*
                 * If this tuple is in the beginning/middle of the chain,
                 * then we have to move to the end of the chain.
                 */
                while (!(tp.t_data->t_infomask & HEAP_XMAX_INVALID) &&
                       !(ItemPointerEquals(&(tp.t_self),
                                           &(tp.t_data->t_ctid))))
                {
                    Ctid = tp.t_data->t_ctid;
                    if (freeCbuf)
                        ReleaseBuffer(Cbuf);
                    freeCbuf = true;
                    Cbuf = ReadBuffer(onerel,
                                      ItemPointerGetBlockNumber(&Ctid));
                    Cpage = BufferGetPage(Cbuf);
                    Citemid = PageGetItemId(Cpage,
                                      ItemPointerGetOffsetNumber(&Ctid));
                    if (!ItemIdIsUsed(Citemid))
                    {
                        /*
                         * This means that in the middle of the chain
                         * there was a tuple updated by an older (than
                         * OldestXmin) xaction, and this tuple has already
                         * been deleted by me.  Actually, the upper part
                         * of the chain should be removed; it seems that
                         * this ought to be handled in scan_heap(), but
                         * it's not implemented at the moment, so we just
                         * stop shrinking here.
                         */
                        ReleaseBuffer(Cbuf);
                        freeCbuf = false;
                        elog(WARNING, "Child itemid in update-chain marked as unused - can't continue repair_frag");
                        break;
                    }
                    tp.t_datamcxt = NULL;
                    tp.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
                    tp.t_self = Ctid;
                    tlen = tp.t_len = ItemIdGetLength(Citemid);
                }
                /* first, can chain be moved ? */
                for (;;)
                {
                    if (to_vacpage == NULL ||
                        !enough_space(to_vacpage, tlen))
                    {
                        for (i = 0; i < num_fraged_pages; i++)
                        {
                            if (enough_space(fraged_pages->pagedesc[i], tlen))
                                break;
                        }

                        if (i == num_fraged_pages)
                        {
                            /* can't move item anywhere */
                            for (i = 0; i < num_vtmove; i++)
                            {
                                Assert(vtmove[i].vacpage->offsets_used > 0);
                                (vtmove[i].vacpage->offsets_used)--;
                            }
                            num_vtmove = 0;
                            break;
                        }
                        to_item = i;
                        to_vacpage = fraged_pages->pagedesc[to_item];
                    }
                    to_vacpage->free -= MAXALIGN(tlen);
                    if (to_vacpage->offsets_used >= to_vacpage->offsets_free)
                        to_vacpage->free -= MAXALIGN(sizeof(ItemIdData));
                    (to_vacpage->offsets_used)++;
                    if (free_vtmove == 0)
                    {
                        free_vtmove = 1000;
                        vtmove = (VTupleMove) repalloc(vtmove,
                                             (free_vtmove + num_vtmove) *
                                                 sizeof(VTupleMoveData));
                    }
                    vtmove[num_vtmove].tid = tp.t_self;
                    vtmove[num_vtmove].vacpage = to_vacpage;
                    if (to_vacpage->offsets_used == 1)
                        vtmove[num_vtmove].cleanVpd = true;
                    else
                        vtmove[num_vtmove].cleanVpd = false;
                    free_vtmove--;
                    num_vtmove++;
                    /* All done? */
                    if (!(tp.t_data->t_infomask & HEAP_UPDATED) ||
                        TransactionIdPrecedes(tp.t_data->t_xmin, OldestXmin))
                        break;

                    /* Well, try to find tuple with old row version */
                    for (;;)
                    {
                        Buffer      Pbuf;
                        Page        Ppage;
                        ItemId      Pitemid;
                        HeapTupleData Ptp;
                        VTupleLinkData vtld,
                                   *vtlp;

                        vtld.new_tid = tp.t_self;
                        vtlp = (VTupleLink)
                            vac_bsearch((void *) &vtld,
                                        (void *) (vacrelstats->vtlinks),
                                        vacrelstats->num_vtlinks,
                                        sizeof(VTupleLinkData),
                                        vac_cmp_vtlinks);
                        if (vtlp == NULL)
                            elog(ERROR, "Parent tuple was not found");
                        tp.t_self = vtlp->this_tid;
                        Pbuf = ReadBuffer(onerel,
                                ItemPointerGetBlockNumber(&(tp.t_self)));
                        Ppage = BufferGetPage(Pbuf);
                        Pitemid = PageGetItemId(Ppage,
                               ItemPointerGetOffsetNumber(&(tp.t_self)));
                        if (!ItemIdIsUsed(Pitemid))
                            elog(ERROR, "Parent itemid marked as unused");
                        Ptp.t_datamcxt = NULL;
                        Ptp.t_data = (HeapTupleHeader) PageGetItem(Ppage, Pitemid);
                        Assert(ItemPointerEquals(&(vtld.new_tid),
                                                 &(Ptp.t_data->t_ctid)));

                        /*
                         * Read above about cases when
                         * !ItemIdIsUsed(Citemid) (the child item has been
                         * removed)...  Because at the moment we don't
                         * remove the useless part of an update-chain, it
                         * is possible to find a too-old parent row here.
                         * As in the case which caused this problem, we
                         * stop shrinking here.  I could try to find the
                         * real parent row, but don't want to, because the
                         * real solution will be implemented anyway,
                         * later, and we are too close to the 6.5 release.
                         * - vadim 06/11/99
                         */
                        if (!(TransactionIdEquals(Ptp.t_data->t_xmax,
                                                  tp.t_data->t_xmin)))
                        {
                            if (freeCbuf)
                                ReleaseBuffer(Cbuf);
                            freeCbuf = false;
                            ReleaseBuffer(Pbuf);
                            for (i = 0; i < num_vtmove; i++)
                            {
                                Assert(vtmove[i].vacpage->offsets_used > 0);
                                (vtmove[i].vacpage->offsets_used)--;
                            }
                            num_vtmove = 0;
                            elog(WARNING, "Too old parent tuple found - can't continue repair_frag");
                            break;
                        }
#ifdef NOT_USED                 /* I'm not sure that this will work
                                 * properly... */

                        /*
                         * If this tuple is an updated version of a row
                         * and it was created by the same transaction,
                         * then no one is interested in this tuple - mark
                         * it as removed.
                         */
                        if (Ptp.t_data->t_infomask & HEAP_UPDATED &&
                            TransactionIdEquals(Ptp.t_data->t_xmin,
                                                Ptp.t_data->t_xmax))
                        {
                            TransactionIdStore(myXID,
                                (TransactionId *) &(Ptp.t_data->t_cmin));
                            Ptp.t_data->t_infomask &=
                                ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_IN);
                            Ptp.t_data->t_infomask |= HEAP_MOVED_OFF;
                            WriteBuffer(Pbuf);
                            continue;
                        }
#endif   /* NOT_USED */
                        tp.t_datamcxt = Ptp.t_datamcxt;
                        tp.t_data = Ptp.t_data;
                        tlen = tp.t_len = ItemIdGetLength(Pitemid);
                        if (freeCbuf)
                            ReleaseBuffer(Cbuf);
                        Cbuf = Pbuf;
                        freeCbuf = true;
                        break;
                    }           /* end of walk-backwards-along-chain loop */

                    if (num_vtmove == 0)
                        break;
                }               /* end of can-chain-be-moved loop */

                if (freeCbuf)
                    ReleaseBuffer(Cbuf);
                if (num_vtmove == 0)    /* chain can't be moved */
                {
                    pfree(vtmove);
                    break;      /* out of walk-along-page loop */
                }

                ItemPointerSetInvalid(&Ctid);
                for (ti = 0; ti < num_vtmove; ti++)
                {
                    VacPage     destvacpage = vtmove[ti].vacpage;

                    /* Get page to move from */
                    tuple.t_self = vtmove[ti].tid;
                    Cbuf = ReadBuffer(onerel,
                             ItemPointerGetBlockNumber(&(tuple.t_self)));

                    /* Get page to move to */
                    cur_buffer = ReadBuffer(onerel, destvacpage->blkno);

                    LockBuffer(cur_buffer, BUFFER_LOCK_EXCLUSIVE);
                    if (cur_buffer != Cbuf)
                        LockBuffer(Cbuf, BUFFER_LOCK_EXCLUSIVE);

                    ToPage = BufferGetPage(cur_buffer);
                    Cpage = BufferGetPage(Cbuf);

                    Citemid = PageGetItemId(Cpage,
                            ItemPointerGetOffsetNumber(&(tuple.t_self)));
                    tuple.t_datamcxt = NULL;
                    tuple.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
                    tuple_len = tuple.t_len = ItemIdGetLength(Citemid);

                    /*
                     * make a copy of the source tuple, and then mark the
                     * source tuple MOVED_OFF.
                     */
                    heap_copytuple_with_tuple(&tuple, &newtup);

                    /*
                     * register invalidation of source tuple in catcaches.
                     */
                    CacheInvalidateHeapTuple(onerel, &tuple);

                    /* NO ELOG(ERROR) TILL CHANGES ARE LOGGED */
                    START_CRIT_SECTION();

                    TransactionIdStore(myXID, (TransactionId *) &(tuple.t_data->t_cmin));
                    tuple.t_data->t_infomask &=
                        ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_IN);
                    tuple.t_data->t_infomask |= HEAP_MOVED_OFF;

                    /*
                     * If this page was not used before - clean it.
                     *
                     * NOTE: a nasty bug used to lurk here.  It is possible
                     * for the source and destination pages to be the same
                     * (since this tuple-chain member can be on a page
                     * lower than the one we're currently processing in
                     * the outer loop).  If that's true, then after
                     * vacuum_page() the source tuple will have been
                     * moved, and tuple.t_data will be pointing at
                     * garbage.  Therefore we must do everything that uses
                     * tuple.t_data BEFORE this step!!
                     *
                     * This path is different from the other callers of
                     * vacuum_page, because we have already incremented
                     * the vacpage's offsets_used field to account for the
                     * tuple(s) we expect to move onto the page.  Therefore
                     * vacuum_page's check for offsets_used == 0 is wrong.
                     * But since that's a good debugging check for all
                     * other callers, we work around it here rather than
                     * remove it.
                     */
                    if (!PageIsEmpty(ToPage) && vtmove[ti].cleanVpd)
                    {
                        int         sv_offsets_used = destvacpage->offsets_used;

                        destvacpage->offsets_used = 0;
                        vacuum_page(onerel, cur_buffer, destvacpage);
                        destvacpage->offsets_used = sv_offsets_used;
                    }

                    /*
                     * Update the state of the copied tuple, and store it
                     * on the destination page.
                     */
                    TransactionIdStore(myXID, (TransactionId *) &(newtup.t_data->t_cmin));
                    newtup.t_data->t_infomask &=
                        ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_OFF);
                    newtup.t_data->t_infomask |= HEAP_MOVED_IN;
                    newoff = PageAddItem(ToPage, (Item) newtup.t_data, tuple_len,
                                         InvalidOffsetNumber, LP_USED);
                    if (newoff == InvalidOffsetNumber)
                    {
                        elog(PANIC, "moving chain: failed to add item with len = %lu to page %u",
                          (unsigned long) tuple_len, destvacpage->blkno);
                    }
                    newitemid = PageGetItemId(ToPage, newoff);
                    pfree(newtup.t_data);
                    newtup.t_datamcxt = NULL;
                    newtup.t_data = (HeapTupleHeader) PageGetItem(ToPage, newitemid);
                    ItemPointerSet(&(newtup.t_self), destvacpage->blkno, newoff);

                    {
                        XLogRecPtr  recptr =
                        log_heap_move(onerel, Cbuf, tuple.t_self,
                                      cur_buffer, &newtup);

                        if (Cbuf != cur_buffer)
                        {
                            PageSetLSN(Cpage, recptr);
                            PageSetSUI(Cpage, ThisStartUpID);
                        }
                        PageSetLSN(ToPage, recptr);
                        PageSetSUI(ToPage, ThisStartUpID);
                    }
                    END_CRIT_SECTION();

                    if (destvacpage->blkno > last_move_dest_block)
                        last_move_dest_block = destvacpage->blkno;

                    /*
                     * Set new tuple's t_ctid pointing to itself for last
                     * tuple in chain, and to next tuple in chain
                     * otherwise.
                     */
                    if (!ItemPointerIsValid(&Ctid))
                        newtup.t_data->t_ctid = newtup.t_self;
                    else
                        newtup.t_data->t_ctid = Ctid;
                    Ctid = newtup.t_self;

                    num_moved++;

                    /*
                     * Remember that we moved tuple from the current page
                     * (corresponding index tuple will be cleaned).
                     */
                    if (Cbuf == buf)
                        vacpage->offsets[vacpage->offsets_free++] =
                            ItemPointerGetOffsetNumber(&(tuple.t_self));
                    else
                        keep_tuples++;

                    LockBuffer(cur_buffer, BUFFER_LOCK_UNLOCK);
                    if (cur_buffer != Cbuf)
                        LockBuffer(Cbuf, BUFFER_LOCK_UNLOCK);

                    /* Create index entries for the moved tuple */
                    if (resultRelInfo->ri_NumIndices > 0)
                    {
                        ExecStoreTuple(&newtup, slot, InvalidBuffer, false);
                        ExecInsertIndexTuples(slot, &(newtup.t_self),
                                              estate, true);
                    }

                    WriteBuffer(cur_buffer);
                    WriteBuffer(Cbuf);
                }
                cur_buffer = InvalidBuffer;
                pfree(vtmove);
                chain_tuple_moved = true;
                continue;
            }
            /* try to find new page for this tuple */
            if (cur_buffer == InvalidBuffer ||
                !enough_space(cur_page, tuple_len))
            {
                if (cur_buffer != InvalidBuffer)
                {
                    WriteBuffer(cur_buffer);
                    cur_buffer = InvalidBuffer;
                }
                for (i = 0; i < num_fraged_pages; i++)
                {
                    if (enough_space(fraged_pages->pagedesc[i], tuple_len))
                        break;
                }
                if (i == num_fraged_pages)
                    break;      /* can't move item anywhere */
                cur_item = i;
                cur_page = fraged_pages->pagedesc[cur_item];
                cur_buffer = ReadBuffer(onerel, cur_page->blkno);
                LockBuffer(cur_buffer, BUFFER_LOCK_EXCLUSIVE);
                ToPage = BufferGetPage(cur_buffer);
                /* if this page was not used before - clean it */
                if (!PageIsEmpty(ToPage) && cur_page->offsets_used == 0)
                    vacuum_page(onerel, cur_buffer, cur_page);
            }
            else
                LockBuffer(cur_buffer, BUFFER_LOCK_EXCLUSIVE);

            LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

            /* copy tuple */
            heap_copytuple_with_tuple(&tuple, &newtup);

            /*
             * register invalidation of source tuple in catcaches.
             *
             * (Note: we do not need to register the copied tuple,
             * because we are not changing the tuple contents and
             * so there cannot be any need to flush negative
             * catcache entries.)
             */
            CacheInvalidateHeapTuple(onerel, &tuple);

            /* NO ELOG(ERROR) TILL CHANGES ARE LOGGED */
            START_CRIT_SECTION();

            /*
             * Mark new tuple as moved_in by vacuum and store vacuum XID
             * in t_cmin.
             */
            TransactionIdStore(myXID, (TransactionId *) &(newtup.t_data->t_cmin));
            newtup.t_data->t_infomask &=
                ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_OFF);
            newtup.t_data->t_infomask |= HEAP_MOVED_IN;

            /* add tuple to the page */
            newoff = PageAddItem(ToPage, (Item) newtup.t_data, tuple_len,
                                 InvalidOffsetNumber, LP_USED);
            if (newoff == InvalidOffsetNumber)
            {
                elog(PANIC, "failed to add item with len = %lu to page %u (free space %lu, nusd %u, noff %u)",
                     (unsigned long) tuple_len,
                     cur_page->blkno, (unsigned long) cur_page->free,
                     cur_page->offsets_used, cur_page->offsets_free);
            }
            newitemid = PageGetItemId(ToPage, newoff);
            pfree(newtup.t_data);
            newtup.t_datamcxt = NULL;
            newtup.t_data = (HeapTupleHeader) PageGetItem(ToPage, newitemid);
            ItemPointerSet(&(newtup.t_data->t_ctid), cur_page->blkno, newoff);
            newtup.t_self = newtup.t_data->t_ctid;

            /*
             * Mark old tuple as moved_off by vacuum and store vacuum XID
             * in t_cmin.
             */
            TransactionIdStore(myXID, (TransactionId *) &(tuple.t_data->t_cmin));
            tuple.t_data->t_infomask &=
                ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_IN);
            tuple.t_data->t_infomask |= HEAP_MOVED_OFF;

            {
                XLogRecPtr  recptr =
                log_heap_move(onerel, buf, tuple.t_self,
                              cur_buffer, &newtup);

                PageSetLSN(page, recptr);
                PageSetSUI(page, ThisStartUpID);
                PageSetLSN(ToPage, recptr);
                PageSetSUI(ToPage, ThisStartUpID);
            }
            END_CRIT_SECTION();

            cur_page->offsets_used++;
            num_moved++;
            cur_page->free = ((PageHeader) ToPage)->pd_upper - ((PageHeader) ToPage)->pd_lower;
            if (cur_page->blkno > last_move_dest_block)
                last_move_dest_block = cur_page->blkno;

            vacpage->offsets[vacpage->offsets_free++] = offnum;

            LockBuffer(cur_buffer, BUFFER_LOCK_UNLOCK);
            LockBuffer(buf, BUFFER_LOCK_UNLOCK);

            /* insert index tuples if needed */
            if (resultRelInfo->ri_NumIndices > 0)
            {
                ExecStoreTuple(&newtup, slot, InvalidBuffer, false);
                ExecInsertIndexTuples(slot, &(newtup.t_self), estate, true);
            }
        }                       /* walk along page */
        if (offnum < maxoff && keep_tuples > 0)
        {
            OffsetNumber off;

            for (off = OffsetNumberNext(offnum);
                 off <= maxoff;
                 off = OffsetNumberNext(off))
            {
                itemid = PageGetItemId(page, off);
                if (!ItemIdIsUsed(itemid))
                    continue;
                tuple.t_datamcxt = NULL;
                tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
                if (tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED)
                    continue;
                if ((TransactionId) tuple.t_data->t_cmin != myXID)
                    elog(ERROR, "Invalid XID in t_cmin (4)");
                if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
                    elog(ERROR, "HEAP_MOVED_IN was not expected (2)");
                if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
                {
                    if (chain_tuple_moved)
                    {
                        /* some chain tuples were moved while cleaning
                         * this page */
                        Assert(vacpage->offsets_free > 0);
                        for (i = 0; i < vacpage->offsets_free; i++)
                        {
                            if (vacpage->offsets[i] == off)
                                break;
                        }
                        if (i >= vacpage->offsets_free)         /* not found */
                        {
                            vacpage->offsets[vacpage->offsets_free++] = off;
                            Assert(keep_tuples > 0);
                            keep_tuples--;
                        }
                    }
                    else
                    {
                        vacpage->offsets[vacpage->offsets_free++] = off;
                        Assert(keep_tuples > 0);
                        keep_tuples--;
                    }
                }
            }
        }

        if (vacpage->offsets_free > 0)  /* some tuples were moved */
        {
            if (chain_tuple_moved)      /* else - they are ordered */
            {
                qsort((char *) (vacpage->offsets), vacpage->offsets_free,
                      sizeof(OffsetNumber), vac_cmp_offno);
            }
            vpage_insert(&Nvacpagelist, copy_vac_page(vacpage));
            WriteBuffer(buf);
        }
        else if (dowrite)
            WriteBuffer(buf);
        else
            ReleaseBuffer(buf);

        if (offnum <= maxoff)
            break;              /* some item(s) left */

    }                           /* walk along relation */

    blkno++;                    /* new number of blocks */

    if (cur_buffer != InvalidBuffer)
    {
        Assert(num_moved > 0);
        WriteBuffer(cur_buffer);
        cur_buffer = InvalidBuffer;
    }
2078 * We have to commit our tuple movings before we truncate the
2079 * relation. Ideally we should do Commit/StartTransactionCommand
2080 * here, relying on the session-level table lock to protect our
2081 * exclusive access to the relation. However, that would require
2082 * a lot of extra code to close and re-open the relation, indexes,
2083 * etc. For now, a quick hack: record status of current
2084 * transaction as committed, and continue.
2086 RecordTransactionCommit();
2090 * We are not going to move any more tuples across pages, but we still
2091 * need to apply vacuum_page to compact free space in the remaining
2092 * pages in vacuum_pages list. Note that some of these pages may also
2093 * be in the fraged_pages list, and may have had tuples moved onto
2094 * them; if so, we already did vacuum_page and needn't do it again.
2096 for (i = 0, curpage = vacuum_pages->pagedesc;
2100 CHECK_FOR_INTERRUPTS();
2101 Assert((*curpage)->blkno < blkno);
2102 if ((*curpage)->offsets_used == 0)
2104 /* this page was not used as a move target, so must clean it */
2105 buf = ReadBuffer(onerel, (*curpage)->blkno);
2106 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2107 page = BufferGetPage(buf);
2108 if (!PageIsEmpty(page))
2109 vacuum_page(onerel, buf, *curpage);
2110 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
2116 * Now scan all the pages that we moved tuples onto and update tuple
2117 * status bits. This is not really necessary, but will save time for
2118 * future transactions examining these tuples.
2120 * XXX WARNING that this code fails to clear HEAP_MOVED_OFF tuples from
2121 * pages that were move source pages but not move dest pages. One
2122 * also wonders whether it wouldn't be better to skip this step and
2123 * let the tuple status updates happen someplace that's not holding an
2124 * exclusive lock on the relation.
	for (i = 0, curpage = fraged_pages->pagedesc;
		 i < num_fraged_pages;
		 i++, curpage++)
	{
		CHECK_FOR_INTERRUPTS();
		Assert((*curpage)->blkno < blkno);
		if ((*curpage)->blkno > last_move_dest_block)
			break;				/* no need to scan any further */
		if ((*curpage)->offsets_used == 0)
			continue;			/* this page was never used as a move dest */
		buf = ReadBuffer(onerel, (*curpage)->blkno);
		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
		page = BufferGetPage(buf);
		num_tuples = 0;
		max_offset = PageGetMaxOffsetNumber(page);
		for (newoff = FirstOffsetNumber;
			 newoff <= max_offset;
			 newoff = OffsetNumberNext(newoff))
		{
			itemid = PageGetItemId(page, newoff);
			if (!ItemIdIsUsed(itemid))
				continue;
			tuple.t_datamcxt = NULL;
			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
			{
				if ((TransactionId) tuple.t_data->t_cmin != myXID)
					elog(ERROR, "Invalid XID in t_cmin (2)");
				if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
				{
					tuple.t_data->t_infomask |= HEAP_XMIN_COMMITTED;
					num_tuples++;
				}
				else if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
					tuple.t_data->t_infomask |= HEAP_XMIN_INVALID;
				else
					elog(ERROR, "HEAP_MOVED_OFF/HEAP_MOVED_IN was expected");
			}
		}
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		WriteBuffer(buf);
		Assert((*curpage)->offsets_used == num_tuples);
		checked_moved += num_tuples;
	}
	Assert(num_moved == checked_moved);

	elog(elevel, "Rel %s: Pages: %u --> %u; Tuple(s) moved: %u.\n\t%s",
		 RelationGetRelationName(onerel),
		 nblocks, blkno, num_moved,
		 vac_show_rusage(&ru0));
	/*
	 * Reflect the motion of system tuples to the catalog cache here.
	 */
	CommandCounterIncrement();
	if (Nvacpagelist.num_pages > 0)
	{
		/* vacuum indexes again if needed */
		if (Irel != (Relation *) NULL)
		{
			VacPage    *vpleft,
					   *vpright,
						vpsave;

			/* re-sort Nvacpagelist.pagedesc */
			for (vpleft = Nvacpagelist.pagedesc,
				 vpright = Nvacpagelist.pagedesc + Nvacpagelist.num_pages - 1;
				 vpleft < vpright; vpleft++, vpright--)
			{
				vpsave = *vpleft;
				*vpleft = *vpright;
				*vpright = vpsave;
			}
			Assert(keep_tuples >= 0);
			for (i = 0; i < nindexes; i++)
				vacuum_index(&Nvacpagelist, Irel[i],
							 vacrelstats->rel_tuples, keep_tuples);
		}

		/* clean moved tuples from last page in Nvacpagelist list */
		if (vacpage->blkno == (blkno - 1) &&
			vacpage->offsets_free > 0)
		{
			OffsetNumber unbuf[BLCKSZ / sizeof(OffsetNumber)];
			OffsetNumber *unused = unbuf;
			int			uncnt;
			XLogRecPtr	recptr;

			buf = ReadBuffer(onerel, vacpage->blkno);
			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
			page = BufferGetPage(buf);
			num_tuples = 0;
			maxoff = PageGetMaxOffsetNumber(page);
			for (offnum = FirstOffsetNumber;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				itemid = PageGetItemId(page, offnum);
				if (!ItemIdIsUsed(itemid))
					continue;
				tuple.t_datamcxt = NULL;
				tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);

				if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
				{
					if ((TransactionId) tuple.t_data->t_cmin != myXID)
						elog(ERROR, "Invalid XID in t_cmin (3)");
					if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
					{
						itemid->lp_flags &= ~LP_USED;
						num_tuples++;
					}
					else
						elog(ERROR, "HEAP_MOVED_OFF was expected (2)");
				}
			}
			Assert(vacpage->offsets_free == num_tuples);

			START_CRIT_SECTION();
			uncnt = PageRepairFragmentation(page, unused);
			/* XLOG stuff */
			recptr = log_heap_clean(onerel, buf, (char *) unused,
						 (char *) (&(unused[uncnt])) - (char *) unused);
			PageSetLSN(page, recptr);
			PageSetSUI(page, ThisStartUpID);
			END_CRIT_SECTION();

			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			WriteBuffer(buf);
		}

		/* now - free the new list of reaped pages */
		curpage = Nvacpagelist.pagedesc;
		for (i = 0; i < Nvacpagelist.num_pages; i++, curpage++)
			pfree(*curpage);
		pfree(Nvacpagelist.pagedesc);
	}
	/*
	 * Flush dirty pages out to disk.  We do this unconditionally, even if
	 * we don't need to truncate, because we want to ensure that all
	 * tuples have correct on-row commit status on disk (see bufmgr.c's
	 * comments for FlushRelationBuffers()).
	 */
	i = FlushRelationBuffers(onerel, blkno);
	if (i < 0)
		elog(ERROR, "VACUUM (repair_frag): FlushRelationBuffers returned %d",
			 i);

	/* truncate relation, if needed */
	if (blkno < nblocks)
	{
		blkno = smgrtruncate(DEFAULT_SMGR, onerel, blkno);
		onerel->rd_nblocks = blkno;		/* update relcache immediately */
		onerel->rd_targblock = InvalidBlockNumber;
		vacrelstats->rel_pages = blkno; /* set new number of blocks */
	}

	/* clean up */
	if (vacrelstats->vtlinks != NULL)
		pfree(vacrelstats->vtlinks);

	ExecDropTupleTable(tupleTable, true);

	ExecCloseIndices(resultRelInfo);
}
/*
 * vacuum_heap() -- free dead tuples
 *
 * This routine marks dead tuples as unused and truncates the relation
 * if there are "empty" end-blocks.
 */
static void
vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages)
{
	Buffer		buf;
	VacPage    *vacpage;
	BlockNumber relblocks;
	int			nblocks;
	int			i;

	nblocks = vacuum_pages->num_pages;
	nblocks -= vacuum_pages->empty_end_pages;	/* empty end pages need no
												 * per-page work */

	for (i = 0, vacpage = vacuum_pages->pagedesc; i < nblocks; i++, vacpage++)
	{
		CHECK_FOR_INTERRUPTS();
		if ((*vacpage)->offsets_free > 0)
		{
			buf = ReadBuffer(onerel, (*vacpage)->blkno);
			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
			vacuum_page(onerel, buf, *vacpage);
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			WriteBuffer(buf);
		}
	}

	/*
	 * Flush dirty pages out to disk.  We do this unconditionally, even if
	 * we don't need to truncate, because we want to ensure that all
	 * tuples have correct on-row commit status on disk (see bufmgr.c's
	 * comments for FlushRelationBuffers()).
	 */
	Assert(vacrelstats->rel_pages >= vacuum_pages->empty_end_pages);
	relblocks = vacrelstats->rel_pages - vacuum_pages->empty_end_pages;

	i = FlushRelationBuffers(onerel, relblocks);
	if (i < 0)
		elog(ERROR, "VACUUM (vacuum_heap): FlushRelationBuffers returned %d",
			 i);

	/* truncate relation if there are some empty end-pages */
	if (vacuum_pages->empty_end_pages > 0)
	{
		elog(elevel, "Rel %s: Pages: %u --> %u.",
			 RelationGetRelationName(onerel),
			 vacrelstats->rel_pages, relblocks);
		relblocks = smgrtruncate(DEFAULT_SMGR, onerel, relblocks);
		onerel->rd_nblocks = relblocks;	/* update relcache immediately */
		onerel->rd_targblock = InvalidBlockNumber;
		vacrelstats->rel_pages = relblocks;		/* set new number of blocks */
	}
}
/*
 * vacuum_page() -- free dead tuples on a page
 *				and repair its fragmentation.
 */
static void
vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage)
{
	OffsetNumber unbuf[BLCKSZ / sizeof(OffsetNumber)];
	OffsetNumber *unused = unbuf;
	int			uncnt;
	Page		page = BufferGetPage(buffer);
	ItemId		itemid;
	XLogRecPtr	recptr;
	int			i;

	/* There shouldn't be any tuples moved onto the page yet! */
	Assert(vacpage->offsets_used == 0);

	START_CRIT_SECTION();
	for (i = 0; i < vacpage->offsets_free; i++)
	{
		itemid = PageGetItemId(page, vacpage->offsets[i]);
		itemid->lp_flags &= ~LP_USED;
	}
	uncnt = PageRepairFragmentation(page, unused);
	/* XLOG stuff */
	recptr = log_heap_clean(onerel, buffer, (char *) unused,
				 (char *) (&(unused[uncnt])) - (char *) unused);
	PageSetLSN(page, recptr);
	PageSetSUI(page, ThisStartUpID);
	END_CRIT_SECTION();
}
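
/*
 * Note the pattern above (a general sketch of the WAL rule, not code
 * specific to this file): the page is modified and its cleanup WAL
 * record is emitted inside a single critical section, and the page LSN
 * is set to the record's location, so the buffer manager cannot write
 * the data page to disk before the corresponding WAL record does.
 */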
/*
 * scan_index() -- scan one index relation to update pg_class statistics.
 *
 * We use this when we have no deletions to do.
 */
static void
scan_index(Relation indrel, double num_tuples)
{
	IndexBulkDeleteResult *stats;
	VacRUsage	ru0;

	vac_init_rusage(&ru0);

	/*
	 * Even though we're not planning to delete anything, use the
	 * ambulkdelete call, so that the scan happens within the index AM
	 * for more speed.
	 */
	stats = index_bulk_delete(indrel, dummy_tid_reaped, NULL);

	if (!stats)
		return;

	/* now update statistics in pg_class */
	vac_update_relstats(RelationGetRelid(indrel),
						stats->num_pages, stats->num_index_tuples,
						false);

	elog(elevel, "Index %s: Pages %u; Tuples %.0f.\n\t%s",
		 RelationGetRelationName(indrel),
		 stats->num_pages, stats->num_index_tuples,
		 vac_show_rusage(&ru0));

	/*
	 * Check for tuple count mismatch.  If the index is partial, then it's
	 * OK for it to have fewer tuples than the heap; else we have trouble.
	 */
	if (stats->num_index_tuples != num_tuples)
	{
		if (stats->num_index_tuples > num_tuples ||
			!vac_is_partial_index(indrel))
			elog(WARNING, "Index %s: NUMBER OF INDEX TUPLES (%.0f) IS NOT THE SAME AS HEAP'S (%.0f)."
				 "\n\tRecreate the index.",
				 RelationGetRelationName(indrel),
				 stats->num_index_tuples, num_tuples);
	}

	pfree(stats);
}
/*
 * vacuum_index() -- vacuum one index relation.
 *
 * vacpagelist is the VacPageList of the heap we're currently vacuuming.
 * It's locked.  Indrel is an index relation on the vacuumed heap.
 *
 * We don't bother to set locks on the index relation here, since
 * the parent table is exclusive-locked already.
 *
 * Finally, we arrange to update the index relation's statistics in
 * pg_class.
 */
static void
vacuum_index(VacPageList vacpagelist, Relation indrel,
			 double num_tuples, int keep_tuples)
{
	IndexBulkDeleteResult *stats;
	VacRUsage	ru0;

	vac_init_rusage(&ru0);

	/* Do bulk deletion */
	stats = index_bulk_delete(indrel, tid_reaped, (void *) vacpagelist);

	if (!stats)
		return;

	/* now update statistics in pg_class */
	vac_update_relstats(RelationGetRelid(indrel),
						stats->num_pages, stats->num_index_tuples,
						false);

	elog(elevel, "Index %s: Pages %u; Tuples %.0f: Deleted %.0f.\n\t%s",
		 RelationGetRelationName(indrel), stats->num_pages,
		 stats->num_index_tuples - keep_tuples, stats->tuples_removed,
		 vac_show_rusage(&ru0));

	/*
	 * Check for tuple count mismatch.  If the index is partial, then it's
	 * OK for it to have fewer tuples than the heap; else we have trouble.
	 */
	if (stats->num_index_tuples != num_tuples + keep_tuples)
	{
		if (stats->num_index_tuples > num_tuples + keep_tuples ||
			!vac_is_partial_index(indrel))
			elog(WARNING, "Index %s: NUMBER OF INDEX TUPLES (%.0f) IS NOT THE SAME AS HEAP'S (%.0f)."
				 "\n\tRecreate the index.",
				 RelationGetRelationName(indrel),
				 stats->num_index_tuples, num_tuples);
	}

	pfree(stats);
}
/*
 * tid_reaped() -- is a particular tid reaped?
 *
 * This has the right signature to be an IndexBulkDeleteCallback.
 *
 * vacpagelist->pagedesc is sorted in the right order.
 */
static bool
tid_reaped(ItemPointer itemptr, void *state)
{
	VacPageList vacpagelist = (VacPageList) state;
	OffsetNumber ioffno;
	OffsetNumber *voff;
	VacPage		vp,
			   *vpp;
	VacPageData vacpage;

	vacpage.blkno = ItemPointerGetBlockNumber(itemptr);
	ioffno = ItemPointerGetOffsetNumber(itemptr);

	vp = &vacpage;
	vpp = (VacPage *) vac_bsearch((void *) &vp,
								  (void *) (vacpagelist->pagedesc),
								  vacpagelist->num_pages,
								  sizeof(VacPage),
								  vac_cmp_blk);

	if (vpp == NULL)
		return false;

	/* ok - we are on a partially or fully reaped page */
	vp = *vpp;

	if (vp->offsets_free == 0)
	{
		/* this is an empty page, so claim all tuples on it are reaped!!! */
		return true;
	}

	voff = (OffsetNumber *) vac_bsearch((void *) &ioffno,
										(void *) (vp->offsets),
										vp->offsets_free,
										sizeof(OffsetNumber),
										vac_cmp_offno);

	if (voff == NULL)
		return false;

	/* tid is reaped */
	return true;
}
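
/*
 * The lookup above is two nested binary searches: first find the
 * VacPage for the TID's block number in pagedesc, then probe that
 * page's sorted offsets[] array.  Sketch with made-up values: is TID
 * (42,7) reaped?  bsearch pagedesc for blkno 42; if found and the page
 * isn't wholly empty, bsearch its offsets[] for offset 7.
 */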
/*
 * Dummy version for scan_index.
 */
static bool
dummy_tid_reaped(ItemPointer itemptr, void *state)
{
	return false;
}
/*
 * Update the shared Free Space Map with the info we now have about
 * free space in the relation, discarding any old info the map may have.
 */
static void
vac_update_fsm(Relation onerel, VacPageList fraged_pages,
			   BlockNumber rel_pages)
{
	int			nPages = fraged_pages->num_pages;
	int			i;
	BlockNumber *pages;
	Size	   *spaceAvail;

	/* +1 to avoid palloc(0) */
	pages = (BlockNumber *) palloc((nPages + 1) * sizeof(BlockNumber));
	spaceAvail = (Size *) palloc((nPages + 1) * sizeof(Size));

	for (i = 0; i < nPages; i++)
	{
		pages[i] = fraged_pages->pagedesc[i]->blkno;
		spaceAvail[i] = fraged_pages->pagedesc[i]->free;

		/*
		 * fraged_pages may contain entries for pages that we later
		 * decided to truncate from the relation; don't enter them into
		 * the free space map!
		 */
		if (pages[i] >= rel_pages)
		{
			nPages = i;
			break;
		}
	}

	MultiRecordFreeSpace(&onerel->rd_node,
						 0, MaxBlockNumber,
						 nPages, pages, spaceAvail);

	pfree(pages);
	pfree(spaceAvail);
}
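
/*
 * Sketch of what gets handed to the FSM (illustrative values only):
 * parallel arrays of block numbers and free bytes, in block-number
 * order, e.g.
 *
 *		pages:      {   3,   17,   42 }
 *		spaceAvail: { 212, 8040, 1536 }
 *
 * Discarding the map's old info is fine here, since VACUUM has just
 * computed authoritative free-space figures for every interesting page.
 */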
/* Copy a VacPage structure */
static VacPage
copy_vac_page(VacPage vacpage)
{
	VacPage		newvacpage;

	/* allocate a VacPageData entry */
	newvacpage = (VacPage) palloc(sizeof(VacPageData) +
						   vacpage->offsets_free * sizeof(OffsetNumber));

	/* fill it in */
	if (vacpage->offsets_free > 0)
		memcpy(newvacpage->offsets, vacpage->offsets,
			   vacpage->offsets_free * sizeof(OffsetNumber));
	newvacpage->blkno = vacpage->blkno;
	newvacpage->free = vacpage->free;
	newvacpage->offsets_used = vacpage->offsets_used;
	newvacpage->offsets_free = vacpage->offsets_free;

	return newvacpage;
}
/*
 * Add a VacPage pointer to a VacPageList.
 *
 * As a side effect of the way that scan_heap works,
 * higher pages come after lower pages in the array
 * (and the highest tid on a page is last).
 */
static void
vpage_insert(VacPageList vacpagelist, VacPage vpnew)
{
#define PG_NPAGEDESC 1024

	/* allocate a VacPage entry if needed */
	if (vacpagelist->num_pages == 0)
	{
		vacpagelist->pagedesc = (VacPage *) palloc(PG_NPAGEDESC * sizeof(VacPage));
		vacpagelist->num_allocated_pages = PG_NPAGEDESC;
	}
	else if (vacpagelist->num_pages >= vacpagelist->num_allocated_pages)
	{
		vacpagelist->num_allocated_pages *= 2;
		vacpagelist->pagedesc = (VacPage *)
			repalloc(vacpagelist->pagedesc,
					 vacpagelist->num_allocated_pages * sizeof(VacPage));
	}
	vacpagelist->pagedesc[vacpagelist->num_pages] = vpnew;
	(vacpagelist->num_pages)++;
}
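
/*
 * The pagedesc array grows by doubling (1024, 2048, 4096, ... entries),
 * so n insertions cost only O(n) amortized copying work: across all
 * repallocs, each element is copied fewer than two times on average.
 */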
/*
 * vac_bsearch: just like standard C library routine bsearch(),
 * except that we first test to see whether the target key is outside
 * the range of the table entries.  This case is handled relatively slowly
 * by the normal binary search algorithm (ie, no faster than any other key)
 * but it occurs often enough in VACUUM to be worth optimizing.
 */
static void *
vac_bsearch(const void *key, const void *base,
			size_t nelem, size_t size,
			int (*compar) (const void *, const void *))
{
	int			res;
	const void *last;

	if (nelem == 0)
		return NULL;
	res = compar(key, base);
	if (res < 0)
		return NULL;
	if (res == 0)
		return (void *) base;
	if (nelem > 1)
	{
		last = (const void *) ((const char *) base + (nelem - 1) * size);
		res = compar(key, last);
		if (res > 0)
			return NULL;
		if (res == 0)
			return (void *) last;
	}
	if (nelem <= 2)
		return NULL;			/* already checked 'em all */

	return bsearch(key, base, nelem, size, compar);
}
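
/*
 * Illustration of the endpoint shortcut (made-up keys): with entries
 * {3, 17, 42}, a probe for 2 is rejected by the single compare against
 * 3, and a probe for 99 by the single compare against 42, whereas a
 * plain bsearch() would spend ~log2(nelem) compares rejecting the same
 * out-of-range keys.  VACUUM hits this case frequently, since many TIDs
 * handed to tid_reaped() lie outside the reaped range.
 */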
/*
 * Comparator routines for use with qsort() and bsearch().
 */
static int
vac_cmp_blk(const void *left, const void *right)
{
	BlockNumber lblk,
				rblk;

	lblk = (*((VacPage *) left))->blkno;
	rblk = (*((VacPage *) right))->blkno;

	if (lblk < rblk)
		return -1;
	if (lblk == rblk)
		return 0;
	return 1;
}

static int
vac_cmp_offno(const void *left, const void *right)
{
	if (*(OffsetNumber *) left < *(OffsetNumber *) right)
		return -1;
	if (*(OffsetNumber *) left == *(OffsetNumber *) right)
		return 0;
	return 1;
}
static int
vac_cmp_vtlinks(const void *left, const void *right)
{
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi <
		((VTupleLink) right)->new_tid.ip_blkid.bi_hi)
		return -1;
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi >
		((VTupleLink) right)->new_tid.ip_blkid.bi_hi)
		return 1;
	/* bi_hi-es are equal */
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo <
		((VTupleLink) right)->new_tid.ip_blkid.bi_lo)
		return -1;
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo >
		((VTupleLink) right)->new_tid.ip_blkid.bi_lo)
		return 1;
	/* bi_lo-es are equal */
	if (((VTupleLink) left)->new_tid.ip_posid <
		((VTupleLink) right)->new_tid.ip_posid)
		return -1;
	if (((VTupleLink) left)->new_tid.ip_posid >
		((VTupleLink) right)->new_tid.ip_posid)
		return 1;
	/* new_tids are equal */
	return 0;
}
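
/*
 * In effect this compares two ItemPointers lexicographically on
 * (bi_hi, bi_lo, ip_posid), i.e. by block number and then by offset;
 * this is the ordering repair_frag depends on when it sorts and then
 * binary-searches the vtlinks array by new_tid.
 */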
void
vac_open_indexes(Relation relation, int *nindexes, Relation **Irel)
{
	List	   *indexoidlist,
			   *indexoidscan;
	int			i;

	indexoidlist = RelationGetIndexList(relation);

	*nindexes = length(indexoidlist);

	if (*nindexes > 0)
		*Irel = (Relation *) palloc(*nindexes * sizeof(Relation));
	else
		*Irel = NULL;

	i = 0;
	foreach(indexoidscan, indexoidlist)
	{
		Oid			indexoid = lfirsti(indexoidscan);

		(*Irel)[i] = index_open(indexoid);
		i++;
	}

	freeList(indexoidlist);
}

void
vac_close_indexes(int nindexes, Relation *Irel)
{
	if (Irel == (Relation *) NULL)
		return;

	while (nindexes--)
		index_close(Irel[nindexes]);
	pfree(Irel);
}
/*
 * Is an index partial (ie, could it contain fewer tuples than the heap?)
 */
bool
vac_is_partial_index(Relation indrel)
{
	/*
	 * If the index's AM doesn't support nulls, it's partial for our
	 * purposes.
	 */
	if (!indrel->rd_am->amindexnulls)
		return true;

	/* Otherwise, look to see if there's a partial-index predicate */
	return (VARSIZE(&indrel->rd_index->indpred) > VARHDRSZ);
}
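
/*
 * Example of the predicate test (illustrative): an index built with
 * CREATE INDEX ... WHERE <condition> stores the condition in indpred,
 * so its VARSIZE() exceeds the bare varlena header; an ordinary index
 * stores an empty predicate and fails the comparison.
 */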
static bool
enough_space(VacPage vacpage, Size len)
{
	len = MAXALIGN(len);

	if (len > vacpage->free)
		return false;

	/* if there are free itemid(s) and len <= free_space... */
	if (vacpage->offsets_used < vacpage->offsets_free)
		return true;

	/* noff_used >= noff_free and so we'll have to allocate a new itemid */
	if (len + sizeof(ItemIdData) <= vacpage->free)
		return true;

	return false;
}
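
/*
 * Worked example (made-up numbers): with vacpage->free = 48 bytes, a
 * MAXALIGN'd tuple of len = 40 always fits if a recyclable line pointer
 * exists; otherwise a new ItemIdData (4 bytes) must come out of the
 * same 48 bytes, so 40 + 4 <= 48 still fits but len = 46 would not.
 */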
/*
 * Initialize usage snapshot.
 */
void
vac_init_rusage(VacRUsage *ru0)
{
	struct timezone tz;

	getrusage(RUSAGE_SELF, &ru0->ru);
	gettimeofday(&ru0->tv, &tz);
}

/*
 * Compute elapsed time since ru0 usage snapshot, and format into
 * a displayable string.  Result is in a static string, which is
 * tacky, but no one ever claimed that the Postgres backend is
 * threadable...
 */
const char *
vac_show_rusage(VacRUsage *ru0)
{
	static char result[100];
	VacRUsage	ru1;

	vac_init_rusage(&ru1);

	if (ru1.tv.tv_usec < ru0->tv.tv_usec)
	{
		ru1.tv.tv_sec--;
		ru1.tv.tv_usec += 1000000;
	}
	if (ru1.ru.ru_stime.tv_usec < ru0->ru.ru_stime.tv_usec)
	{
		ru1.ru.ru_stime.tv_sec--;
		ru1.ru.ru_stime.tv_usec += 1000000;
	}
	if (ru1.ru.ru_utime.tv_usec < ru0->ru.ru_utime.tv_usec)
	{
		ru1.ru.ru_utime.tv_sec--;
		ru1.ru.ru_utime.tv_usec += 1000000;
	}
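
	/*
	 * The three fixups above are manual borrowing for timeval
	 * subtraction.  Worked example (made-up values): 5s/100000us minus
	 * 3s/900000us first borrows a second to become 4s/1100000us, giving
	 * 1s/200000us, which the snprintf below prints as "1.20 sec".
	 */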
	snprintf(result, sizeof(result),
			 "CPU %d.%02ds/%d.%02du sec elapsed %d.%02d sec.",
			 (int) (ru1.ru.ru_stime.tv_sec - ru0->ru.ru_stime.tv_sec),
			 (int) (ru1.ru.ru_stime.tv_usec - ru0->ru.ru_stime.tv_usec) / 10000,
			 (int) (ru1.ru.ru_utime.tv_sec - ru0->ru.ru_utime.tv_sec),
			 (int) (ru1.ru.ru_utime.tv_usec - ru0->ru.ru_utime.tv_usec) / 10000,
			 (int) (ru1.tv.tv_sec - ru0->tv.tv_sec),
			 (int) (ru1.tv.tv_usec - ru0->tv.tv_usec) / 10000);

	return result;
}