* vacuum.c
* The postgres vacuum cleaner.
*
- * This file includes the "full" version of VACUUM, as well as control code
- * used by all three of full VACUUM, lazy VACUUM, and ANALYZE. See
- * vacuumlazy.c and analyze.c for the rest of the code for the latter two.
+ * This file now includes only control and dispatch code for VACUUM and
+ * ANALYZE commands. Regular VACUUM is implemented in vacuumlazy.c,
+ * ANALYZE in analyze.c, and VACUUM FULL is a variant of CLUSTER, handled
+ * in cluster.c.
*
*
- * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.370 2008/03/26 18:48:59 alvherre Exp $
+ * src/backend/commands/vacuum.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
-#include <sys/time.h>
-#include <unistd.h>
+#include <math.h>
#include "access/clog.h"
#include "access/genam.h"
#include "access/heapam.h"
+#include "access/htup_details.h"
+#include "access/multixact.h"
#include "access/transam.h"
#include "access/xact.h"
-#include "access/xlog.h"
#include "catalog/namespace.h"
#include "catalog/pg_database.h"
#include "catalog/pg_namespace.h"
-#include "commands/dbcommands.h"
+#include "commands/cluster.h"
#include "commands/vacuum.h"
-#include "executor/executor.h"
#include "miscadmin.h"
+#include "pgstat.h"
#include "postmaster/autovacuum.h"
-#include "storage/freespace.h"
+#include "storage/bufmgr.h"
+#include "storage/lmgr.h"
#include "storage/proc.h"
#include "storage/procarray.h"
#include "utils/acl.h"
-#include "utils/builtins.h"
-#include "utils/flatfiles.h"
#include "utils/fmgroids.h"
-#include "utils/inval.h"
-#include "utils/lsyscache.h"
+#include "utils/guc.h"
#include "utils/memutils.h"
-#include "utils/pg_rusage.h"
-#include "utils/relcache.h"
#include "utils/snapmgr.h"
#include "utils/syscache.h"
-#include "pgstat.h"
+#include "utils/tqual.h"
/*
* GUC parameters
*/
int vacuum_freeze_min_age;
+int vacuum_freeze_table_age;
+int vacuum_multixact_freeze_min_age;
+int vacuum_multixact_freeze_table_age;
-/*
- * VacPage structures keep track of each page on which we find useful
- * amounts of free space.
- */
-typedef struct VacPageData
-{
- BlockNumber blkno; /* BlockNumber of this Page */
- Size free; /* FreeSpace on this Page */
- uint16 offsets_used; /* Number of OffNums used by vacuum */
- uint16 offsets_free; /* Number of OffNums free or to be free */
- OffsetNumber offsets[1]; /* Array of free OffNums */
-} VacPageData;
-
-typedef VacPageData *VacPage;
-
-typedef struct VacPageListData
-{
- BlockNumber empty_end_pages; /* Number of "empty" end-pages */
- int num_pages; /* Number of pages in pagedesc */
- int num_allocated_pages; /* Number of allocated pages in
- * pagedesc */
- VacPage *pagedesc; /* Descriptions of pages */
-} VacPageListData;
-
-typedef VacPageListData *VacPageList;
-
-/*
- * The "vtlinks" array keeps information about each recently-updated tuple
- * ("recent" meaning its XMAX is too new to let us recycle the tuple).
- * We store the tuple's own TID as well as its t_ctid (its link to the next
- * newer tuple version). Searching in this array allows us to follow update
- * chains backwards from newer to older tuples. When we move a member of an
- * update chain, we must move *all* the live members of the chain, so that we
- * can maintain their t_ctid link relationships (we must not just overwrite
- * t_ctid in an existing tuple).
- *
- * Note: because t_ctid links can be stale (this would only occur if a prior
- * VACUUM crashed partway through), it is possible that new_tid points to an
- * empty slot or unrelated tuple. We have to check the linkage as we follow
- * it, just as is done in EvalPlanQual.
- */
-typedef struct VTupleLinkData
-{
- ItemPointerData new_tid; /* t_ctid of an updated tuple */
- ItemPointerData this_tid; /* t_self of the tuple */
-} VTupleLinkData;
-
-typedef VTupleLinkData *VTupleLink;
-
-/*
- * We use an array of VTupleMoveData to plan a chain tuple move fully
- * before we do it.
- */
-typedef struct VTupleMoveData
-{
- ItemPointerData tid; /* tuple ID */
- VacPage vacpage; /* where to move it to */
- bool cleanVpd; /* clean vacpage before using? */
-} VTupleMoveData;
-
-typedef VTupleMoveData *VTupleMove;
-
-/*
- * VRelStats contains the data acquired by scan_heap for use later
- */
-typedef struct VRelStats
-{
- /* miscellaneous statistics */
- BlockNumber rel_pages; /* pages in relation */
- double rel_tuples; /* tuples that remain after vacuuming */
- double rel_indexed_tuples; /* indexed tuples that remain */
- Size min_tlen; /* min surviving tuple size */
- Size max_tlen; /* max surviving tuple size */
- bool hasindex;
- /* vtlinks array for tuple chain following - sorted by new_tid */
- int num_vtlinks;
- VTupleLink vtlinks;
-} VRelStats;
-
-/*----------------------------------------------------------------------
- * ExecContext:
- *
- * As these variables always appear together, we put them into one struct
- * and pull initialization and cleanup into separate routines.
- * ExecContext is used by repair_frag() and move_xxx_tuple(). More
- * accurately: It is *used* only in move_xxx_tuple(), but because this
- * routine is called many times, we initialize the struct just once in
- * repair_frag() and pass it on to move_xxx_tuple().
- */
-typedef struct ExecContextData
-{
- ResultRelInfo *resultRelInfo;
- EState *estate;
- TupleTableSlot *slot;
-} ExecContextData;
-
-typedef ExecContextData *ExecContext;
-
-static void
-ExecContext_Init(ExecContext ec, Relation rel)
-{
- TupleDesc tupdesc = RelationGetDescr(rel);
-
- /*
- * We need a ResultRelInfo and an EState so we can use the regular
- * executor's index-entry-making machinery.
- */
- ec->estate = CreateExecutorState();
-
- ec->resultRelInfo = makeNode(ResultRelInfo);
- ec->resultRelInfo->ri_RangeTableIndex = 1; /* dummy */
- ec->resultRelInfo->ri_RelationDesc = rel;
- ec->resultRelInfo->ri_TrigDesc = NULL; /* we don't fire triggers */
-
- ExecOpenIndices(ec->resultRelInfo);
-
- ec->estate->es_result_relations = ec->resultRelInfo;
- ec->estate->es_num_result_relations = 1;
- ec->estate->es_result_relation_info = ec->resultRelInfo;
-
- /* Set up a tuple slot too */
- ec->slot = MakeSingleTupleTableSlot(tupdesc);
-}
-
-static void
-ExecContext_Finish(ExecContext ec)
-{
- ExecDropSingleTupleTableSlot(ec->slot);
- ExecCloseIndices(ec->resultRelInfo);
- FreeExecutorState(ec->estate);
-}
-
-/*
- * End of ExecContext Implementation
- *----------------------------------------------------------------------
- */
/* A few variables that don't seem worth passing around as parameters */
static MemoryContext vac_context = NULL;
-
-static int elevel = -1;
-
-static TransactionId OldestXmin;
-static TransactionId FreezeLimit;
-
static BufferAccessStrategy vac_strategy;
/* non-export function prototypes */
-static List *get_rel_oids(List *relids, const RangeVar *vacrel,
- const char *stmttype);
-static void vac_truncate_clog(TransactionId frozenXID);
-static void vacuum_rel(Oid relid, VacuumStmt *vacstmt, char expected_relkind,
- bool for_wraparound);
-static void full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt);
-static void scan_heap(VRelStats *vacrelstats, Relation onerel,
- VacPageList vacuum_pages, VacPageList fraged_pages);
-static void repair_frag(VRelStats *vacrelstats, Relation onerel,
- VacPageList vacuum_pages, VacPageList fraged_pages,
- int nindexes, Relation *Irel);
-static void move_chain_tuple(Relation rel,
- Buffer old_buf, Page old_page, HeapTuple old_tup,
- Buffer dst_buf, Page dst_page, VacPage dst_vacpage,
- ExecContext ec, ItemPointer ctid, bool cleanVpd);
-static void move_plain_tuple(Relation rel,
- Buffer old_buf, Page old_page, HeapTuple old_tup,
- Buffer dst_buf, Page dst_page, VacPage dst_vacpage,
- ExecContext ec);
-static void update_hint_bits(Relation rel, VacPageList fraged_pages,
- int num_fraged_pages, BlockNumber last_move_dest_block,
- int num_moved);
-static void vacuum_heap(VRelStats *vacrelstats, Relation onerel,
- VacPageList vacpagelist);
-static void vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage);
-static void vacuum_index(VacPageList vacpagelist, Relation indrel,
- double num_tuples, int keep_tuples);
-static void scan_index(Relation indrel, double num_tuples);
-static bool tid_reaped(ItemPointer itemptr, void *state);
-static void vac_update_fsm(Relation onerel, VacPageList fraged_pages,
- BlockNumber rel_pages);
-static VacPage copy_vac_page(VacPage vacpage);
-static void vpage_insert(VacPageList vacpagelist, VacPage vpnew);
-static void *vac_bsearch(const void *key, const void *base,
- size_t nelem, size_t size,
- int (*compar) (const void *, const void *));
-static int vac_cmp_blk(const void *left, const void *right);
-static int vac_cmp_offno(const void *left, const void *right);
-static int vac_cmp_vtlinks(const void *left, const void *right);
-static bool enough_space(VacPage vacpage, Size len);
-static Size PageGetFreeSpaceWithFillFactor(Relation relation, Page page);
-
-
-/****************************************************************************
- * *
- * Code common to all flavors of VACUUM and ANALYZE *
- * *
- ****************************************************************************
- */
+static List *get_rel_oids(Oid relid, const RangeVar *vacrel);
+static void vac_truncate_clog(TransactionId frozenXID, MultiXactId minMulti);
+static bool vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast,
+ bool for_wraparound);
/*
* Primary entry point for VACUUM and ANALYZE commands.
*
- * relids is normally NIL; if it is not, then it provides the list of
- * relation OIDs to be processed, and vacstmt->relation is ignored.
- * (The non-NIL case is currently only used by autovacuum.)
+ * relid is normally InvalidOid; if it is not, then it provides the relation
+ * OID to be processed, and vacstmt->relation is ignored. (The non-invalid
+ * case is currently only used by autovacuum.)
+ *
+ * do_toast is passed as FALSE by autovacuum, because it processes TOAST
+ * tables separately.
*
* for_wraparound is used by autovacuum to let us know when it's forcing
- * a vacuum for wraparound, which should not be auto-cancelled.
+ * a vacuum for wraparound, which should not be auto-canceled.
*
* bstrategy is normally given as NULL, but in autovacuum it can be passed
* in to use the same buffer strategy object across multiple vacuum() calls.
*
* isTopLevel should be passed down from ProcessUtility.
*
- * It is the caller's responsibility that vacstmt, relids, and bstrategy
+ * It is the caller's responsibility that vacstmt and bstrategy
* (if given) be allocated in a memory context that won't disappear
* at transaction commit.
*/
void
-vacuum(VacuumStmt *vacstmt, List *relids,
+vacuum(VacuumStmt *vacstmt, Oid relid, bool do_toast,
BufferAccessStrategy bstrategy, bool for_wraparound, bool isTopLevel)
{
- const char *stmttype = vacstmt->vacuum ? "VACUUM" : "ANALYZE";
- volatile MemoryContext anl_context = NULL;
- volatile bool all_rels,
- in_outer_xact,
+ const char *stmttype;
+ volatile bool in_outer_xact,
use_own_xacts;
List *relations;
- if (vacstmt->verbose)
- elevel = INFO;
- else
- elevel = DEBUG2;
+ /* sanity checks on options */
+ Assert(vacstmt->options & (VACOPT_VACUUM | VACOPT_ANALYZE));
+ Assert((vacstmt->options & VACOPT_VACUUM) ||
+ !(vacstmt->options & (VACOPT_FULL | VACOPT_FREEZE)));
+ Assert((vacstmt->options & VACOPT_ANALYZE) || vacstmt->va_cols == NIL);
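+
+ /*
+ * For illustration (grammar behavior assumed, not verified here): a plain
+ * "VACUUM FULL ANALYZE foo" would arrive with options including
+ * VACOPT_VACUUM | VACOPT_FULL | VACOPT_ANALYZE, which satisfies all
+ * three assertions above.
+ */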
+
+ stmttype = (vacstmt->options & VACOPT_VACUUM) ? "VACUUM" : "ANALYZE";
/*
* We cannot run VACUUM inside a user transaction block; if we were inside
* a transaction, then our commit- and start-transaction-command calls
- * would not have the intended effect! Furthermore, the forced commit that
- * occurs before truncating the relation's file would have the effect of
- * committing the rest of the user's transaction too, which would
- * certainly not be the desired behavior. (This only applies to VACUUM
- * FULL, though. We could in theory run lazy VACUUM inside a transaction
- * block, but we choose to disallow that case because we'd rather commit
- * as soon as possible after finishing the vacuum. This is mainly so that
- * we can let go the AccessExclusiveLock that we may be holding.)
+ * would not have the intended effect! There are numerous other subtle
+ * dependencies on this, too.
*
* ANALYZE (without VACUUM) can run either way.
*/
- if (vacstmt->vacuum)
+ if (vacstmt->options & VACOPT_VACUUM)
{
PreventTransactionChain(isTopLevel, stmttype);
in_outer_xact = false;
* Send info about dead objects to the statistics collector, unless we are
* in autovacuum --- autovacuum.c does this for itself.
*/
- if (vacstmt->vacuum && !IsAutoVacuumWorkerProcess())
- pgstat_vacuum_tabstat();
+ if ((vacstmt->options & VACOPT_VACUUM) && !IsAutoVacuumWorkerProcess())
+ pgstat_vacuum_stat();
/*
* Create special memory context for cross-transaction storage.
}
vac_strategy = bstrategy;
- /* Remember whether we are processing everything in the DB */
- all_rels = (relids == NIL && vacstmt->relation == NULL);
-
/*
* Build list of relations to process, unless caller gave us one. (If we
* build one, we put it in vac_context for safekeeping.)
*/
- relations = get_rel_oids(relids, vacstmt->relation, stmttype);
+ relations = get_rel_oids(relid, vacstmt->relation);
/*
* Decide whether we need to start/commit our own transactions.
* transaction block, and also in an autovacuum worker, use own
* transactions so we can release locks sooner.
*/
- if (vacstmt->vacuum)
+ if (vacstmt->options & VACOPT_VACUUM)
use_own_xacts = true;
else
{
- Assert(vacstmt->analyze);
+ Assert(vacstmt->options & VACOPT_ANALYZE);
if (IsAutoVacuumWorkerProcess())
use_own_xacts = true;
else if (in_outer_xact)
use_own_xacts = false;
}
- /*
- * If we are running ANALYZE without per-table transactions, we'll need a
- * memory context with table lifetime.
- */
- if (!use_own_xacts)
- anl_context = AllocSetContextCreate(PortalContext,
- "Analyze",
- ALLOCSET_DEFAULT_MINSIZE,
- ALLOCSET_DEFAULT_INITSIZE,
- ALLOCSET_DEFAULT_MAXSIZE);
-
/*
* vacuum_rel expects to be entered with no transaction active; it will
* start and commit its own transaction. But we are called by an SQL
*/
if (use_own_xacts)
{
+ /* ActiveSnapshot is not set by autovacuum */
+ if (ActiveSnapshotSet())
+ PopActiveSnapshot();
+
/* matches the StartTransaction in PostgresMain() */
CommitTransactionCommand();
}
VacuumCostActive = (VacuumCostDelay > 0);
VacuumCostBalance = 0;
+ VacuumPageHit = 0;
+ VacuumPageMiss = 0;
+ VacuumPageDirty = 0;
/*
* Loop to process each selected relation.
{
Oid relid = lfirst_oid(cur);
- if (vacstmt->vacuum)
- vacuum_rel(relid, vacstmt, RELKIND_RELATION, for_wraparound);
-
- if (vacstmt->analyze)
+ if (vacstmt->options & VACOPT_VACUUM)
{
- MemoryContext old_context = NULL;
+ if (!vacuum_rel(relid, vacstmt, do_toast, for_wraparound))
+ continue;
+ }
+ if (vacstmt->options & VACOPT_ANALYZE)
+ {
/*
* If using separate xacts, start one for analyze. Otherwise,
- * we can use the outer transaction, but we still need to call
- * analyze_rel in a memory context that will be cleaned up on
- * return (else we leak memory while processing multiple
- * tables).
+ * we can use the outer transaction.
*/
if (use_own_xacts)
{
StartTransactionCommand();
/* functions in indexes may want a snapshot set */
- ActiveSnapshot = CopySnapshot(GetTransactionSnapshot());
+ PushActiveSnapshot(GetTransactionSnapshot());
}
- else
- old_context = MemoryContextSwitchTo(anl_context);
analyze_rel(relid, vacstmt, vac_strategy);
if (use_own_xacts)
- CommitTransactionCommand();
- else
{
- MemoryContextSwitchTo(old_context);
- MemoryContextResetAndDeleteChildren(anl_context);
+ PopActiveSnapshot();
+ CommitTransactionCommand();
}
}
}
StartTransactionCommand();
}
- if (vacstmt->vacuum && !IsAutoVacuumWorkerProcess())
+ if ((vacstmt->options & VACOPT_VACUUM) && !IsAutoVacuumWorkerProcess())
{
/*
* Update pg_database.datfrozenxid, and truncate pg_clog if possible.
* (autovacuum.c does this for itself.)
*/
vac_update_datfrozenxid();
-
- /*
- * If it was a database-wide VACUUM, print FSM usage statistics (we
- * don't make you be superuser to see these). We suppress this in
- * autovacuum, too.
- */
- if (all_rels)
- PrintFreeSpaceMapStatistics(elevel);
}
/*
*/
MemoryContextDelete(vac_context);
vac_context = NULL;
-
- if (anl_context)
- MemoryContextDelete(anl_context);
}
/*
* per-relation transactions.
*/
static List *
-get_rel_oids(List *relids, const RangeVar *vacrel, const char *stmttype)
+get_rel_oids(Oid relid, const RangeVar *vacrel)
{
List *oid_list = NIL;
MemoryContext oldcontext;
- /* List supplied by VACUUM's caller? */
- if (relids)
- return relids;
-
- if (vacrel)
+ /* OID supplied by VACUUM's caller? */
+ if (OidIsValid(relid))
+ {
+ oldcontext = MemoryContextSwitchTo(vac_context);
+ oid_list = lappend_oid(oid_list, relid);
+ MemoryContextSwitchTo(oldcontext);
+ }
+ else if (vacrel)
{
/* Process a specific relation */
Oid relid;
- relid = RangeVarGetRelid(vacrel, false);
+ /*
+ * Since we don't take a lock here, the relation might be gone, or the
+ * RangeVar might no longer refer to the OID we look up here. In the
+ * former case, VACUUM will do nothing; in the latter case, it will
+ * process the OID we looked up here, rather than the new one. Neither
+ * is ideal, but there's little practical alternative, since we're
+ * going to commit this transaction and begin a new one between now
+ * and then.
+ */
+ relid = RangeVarGetRelid(vacrel, NoLock, false);
/* Make a relation list entry for this guy */
oldcontext = MemoryContextSwitchTo(vac_context);
}
else
{
- /* Process all plain relations listed in pg_class */
+ /*
+ * Process all plain relations and materialized views listed in
+ * pg_class
+ */
Relation pgclass;
HeapScanDesc scan;
HeapTuple tuple;
- ScanKeyData key;
-
- ScanKeyInit(&key,
- Anum_pg_class_relkind,
- BTEqualStrategyNumber, F_CHAREQ,
- CharGetDatum(RELKIND_RELATION));
pgclass = heap_open(RelationRelationId, AccessShareLock);
- scan = heap_beginscan(pgclass, SnapshotNow, 1, &key);
+ scan = heap_beginscan_catalog(pgclass, 0, NULL);
while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
{
+ Form_pg_class classForm = (Form_pg_class) GETSTRUCT(tuple);
+
+ if (classForm->relkind != RELKIND_RELATION &&
+ classForm->relkind != RELKIND_MATVIEW)
+ continue;
+
/* Make a relation list entry for this guy */
oldcontext = MemoryContextSwitchTo(vac_context);
oid_list = lappend_oid(oid_list, HeapTupleGetOid(tuple));
/*
* vacuum_set_xid_limits() -- compute oldest-Xmin and freeze cutoff points
+ *
+ * The output parameters are:
+ * - oldestXmin is the cutoff value used to distinguish whether tuples are
+ * DEAD or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum).
+ * - freezeLimit is the Xid below which all Xids are replaced by
+ * FrozenTransactionId during vacuum.
+ * - xidFullScanLimit (computed from the freeze_table_age parameter)
+ * represents a minimum Xid value; a table whose relfrozenxid is older than
+ * this will have a full-table vacuum applied to it, to freeze tuples across
+ * the whole table. Vacuuming a table younger than this value can use a
+ * partial scan.
+ * - multiXactCutoff is the value below which all MultiXactIds are removed from
+ * Xmax.
+ * - mxactFullScanLimit is a value against which a table's relminmxid value is
+ * compared to produce a full-table vacuum, as with xidFullScanLimit.
+ *
+ * xidFullScanLimit and mxactFullScanLimit can be passed as NULL if caller is
+ * not interested.
*/
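
/*
 * Example call (a sketch only; negative ages mean "use the corresponding
 * GUC", per the handling in the function body below):
 *
 *	vacuum_set_xid_limits(onerel, -1, -1, -1, -1,
 *		&OldestXmin, &FreezeLimit, &xidFullScanLimit,
 *		&MultiXactCutoff, &mxactFullScanLimit);
 */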
void
-vacuum_set_xid_limits(int freeze_min_age, bool sharedRel,
+vacuum_set_xid_limits(Relation rel,
+ int freeze_min_age,
+ int freeze_table_age,
+ int multixact_freeze_min_age,
+ int multixact_freeze_table_age,
TransactionId *oldestXmin,
- TransactionId *freezeLimit)
+ TransactionId *freezeLimit,
+ TransactionId *xidFullScanLimit,
+ MultiXactId *multiXactCutoff,
+ MultiXactId *mxactFullScanLimit)
{
int freezemin;
+ int mxid_freezemin;
TransactionId limit;
TransactionId safeLimit;
+ MultiXactId mxactLimit;
+ MultiXactId safeMxactLimit;
/*
- * We can always ignore processes running lazy vacuum. This is because we
+ * We can always ignore processes running lazy vacuum. This is because we
* use these values only for deciding which tuples we must keep in the
- * tables. Since lazy vacuum doesn't write its XID anywhere, it's safe to
- * ignore it. In theory it could be problematic to ignore lazy vacuums on
+ * tables. Since lazy vacuum doesn't write its XID anywhere, it's safe to
+ * ignore it. In theory it could be problematic to ignore lazy vacuums in
* a full vacuum, but keep in mind that only one vacuum process can be
* working on a particular table at any time, and that each vacuum is
* always an independent transaction.
*/
- *oldestXmin = GetOldestXmin(sharedRel, true);
+ *oldestXmin = GetOldestXmin(rel, true);
Assert(TransactionIdIsNormal(*oldestXmin));
}
*freezeLimit = limit;
+
+ /*
+ * Determine the minimum multixact freeze age to use: as specified by
+ * caller, or vacuum_multixact_freeze_min_age, but in any case not more
+ * than half autovacuum_multixact_freeze_max_age, so that autovacuums to
+ * prevent MultiXact wraparound won't occur too frequently.
+ */
+ mxid_freezemin = multixact_freeze_min_age;
+ if (mxid_freezemin < 0)
+ mxid_freezemin = vacuum_multixact_freeze_min_age;
+ mxid_freezemin = Min(mxid_freezemin,
+ autovacuum_multixact_freeze_max_age / 2);
+ Assert(mxid_freezemin >= 0);
+
+ /* compute the cutoff multi, being careful to generate a valid value */
+ mxactLimit = GetOldestMultiXactId() - mxid_freezemin;
+ if (mxactLimit < FirstMultiXactId)
+ mxactLimit = FirstMultiXactId;
+
+ safeMxactLimit =
+ ReadNextMultiXactId() - autovacuum_multixact_freeze_max_age;
+ if (safeMxactLimit < FirstMultiXactId)
+ safeMxactLimit = FirstMultiXactId;
+
+ if (MultiXactIdPrecedes(mxactLimit, safeMxactLimit))
+ {
+ ereport(WARNING,
+ (errmsg("oldest multixact is far in the past"),
+ errhint("Close open transactions with multixacts soon to avoid wraparound problems.")));
+ mxactLimit = safeMxactLimit;
+ }
+
+ *multiXactCutoff = mxactLimit;
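+
+ /*
+ * Worked example (stock GUC defaults assumed, not taken from this file):
+ * with vacuum_multixact_freeze_min_age = 5000000 and
+ * autovacuum_multixact_freeze_max_age = 400000000, mxid_freezemin =
+ * Min(5000000, 200000000) = 5000000, so the cutoff lands 5 million
+ * multis behind GetOldestMultiXactId(), unless clamped to
+ * FirstMultiXactId or advanced to safeMxactLimit above.
+ */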
+
+ if (xidFullScanLimit != NULL)
+ {
+ int freezetable;
+
+ Assert(mxactFullScanLimit != NULL);
+
+ /*
+ * Determine the table freeze age to use: as specified by the caller,
+ * or vacuum_freeze_table_age, but in any case not more than
+ * autovacuum_freeze_max_age * 0.95, so that if you have e.g. a nightly
+ * VACUUM schedule, the nightly VACUUM gets a chance to freeze tuples
+ * before anti-wraparound autovacuum is launched.
+ */
+ freezetable = freeze_table_age;
+ if (freezetable < 0)
+ freezetable = vacuum_freeze_table_age;
+ freezetable = Min(freezetable, autovacuum_freeze_max_age * 0.95);
+ Assert(freezetable >= 0);
+
+ /*
+ * Compute XID limit causing a full-table vacuum, being careful not to
+ * generate a "permanent" XID.
+ */
+ limit = ReadNewTransactionId() - freezetable;
+ if (!TransactionIdIsNormal(limit))
+ limit = FirstNormalTransactionId;
+
+ *xidFullScanLimit = limit;
+
+ /*
+ * Similar to the above, determine the table freeze age to use for
+ * multixacts: as specified by the caller, or
+ * vacuum_multixact_freeze_table_age, but in any case not more than
+ * autovacuum_multixact_freeze_max_age * 0.95, so that if you have
+ * e.g. a nightly VACUUM schedule, the nightly VACUUM gets a chance to
+ * freeze multixacts before anti-wraparound autovacuum is launched.
+ */
+ freezetable = multixact_freeze_table_age;
+ if (freezetable < 0)
+ freezetable = vacuum_multixact_freeze_table_age;
+ freezetable = Min(freezetable,
+ autovacuum_multixact_freeze_max_age * 0.95);
+ Assert(freezetable >= 0);
+
+ /*
+ * Compute MultiXact limit causing a full-table vacuum, being careful
+ * to generate a valid MultiXact value.
+ */
+ mxactLimit = ReadNextMultiXactId() - freezetable;
+ if (mxactLimit < FirstMultiXactId)
+ mxactLimit = FirstMultiXactId;
+
+ *mxactFullScanLimit = mxactLimit;
+ }
+ else
+ {
+ Assert(mxactFullScanLimit == NULL);
+ }
+}
+
+/*
+ * vac_estimate_reltuples() -- estimate the new value for pg_class.reltuples
+ *
+ * If we scanned the whole relation then we should just use the count of
+ * live tuples seen; but if we did not, we should not trust the count
+ * unreservedly, especially not in VACUUM, which may have scanned a quite
+ * nonrandom subset of the table. When we have only partial information,
+ * we take the old value of pg_class.reltuples as a measurement of the
+ * tuple density in the unscanned pages.
+ *
+ * This routine is shared by VACUUM and ANALYZE.
+ */
+double
+vac_estimate_reltuples(Relation relation, bool is_analyze,
+ BlockNumber total_pages,
+ BlockNumber scanned_pages,
+ double scanned_tuples)
+{
+ BlockNumber old_rel_pages = relation->rd_rel->relpages;
+ double old_rel_tuples = relation->rd_rel->reltuples;
+ double old_density;
+ double new_density;
+ double multiplier;
+ double updated_density;
+
+ /* If we did scan the whole table, just use the count as-is */
+ if (scanned_pages >= total_pages)
+ return scanned_tuples;
+
+ /*
+ * If scanned_pages is zero but total_pages isn't, keep the existing value
+ * of reltuples. (Note: callers should avoid updating the pg_class
+ * statistics in this situation, since no new information has been
+ * provided.)
+ */
+ if (scanned_pages == 0)
+ return old_rel_tuples;
+
+ /*
+ * If old value of relpages is zero, old density is indeterminate; we
+ * can't do much except scale up scanned_tuples to match total_pages.
+ */
+ if (old_rel_pages == 0)
+ return floor((scanned_tuples / scanned_pages) * total_pages + 0.5);
+
+ /*
+ * Okay, we've covered the corner cases. The normal calculation is to
+ * convert the old measurement to a density (tuples per page), then update
+ * the density using an exponential-moving-average approach, and finally
+ * compute reltuples as updated_density * total_pages.
+ *
+ * For ANALYZE, the moving average multiplier is just the fraction of the
+ * table's pages we scanned. This is equivalent to assuming that the
+ * tuple density in the unscanned pages didn't change. Of course, it
+ * probably did, if the new density measurement is different. But over
+ * repeated cycles, the value of reltuples will converge towards the
+ * correct value, if repeated measurements show the same new density.
+ *
+ * For VACUUM, the situation is a bit different: we have looked at a
+ * nonrandom sample of pages, but we know for certain that the pages we
+ * didn't look at are precisely the ones that haven't changed lately.
+ * Thus, there is a reasonable argument for doing exactly the same thing
+ * as for the ANALYZE case, that is use the old density measurement as the
+ * value for the unscanned pages.
+ *
+ * This logic could probably use further refinement.
+ */
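+
+ /*
+ * Worked example (hypothetical numbers): old stats of 100 pages and
+ * 10000 tuples give old_density = 100. Scanning 20 of 120 total pages
+ * and finding 2600 tuples gives new_density = 130 and multiplier = 1/6,
+ * so updated_density = 100 + (130 - 100) / 6 = 105 and the result is
+ * floor(105 * 120 + 0.5) = 12600.
+ */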
+ old_density = old_rel_tuples / old_rel_pages;
+ new_density = scanned_tuples / scanned_pages;
+ multiplier = (double) scanned_pages / (double) total_pages;
+ updated_density = old_density + (new_density - old_density) * multiplier;
+ return floor(updated_density * total_pages + 0.5);
}
* pg_class would've been obsoleted. Of course, this only works for
* fixed-size never-null columns, but these are.
*
+ * Note another assumption: that two VACUUMs/ANALYZEs on a table can't
+ * run in parallel, nor can VACUUM/ANALYZE run in parallel with a
+ * schema alteration such as adding an index, rule, or trigger. Otherwise
+ * our updates of relhasindex etc might overwrite uncommitted updates.
+ *
* Another reason for doing it this way is that when we are in a lazy
* VACUUM and have PROC_IN_VACUUM set, we mustn't do any updates ---
* somebody vacuuming pg_class might think they could delete a tuple
* marked with xmin = our xid.
*
- * This routine is shared by full VACUUM, lazy VACUUM, and stand-alone
- * ANALYZE.
+ * This routine is shared by VACUUM and ANALYZE.
*/
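
/*
 * Example call (a sketch; the exact arguments lazy VACUUM passes live in
 * vacuumlazy.c and are an assumption here):
 *
 *	vac_update_relstats(onerel,
 *		new_rel_pages, new_rel_tuples, new_rel_allvisible,
 *		nindexes > 0, FreezeLimit, MultiXactCutoff);
 */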
void
-vac_update_relstats(Oid relid, BlockNumber num_pages, double num_tuples,
- bool hasindex, TransactionId frozenxid)
+vac_update_relstats(Relation relation,
+ BlockNumber num_pages, double num_tuples,
+ BlockNumber num_all_visible_pages,
+ bool hasindex, TransactionId frozenxid,
+ MultiXactId minmulti)
{
+ Oid relid = RelationGetRelid(relation);
Relation rd;
HeapTuple ctup;
Form_pg_class pgcform;
rd = heap_open(RelationRelationId, RowExclusiveLock);
/* Fetch a copy of the tuple to scribble on */
- ctup = SearchSysCacheCopy(RELOID,
- ObjectIdGetDatum(relid),
- 0, 0, 0);
+ ctup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid));
if (!HeapTupleIsValid(ctup))
elog(ERROR, "pg_class entry for relid %u vanished during vacuuming",
relid);
pgcform->reltuples = (float4) num_tuples;
dirty = true;
}
+ if (pgcform->relallvisible != (int32) num_all_visible_pages)
+ {
+ pgcform->relallvisible = (int32) num_all_visible_pages;
+ dirty = true;
+ }
if (pgcform->relhasindex != hasindex)
{
pgcform->relhasindex = hasindex;
/*
* If we have discovered that there are no indexes, then there's no
- * primary key either. This could be done more thoroughly...
+ * primary key either. This could be done more thoroughly...
*/
- if (!hasindex)
+ if (pgcform->relhaspkey && !hasindex)
{
- if (pgcform->relhaspkey)
- {
- pgcform->relhaspkey = false;
- dirty = true;
- }
+ pgcform->relhaspkey = false;
+ dirty = true;
+ }
+
+ /* We also clear relhasrules and relhastriggers if needed */
+ if (pgcform->relhasrules && relation->rd_rules == NULL)
+ {
+ pgcform->relhasrules = false;
+ dirty = true;
+ }
+ if (pgcform->relhastriggers && relation->trigdesc == NULL)
+ {
+ pgcform->relhastriggers = false;
+ dirty = true;
}
/*
dirty = true;
}
- /*
- * If anything changed, write out the tuple. Even if nothing changed,
- * force relcache invalidation so all backends reset their rd_targblock
- * --- otherwise it might point to a page we truncated away.
- */
- if (dirty)
+ /* relminmxid must never go backward, either */
+ if (MultiXactIdIsValid(minmulti) &&
+ MultiXactIdPrecedes(pgcform->relminmxid, minmulti))
{
- heap_inplace_update(rd, ctup);
- /* the above sends a cache inval message */
- }
- else
- {
- /* no need to change tuple, but force relcache inval anyway */
- CacheInvalidateRelcacheByTuple(ctup);
+ pgcform->relminmxid = minmulti;
+ dirty = true;
}
+ /* If anything changed, write out the tuple. */
+ if (dirty)
+ heap_inplace_update(rd, ctup);
+
heap_close(rd, RowExclusiveLock);
}
* vac_update_datfrozenxid() -- update pg_database.datfrozenxid for our DB
*
* Update pg_database's datfrozenxid entry for our database to be the
- * minimum of the pg_class.relfrozenxid values. If we are able to
- * advance pg_database.datfrozenxid, also try to truncate pg_clog.
+ * minimum of the pg_class.relfrozenxid values.
+ *
+ * Similarly, update our datminmxid to be the minimum of the
+ * pg_class.relminmxid values.
+ *
+ * If we are able to advance either pg_database value, also try to
+ * truncate pg_clog and pg_multixact.
*
* We violate transaction semantics here by overwriting the database's
- * existing pg_database tuple with the new value. This is reasonably
+ * existing pg_database tuple with the new value. This is reasonably
* safe since the new value is correct whether or not this transaction
* commits. As with vac_update_relstats, this avoids leaving dead tuples
* behind after a VACUUM.
- *
- * This routine is shared by full and lazy VACUUM.
*/
void
vac_update_datfrozenxid(void)
SysScanDesc scan;
HeapTuple classTup;
TransactionId newFrozenXid;
+ MultiXactId newMinMulti;
bool dirty = false;
/*
- * Initialize the "min" calculation with RecentGlobalXmin. Any
- * not-yet-committed pg_class entries for new tables must have
- * relfrozenxid at least this high, because any other open xact must have
- * RecentXmin >= its PGPROC.xmin >= our RecentGlobalXmin; see
- * AddNewRelationTuple(). So we cannot produce a wrong minimum by
- * starting with this.
+ * Initialize the "min" calculation with GetOldestXmin, which is a
+ * reasonable approximation to the minimum relfrozenxid for not-yet-
+ * committed pg_class entries for new tables; see AddNewRelationTuple().
+ * So we cannot produce a wrong minimum by starting with this.
+ */
+ newFrozenXid = GetOldestXmin(NULL, true);
+
+ /*
+ * Similarly, initialize the MultiXact "min" with the value that would be
+ * used on pg_class for new tables. See AddNewRelationTuple().
*/
- newFrozenXid = RecentGlobalXmin;
+ newMinMulti = GetOldestMultiXactId();
/*
* We must seqscan pg_class to find the minimum Xid, because there is no
relation = heap_open(RelationRelationId, AccessShareLock);
scan = systable_beginscan(relation, InvalidOid, false,
- SnapshotNow, 0, NULL);
+ NULL, 0, NULL);
while ((classTup = systable_getnext(scan)) != NULL)
{
Form_pg_class classForm = (Form_pg_class) GETSTRUCT(classTup);
/*
- * Only consider heap and TOAST tables (anything else should have
- * InvalidTransactionId in relfrozenxid anyway.)
+ * Only consider relations able to hold unfrozen XIDs (anything else
+ * should have InvalidTransactionId in relfrozenxid anyway.)
*/
if (classForm->relkind != RELKIND_RELATION &&
+ classForm->relkind != RELKIND_MATVIEW &&
classForm->relkind != RELKIND_TOASTVALUE)
continue;
Assert(TransactionIdIsNormal(classForm->relfrozenxid));
+ Assert(MultiXactIdIsValid(classForm->relminmxid));
if (TransactionIdPrecedes(classForm->relfrozenxid, newFrozenXid))
newFrozenXid = classForm->relfrozenxid;
+
+ if (MultiXactIdPrecedes(classForm->relminmxid, newMinMulti))
+ newMinMulti = classForm->relminmxid;
}
/* we're done with pg_class */
heap_close(relation, AccessShareLock);
Assert(TransactionIdIsNormal(newFrozenXid));
+ Assert(MultiXactIdIsValid(newMinMulti));
/* Now fetch the pg_database tuple we need to update. */
relation = heap_open(DatabaseRelationId, RowExclusiveLock);
/* Fetch a copy of the tuple to scribble on */
- tuple = SearchSysCacheCopy(DATABASEOID,
- ObjectIdGetDatum(MyDatabaseId),
- 0, 0, 0);
+ tuple = SearchSysCacheCopy1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
if (!HeapTupleIsValid(tuple))
elog(ERROR, "could not find tuple for database %u", MyDatabaseId);
dbform = (Form_pg_database) GETSTRUCT(tuple);
dirty = true;
}
+ /* ditto */
+ if (MultiXactIdPrecedes(dbform->datminmxid, newMinMulti))
+ {
+ dbform->datminmxid = newMinMulti;
+ dirty = true;
+ }
+
if (dirty)
heap_inplace_update(relation, tuple);
heap_close(relation, RowExclusiveLock);
/*
- * If we were able to advance datfrozenxid, mark the flat-file copy of
- * pg_database for update at commit, and see if we can truncate pg_clog.
+ * If we were able to advance datfrozenxid, see if we can truncate
+ * pg_clog. Also do it if the shared XID-wrap-limit info is stale, since
+ * this action will update that too.
*/
- if (dirty)
- {
- database_file_update_needed();
- vac_truncate_clog(newFrozenXid);
- }
+ if (dirty || ForceTransactionIdLimitUpdate())
+ vac_truncate_clog(newFrozenXid, newMinMulti);
}
* Also update the XID wrap limit info maintained by varsup.c.
*
* The passed XID is simply the one I just wrote into my pg_database
- * entry. It's used to initialize the "min" calculation.
+ * entry. It's used to initialize the "min" calculation.
*
- * This routine is shared by full and lazy VACUUM. Note that it's
- * only invoked when we've managed to change our DB's datfrozenxid
- * entry.
+ * This routine is only invoked when we've managed to change our
+ * DB's datfrozenxid entry, or we found that the shared XID-wrap-limit
+ * info is stale.
*/
static void
-vac_truncate_clog(TransactionId frozenXID)
+vac_truncate_clog(TransactionId frozenXID, MultiXactId minMulti)
{
TransactionId myXID = GetCurrentTransactionId();
Relation relation;
HeapScanDesc scan;
HeapTuple tuple;
- NameData oldest_datname;
+ Oid oldestxid_datoid;
+ Oid minmulti_datoid;
bool frozenAlreadyWrapped = false;
- /* init oldest_datname to sync with my frozenXID */
- namestrcpy(&oldest_datname, get_database_name(MyDatabaseId));
+ /* init oldest datoids to sync with my frozen values */
+ oldestxid_datoid = MyDatabaseId;
+ minmulti_datoid = MyDatabaseId;
/*
* Scan pg_database to compute the minimum datfrozenxid
*/
relation = heap_open(DatabaseRelationId, AccessShareLock);
- scan = heap_beginscan(relation, SnapshotNow, 0, NULL);
+ scan = heap_beginscan_catalog(relation, 0, NULL);
while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
{
Form_pg_database dbform = (Form_pg_database) GETSTRUCT(tuple);
Assert(TransactionIdIsNormal(dbform->datfrozenxid));
+ Assert(MultiXactIdIsValid(dbform->datminmxid));
if (TransactionIdPrecedes(myXID, dbform->datfrozenxid))
frozenAlreadyWrapped = true;
else if (TransactionIdPrecedes(dbform->datfrozenxid, frozenXID))
{
frozenXID = dbform->datfrozenxid;
- namecpy(&oldest_datname, &dbform->datname);
+ oldestxid_datoid = HeapTupleGetOid(tuple);
+ }
+
+ if (MultiXactIdPrecedes(dbform->datminmxid, minMulti))
+ {
+ minMulti = dbform->datminmxid;
+ minmulti_datoid = HeapTupleGetOid(tuple);
}
}
return;
}
- /* Truncate CLOG to the oldest frozenxid */
+ /*
+ * Truncate CLOG to the oldest computed value. Note we don't truncate
+ * multixacts; that will be done by the next checkpoint.
+ */
TruncateCLOG(frozenXID);
/*
- * Update the wrap limit for GetNewTransactionId. Note: this function
- * will also signal the postmaster for an(other) autovac cycle if needed.
+ * Update the wrap limit for GetNewTransactionId and creation of new
+ * MultiXactIds. Note: these functions will also signal the postmaster
+ * for an(other) autovac cycle if needed. XXX should we avoid possibly
+ * signaling twice?
*/
- SetTransactionIdLimit(frozenXID, &oldest_datname);
+ SetTransactionIdLimit(frozenXID, oldestxid_datoid);
+ SetMultiXactIdLimit(minMulti, minmulti_datoid);
}
-/****************************************************************************
- * *
- * Code common to both flavors of VACUUM *
- * *
- ****************************************************************************
- */
-
-
/*
* vacuum_rel() -- vacuum one heap relation
*
* Doing one heap at a time incurs extra overhead, since we need to
- * check that the heap exists again just before we vacuum it. The
+ * check that the heap exists again just before we vacuum it. The
* reason that we do this is so that vacuuming can be spread across
* many small transactions. Otherwise, two-phase locking would require
* us to lock the entire database during one pass of the vacuum cleaner.
*
* At entry and exit, we are not inside a transaction.
*/
-static void
-vacuum_rel(Oid relid, VacuumStmt *vacstmt, char expected_relkind,
- bool for_wraparound)
+static bool
+vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, bool for_wraparound)
{
LOCKMODE lmode;
Relation onerel;
LockRelId onerelid;
Oid toast_relid;
Oid save_userid;
- bool save_secdefcxt;
+ int save_sec_context;
+ int save_nestlevel;
/* Begin a transaction for vacuuming this relation */
StartTransactionCommand();
- if (vacstmt->full)
- {
- /* functions in indexes may want a snapshot set */
- ActiveSnapshot = CopySnapshot(GetTransactionSnapshot());
- }
- else
+ /*
+ * Functions in indexes may want a snapshot set. Also, setting a snapshot
+ * ensures that RecentGlobalXmin is kept truly recent.
+ */
+ PushActiveSnapshot(GetTransactionSnapshot());
+
+ if (!(vacstmt->options & VACOPT_FULL))
{
/*
- * During a lazy VACUUM we do not run any user-supplied functions, and
- * so it should be safe to not create a transaction snapshot.
- *
- * We can furthermore set the PROC_IN_VACUUM flag, which lets other
- * concurrent VACUUMs know that they can ignore this one while
+ * In lazy vacuum, we can set the PROC_IN_VACUUM flag, which lets
+ * other concurrent VACUUMs know that they can ignore this one while
* determining their OldestXmin. (The reason we don't set it during a
- * full VACUUM is exactly that we may have to run user- defined
+ * full VACUUM is exactly that we may have to run user-defined
* functions for functional indexes, and we want to make sure that if
* they use the snapshot set above, any tuples it requires can't get
* removed from other tables. An index function that depends on the
* contents of other tables is arguably broken, but we won't break it
* here by violating transaction semantics.)
*
- * We also set the VACUUM_FOR_WRAPAROUND flag, which is passed down
- * by autovacuum; it's used to avoid cancelling a vacuum that was
- * invoked in an emergency.
+ * We also set the VACUUM_FOR_WRAPAROUND flag, which is passed down by
+ * autovacuum; it's used to avoid canceling a vacuum that was invoked
+ * in an emergency.
*
- * Note: this flag remains set until CommitTransaction or
- * AbortTransaction. We don't want to clear it until we reset
- * MyProc->xid/xmin, else OldestXmin might appear to go backwards,
+ * Note: these flags remain set until CommitTransaction or
+ * AbortTransaction. We don't want to clear them until we reset
+ * MyPgXact->xid/xmin, else OldestXmin might appear to go backwards,
* which is probably Not Good.
*/
LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
- MyProc->vacuumFlags |= PROC_IN_VACUUM;
+ MyPgXact->vacuumFlags |= PROC_IN_VACUUM;
if (for_wraparound)
- MyProc->vacuumFlags |= PROC_VACUUM_FOR_WRAPAROUND;
+ MyPgXact->vacuumFlags |= PROC_VACUUM_FOR_WRAPAROUND;
LWLockRelease(ProcArrayLock);
}
/*
- * Check for user-requested abort. Note we want this to be inside a
+ * Check for user-requested abort. Note we want this to be inside a
* transaction, so xact.c doesn't issue useless WARNING.
*/
CHECK_FOR_INTERRUPTS();
* vacuum, but just ShareUpdateExclusiveLock for concurrent vacuum. Either
* way, we can be sure that no other backend is vacuuming the same table.
*/
- lmode = vacstmt->full ? AccessExclusiveLock : ShareUpdateExclusiveLock;
+ lmode = (vacstmt->options & VACOPT_FULL) ? AccessExclusiveLock : ShareUpdateExclusiveLock;
/*
* Open the relation and get the appropriate lock on it.
*
* There's a race condition here: the rel may have gone away since the
* last time we saw it. If so, we don't need to vacuum it.
+ *
+ * If we've been asked not to wait for the relation lock, acquire it first
+ * in non-blocking mode, before calling try_relation_open().
*/
- onerel = try_relation_open(relid, lmode);
+ if (!(vacstmt->options & VACOPT_NOWAIT))
+ onerel = try_relation_open(relid, lmode);
+ else if (ConditionalLockRelationOid(relid, lmode))
+ onerel = try_relation_open(relid, NoLock);
+ else
+ {
+ onerel = NULL;
+ if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0)
+ ereport(LOG,
+ (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
+ errmsg("skipping vacuum of \"%s\" --- lock not available",
+ vacstmt->relation->relname)));
+ }
if (!onerel)
{
+ PopActiveSnapshot();
CommitTransactionCommand();
- return;
+ return false;
}
/*
*
* We allow the user to vacuum a table if he is superuser, the table
* owner, or the database owner (but in the latter case, only if it's not
- * a shared relation). pg_class_ownercheck includes the superuser case.
+ * a shared relation). pg_class_ownercheck includes the superuser case.
*
* Note we choose to treat permissions failure as a WARNING and keep
* trying to vacuum the rest of the DB --- is this appropriate?
{
if (onerel->rd_rel->relisshared)
ereport(WARNING,
- (errmsg("skipping \"%s\" --- only superuser can vacuum it",
- RelationGetRelationName(onerel))));
+ (errmsg("skipping \"%s\" --- only superuser can vacuum it",
+ RelationGetRelationName(onerel))));
else if (onerel->rd_rel->relnamespace == PG_CATALOG_NAMESPACE)
ereport(WARNING,
(errmsg("skipping \"%s\" --- only superuser or database owner can vacuum it",
(errmsg("skipping \"%s\" --- only table or database owner can vacuum it",
RelationGetRelationName(onerel))));
relation_close(onerel, lmode);
+ PopActiveSnapshot();
CommitTransactionCommand();
- return;
+ return false;
}
/*
- * Check that it's a plain table; we used to do this in get_rel_oids() but
- * seems safer to check after we've locked the relation.
+ * Check that it's a vacuumable relation; we used to do this in
+ * get_rel_oids() but seems safer to check after we've locked the
+ * relation.
*/
- if (onerel->rd_rel->relkind != expected_relkind)
+ if (onerel->rd_rel->relkind != RELKIND_RELATION &&
+ onerel->rd_rel->relkind != RELKIND_MATVIEW &&
+ onerel->rd_rel->relkind != RELKIND_TOASTVALUE)
{
ereport(WARNING,
- (errmsg("skipping \"%s\" --- cannot vacuum indexes, views, or special system tables",
+ (errmsg("skipping \"%s\" --- cannot vacuum non-tables or special system tables",
RelationGetRelationName(onerel))));
relation_close(onerel, lmode);
+ PopActiveSnapshot();
CommitTransactionCommand();
- return;
+ return false;
}
/*
* warning here; it would just lead to chatter during a database-wide
* VACUUM.)
*/
- if (isOtherTempNamespace(RelationGetNamespace(onerel)))
+ if (RELATION_IS_OTHER_TEMP(onerel))
{
relation_close(onerel, lmode);
+ PopActiveSnapshot();
CommitTransactionCommand();
- return;
+ return false;
}
/*
LockRelationIdForSession(&onerelid, lmode);
/*
- * Remember the relation's TOAST relation for later
+ * Remember the relation's TOAST relation for later, if the caller asked
+ * us to process it. In VACUUM FULL, though, the toast table is
+ * automatically rebuilt by cluster_rel so we shouldn't recurse to it.
*/
- toast_relid = onerel->rd_rel->reltoastrelid;
+ if (do_toast && !(vacstmt->options & VACOPT_FULL))
+ toast_relid = onerel->rd_rel->reltoastrelid;
+ else
+ toast_relid = InvalidOid;
/*
- * Switch to the table owner's userid, so that any index functions are
- * run as that user. (This is unnecessary, but harmless, for lazy
- * VACUUM.)
+ * Switch to the table owner's userid, so that any index functions are run
+ * as that user. Also lock down security-restricted operations and
+ * arrange to make GUC variable changes local to this command. (This is
+ * unnecessary, but harmless, for lazy VACUUM.)
*/
- GetUserIdAndContext(&save_userid, &save_secdefcxt);
- SetUserIdAndContext(onerel->rd_rel->relowner, true);
+ GetUserIdAndSecContext(&save_userid, &save_sec_context);
+ SetUserIdAndSecContext(onerel->rd_rel->relowner,
+ save_sec_context | SECURITY_RESTRICTED_OPERATION);
+ save_nestlevel = NewGUCNestLevel();
/*
* Do the actual work --- either FULL or "lazy" vacuum
*/
- if (vacstmt->full)
- full_vacuum_rel(onerel, vacstmt);
+ if (vacstmt->options & VACOPT_FULL)
+ {
+ /* close relation before vacuuming, but hold lock until commit */
+ relation_close(onerel, NoLock);
+ onerel = NULL;
+
+ /* VACUUM FULL is now a variant of CLUSTER; see cluster.c */
+ cluster_rel(relid, InvalidOid, false,
+ (vacstmt->options & VACOPT_VERBOSE) != 0);
+ }
else
lazy_vacuum_rel(onerel, vacstmt, vac_strategy);
- /* Restore userid */
- SetUserIdAndContext(save_userid, save_secdefcxt);
+ /* Roll back any GUC changes executed by index functions */
+ AtEOXact_GUC(false, save_nestlevel);
+
+ /* Restore userid and security context */
+ SetUserIdAndSecContext(save_userid, save_sec_context);
/* all done with this class, but hold lock until commit */
- relation_close(onerel, NoLock);
+ if (onerel)
+ relation_close(onerel, NoLock);
/*
* Complete the transaction and free all temporary memory used.
*/
+ PopActiveSnapshot();
CommitTransactionCommand();
/*
* If the relation has a secondary toast rel, vacuum that too while we
* still hold the session lock on the master table. Note however that
- * "analyze" will not get done on the toast table. This is good, because
+ * "analyze" will not get done on the toast table. This is good, because
* the toaster always uses hardcoded index access and statistics are
* totally unimportant for toast relations.
*/
if (toast_relid != InvalidOid)
- vacuum_rel(toast_relid, vacstmt, RELKIND_TOASTVALUE, for_wraparound);
+ vacuum_rel(toast_relid, vacstmt, false, for_wraparound);
/*
* Now release the session-level lock on the master table.
*/
UnlockRelationIdForSession(&onerelid, lmode);
-}
-
-/****************************************************************************
- * *
- * Code for VACUUM FULL (only) *
- * *
- ****************************************************************************
- */
+ /* Report that we really did it. */
+ return true;
+}
/*
- * full_vacuum_rel() -- perform FULL VACUUM for one heap relation
- *
- * This routine vacuums a single heap, cleans out its indexes, and
- * updates its num_pages and num_tuples statistics.
+ * Open all the vacuumable indexes of the given relation, obtaining the
+ * specified kind of lock on each. Return an array of Relation pointers for
+ * the indexes into *Irel, and the number of indexes into *nindexes.
*
- * At entry, we have already established a transaction and opened
- * and locked the relation.
+ * We consider an index vacuumable if it is marked insertable (IndexIsReady).
+ * If it isn't, probably a CREATE INDEX CONCURRENTLY command failed early in
+ * execution, and what we have is too corrupt to be processable. We will
+ * vacuum even if the index isn't indisvalid; this is important because in a
+ * unique index, uniqueness checks will be performed anyway and had better not
+ * hit dangling index pointers.
*/
-static void
-full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
+void
+vac_open_indexes(Relation relation, LOCKMODE lockmode,
+ int *nindexes, Relation **Irel)
{
- VacPageListData vacuum_pages; /* List of pages to vacuum and/or
- * clean indexes */
- VacPageListData fraged_pages; /* List of pages with space enough for
- * re-using */
- Relation *Irel;
- int nindexes,
- i;
- VRelStats *vacrelstats;
-
- vacuum_set_xid_limits(vacstmt->freeze_min_age, onerel->rd_rel->relisshared,
- &OldestXmin, &FreezeLimit);
+ List *indexoidlist;
+ ListCell *indexoidscan;
+ int i;
- /*
- * Flush any previous async-commit transactions. This does not guarantee
- * that we will be able to set hint bits for tuples they inserted, but it
- * improves the probability, especially in simple sequential-commands
- * cases. See scan_heap() and repair_frag() for more about this.
- */
- XLogAsyncCommitFlush();
+ Assert(lockmode != NoLock);
- /*
- * Set up statistics-gathering machinery.
- */
- vacrelstats = (VRelStats *) palloc(sizeof(VRelStats));
- vacrelstats->rel_pages = 0;
- vacrelstats->rel_tuples = 0;
- vacrelstats->rel_indexed_tuples = 0;
- vacrelstats->hasindex = false;
-
- /* scan the heap */
- vacuum_pages.num_pages = fraged_pages.num_pages = 0;
- scan_heap(vacrelstats, onerel, &vacuum_pages, &fraged_pages);
-
- /* Now open all indexes of the relation */
- vac_open_indexes(onerel, AccessExclusiveLock, &nindexes, &Irel);
- if (nindexes > 0)
- vacrelstats->hasindex = true;
-
- /* Clean/scan index relation(s) */
- if (Irel != NULL)
- {
- if (vacuum_pages.num_pages > 0)
- {
- for (i = 0; i < nindexes; i++)
- vacuum_index(&vacuum_pages, Irel[i],
- vacrelstats->rel_indexed_tuples, 0);
- }
- else
- {
- /* just scan indexes to update statistic */
- for (i = 0; i < nindexes; i++)
- scan_index(Irel[i], vacrelstats->rel_indexed_tuples);
- }
- }
+ indexoidlist = RelationGetIndexList(relation);
- if (fraged_pages.num_pages > 0)
- {
- /* Try to shrink heap */
- repair_frag(vacrelstats, onerel, &vacuum_pages, &fraged_pages,
- nindexes, Irel);
- vac_close_indexes(nindexes, Irel, NoLock);
- }
+ /* allocate enough memory for all indexes */
+ i = list_length(indexoidlist);
+
+ if (i > 0)
+ *Irel = (Relation *) palloc(i * sizeof(Relation));
else
+ *Irel = NULL;
+
+ /* collect just the ready indexes */
+ i = 0;
+ foreach(indexoidscan, indexoidlist)
{
- vac_close_indexes(nindexes, Irel, NoLock);
- if (vacuum_pages.num_pages > 0)
- {
- /* Clean pages from vacuum_pages list */
- vacuum_heap(vacrelstats, onerel, &vacuum_pages);
- }
- }
+ Oid indexoid = lfirst_oid(indexoidscan);
+ Relation indrel;
- /* update shared free space map with final free space info */
- vac_update_fsm(onerel, &fraged_pages, vacrelstats->rel_pages);
+ indrel = index_open(indexoid, lockmode);
+ if (IndexIsReady(indrel->rd_index))
+ (*Irel)[i++] = indrel;
+ else
+ index_close(indrel, lockmode);
+ }
- /* update statistics in pg_class */
- vac_update_relstats(RelationGetRelid(onerel), vacrelstats->rel_pages,
- vacrelstats->rel_tuples, vacrelstats->hasindex,
- FreezeLimit);
+ *nindexes = i;
- /* report results to the stats collector, too */
- pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared,
- vacstmt->analyze, vacrelstats->rel_tuples);
+ list_free(indexoidlist);
}
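
/*
 * Typical usage (a sketch, assuming the caller pairs this with
 * vac_close_indexes as described below):
 *
 *	vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel);
 *	... process each Irel[i] ...
 *	vac_close_indexes(nindexes, Irel, NoLock);
 */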
-
/*
- * scan_heap() -- scan an open heap relation
- *
- * This routine sets commit status bits, constructs vacuum_pages (list
- * of pages we need to compact free space on and/or clean indexes of
- * deleted tuples), constructs fraged_pages (list of pages with free
- * space that tuples could be moved into), and calculates statistics
- * on the number of live tuples in the heap.
+ * Release the resources acquired by vac_open_indexes. Optionally release
+ * the locks (say NoLock to keep 'em).
*/
-static void
-scan_heap(VRelStats *vacrelstats, Relation onerel,
- VacPageList vacuum_pages, VacPageList fraged_pages)
-{
- BlockNumber nblocks,
- blkno;
- char *relname;
- VacPage vacpage;
- BlockNumber empty_pages,
- empty_end_pages;
- double num_tuples,
- num_indexed_tuples,
- tups_vacuumed,
- nkeep,
- nunused;
- double free_space,
- usable_free_space;
- Size min_tlen = MaxHeapTupleSize;
- Size max_tlen = 0;
- bool do_shrinking = true;
- VTupleLink vtlinks = (VTupleLink) palloc(100 * sizeof(VTupleLinkData));
- int num_vtlinks = 0;
- int free_vtlinks = 100;
- PGRUsage ru0;
-
- pg_rusage_init(&ru0);
-
- relname = RelationGetRelationName(onerel);
- ereport(elevel,
- (errmsg("vacuuming \"%s.%s\"",
- get_namespace_name(RelationGetNamespace(onerel)),
- relname)));
-
- empty_pages = empty_end_pages = 0;
- num_tuples = num_indexed_tuples = tups_vacuumed = nkeep = nunused = 0;
- free_space = 0;
-
- nblocks = RelationGetNumberOfBlocks(onerel);
-
- /*
- * We initially create each VacPage item in a maximal-sized workspace,
- * then copy the workspace into a just-large-enough copy.
- */
- vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber));
-
- for (blkno = 0; blkno < nblocks; blkno++)
- {
- Page page,
- tempPage = NULL;
- bool do_reap,
- do_frag;
- Buffer buf;
- OffsetNumber offnum,
- maxoff;
- bool notup;
- OffsetNumber frozen[MaxOffsetNumber];
- int nfrozen;
-
- vacuum_delay_point();
-
- buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy);
- page = BufferGetPage(buf);
-
- /*
- * Since we are holding exclusive lock on the relation, no other
- * backend can be accessing the page; however it is possible that the
- * background writer will try to write the page if it's already marked
- * dirty. To ensure that invalid data doesn't get written to disk, we
- * must take exclusive buffer lock wherever we potentially modify
- * pages. In fact, we insist on cleanup lock so that we can safely
- * call heap_page_prune(). (This might be overkill, since the
- * bgwriter pays no attention to individual tuples, but on the other
- * hand it's unlikely that the bgwriter has this particular page
- * pinned at this instant. So violating the coding rule would buy us
- * little anyway.)
- */
- LockBufferForCleanup(buf);
-
- vacpage->blkno = blkno;
- vacpage->offsets_used = 0;
- vacpage->offsets_free = 0;
-
- if (PageIsNew(page))
- {
- VacPage vacpagecopy;
-
- ereport(WARNING,
- (errmsg("relation \"%s\" page %u is uninitialized --- fixing",
- relname, blkno)));
- PageInit(page, BufferGetPageSize(buf), 0);
- MarkBufferDirty(buf);
- vacpage->free = PageGetFreeSpaceWithFillFactor(onerel, page);
- free_space += vacpage->free;
- empty_pages++;
- empty_end_pages++;
- vacpagecopy = copy_vac_page(vacpage);
- vpage_insert(vacuum_pages, vacpagecopy);
- vpage_insert(fraged_pages, vacpagecopy);
- UnlockReleaseBuffer(buf);
- continue;
- }
-
- if (PageIsEmpty(page))
- {
- VacPage vacpagecopy;
-
- vacpage->free = PageGetFreeSpaceWithFillFactor(onerel, page);
- free_space += vacpage->free;
- empty_pages++;
- empty_end_pages++;
- vacpagecopy = copy_vac_page(vacpage);
- vpage_insert(vacuum_pages, vacpagecopy);
- vpage_insert(fraged_pages, vacpagecopy);
- UnlockReleaseBuffer(buf);
- continue;
- }
-
- /*
- * Prune all HOT-update chains in this page.
- *
- * We use the redirect_move option so that redirecting line pointers
- * get collapsed out; this allows us to not worry about them below.
- *
- * We count tuples removed by the pruning step as removed by VACUUM.
- */
- tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin,
- true, false);
-
- /*
- * Now scan the page to collect vacuumable items and check for tuples
- * requiring freezing.
- */
- nfrozen = 0;
- notup = true;
- maxoff = PageGetMaxOffsetNumber(page);
- for (offnum = FirstOffsetNumber;
- offnum <= maxoff;
- offnum = OffsetNumberNext(offnum))
- {
- ItemId itemid = PageGetItemId(page, offnum);
- bool tupgone = false;
- HeapTupleData tuple;
-
- /*
- * Collect un-used items too - it's possible to have indexes
- * pointing here after crash. (That's an ancient comment and is
- * likely obsolete with WAL, but we might as well continue to
- * check for such problems.)
- */
- if (!ItemIdIsUsed(itemid))
- {
- vacpage->offsets[vacpage->offsets_free++] = offnum;
- nunused += 1;
- continue;
- }
-
- /*
- * DEAD item pointers are to be vacuumed normally; but we don't
- * count them in tups_vacuumed, else we'd be double-counting (at
- * least in the common case where heap_page_prune() just freed up
- * a non-HOT tuple).
- */
- if (ItemIdIsDead(itemid))
- {
- vacpage->offsets[vacpage->offsets_free++] = offnum;
- continue;
- }
-
- /* Shouldn't have any redirected items anymore */
- if (!ItemIdIsNormal(itemid))
- elog(ERROR, "relation \"%s\" TID %u/%u: unexpected redirect item",
- relname, blkno, offnum);
-
- tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
- tuple.t_len = ItemIdGetLength(itemid);
- ItemPointerSet(&(tuple.t_self), blkno, offnum);
-
- switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf))
- {
- case HEAPTUPLE_LIVE:
- /* Tuple is good --- but let's do some validity checks */
- if (onerel->rd_rel->relhasoids &&
- !OidIsValid(HeapTupleGetOid(&tuple)))
- elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid",
- relname, blkno, offnum);
-
- /*
- * The shrinkage phase of VACUUM FULL requires that all
- * live tuples have XMIN_COMMITTED set --- see comments in
- * repair_frag()'s walk-along-page loop. Use of async
- * commit may prevent HeapTupleSatisfiesVacuum from
- * setting the bit for a recently committed tuple. Rather
- * than trying to handle this corner case, we just give up
- * and don't shrink.
- */
- if (do_shrinking &&
- !(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
- {
- ereport(LOG,
- (errmsg("relation \"%s\" TID %u/%u: XMIN_COMMITTED not set for transaction %u --- cannot shrink relation",
- relname, blkno, offnum,
- HeapTupleHeaderGetXmin(tuple.t_data))));
- do_shrinking = false;
- }
- break;
- case HEAPTUPLE_DEAD:
-
- /*
- * Ordinarily, DEAD tuples would have been removed by
- * heap_page_prune(), but it's possible that the tuple
- * state changed since heap_page_prune() looked. In
- * particular an INSERT_IN_PROGRESS tuple could have
- * changed to DEAD if the inserter aborted. So this
- * cannot be considered an error condition, though it does
- * suggest that someone released a lock early.
- *
- * If the tuple is HOT-updated then it must only be
- * removed by a prune operation; so we keep it as if it
- * were RECENTLY_DEAD, and abandon shrinking. (XXX is it
- * worth trying to make the shrinking code smart enough to
- * handle this? It's an unusual corner case.)
- *
- * DEAD heap-only tuples can safely be removed if they
- * aren't themselves HOT-updated, although this is a bit
- * inefficient since we'll uselessly try to remove index
- * entries for them.
- */
- if (HeapTupleIsHotUpdated(&tuple))
- {
- nkeep += 1;
- if (do_shrinking)
- ereport(LOG,
- (errmsg("relation \"%s\" TID %u/%u: dead HOT-updated tuple --- cannot shrink relation",
- relname, blkno, offnum)));
- do_shrinking = false;
- }
- else
- {
- tupgone = true; /* we can delete the tuple */
-
- /*
- * We need not require XMIN_COMMITTED or
- * XMAX_COMMITTED to be set, since we will remove the
- * tuple without any further examination of its hint
- * bits.
- */
- }
- break;
- case HEAPTUPLE_RECENTLY_DEAD:
-
- /*
- * If the tuple was recently deleted, we must not remove it
- * from the relation.
- */
- nkeep += 1;
-
- /*
- * As with the LIVE case, shrinkage requires
- * XMIN_COMMITTED to be set.
- */
- if (do_shrinking &&
- !(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
- {
- ereport(LOG,
- (errmsg("relation \"%s\" TID %u/%u: XMIN_COMMITTED not set for transaction %u --- cannot shrink relation",
- relname, blkno, offnum,
- HeapTupleHeaderGetXmin(tuple.t_data))));
- do_shrinking = false;
- }
-
- /*
- * If we are shrinking and this tuple is an updated version,
- * remember it so that we can construct update-chain dependencies.
- */
- if (do_shrinking &&
- !(ItemPointerEquals(&(tuple.t_self),
- &(tuple.t_data->t_ctid))))
- {
- if (free_vtlinks == 0)
- {
- free_vtlinks = 1000;
- vtlinks = (VTupleLink) repalloc(vtlinks,
- (free_vtlinks + num_vtlinks) *
- sizeof(VTupleLinkData));
- }
- vtlinks[num_vtlinks].new_tid = tuple.t_data->t_ctid;
- vtlinks[num_vtlinks].this_tid = tuple.t_self;
- free_vtlinks--;
- num_vtlinks++;
- }
- break;
- case HEAPTUPLE_INSERT_IN_PROGRESS:
-
- /*
- * This should not happen, since we hold exclusive lock on
- * the relation; shouldn't we raise an error? (Actually,
- * it can happen in system catalogs, since we tend to
- * release write lock before commit there.) As above, we
- * can't apply repair_frag() if the tuple state is
- * uncertain.
- */
- if (do_shrinking)
- ereport(LOG,
- (errmsg("relation \"%s\" TID %u/%u: InsertTransactionInProgress %u --- cannot shrink relation",
- relname, blkno, offnum,
- HeapTupleHeaderGetXmin(tuple.t_data))));
- do_shrinking = false;
- break;
- case HEAPTUPLE_DELETE_IN_PROGRESS:
-
- /*
- * This should not happen, since we hold exclusive lock on
- * the relation; shouldn't we raise an error? (Actually,
- * it can happen in system catalogs, since we tend to
- * release write lock before commit there.) As above, we
- * can't apply repair_frag() if the tuple state is
- * uncertain.
- */
- if (do_shrinking)
- ereport(LOG,
- (errmsg("relation \"%s\" TID %u/%u: DeleteTransactionInProgress %u --- cannot shrink relation",
- relname, blkno, offnum,
- HeapTupleHeaderGetXmax(tuple.t_data))));
- do_shrinking = false;
- break;
- default:
- elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
- break;
- }
-
- if (tupgone)
- {
- ItemId lpp;
-
- /*
- * Here we are building a temporary copy of the page with dead
- * tuples removed. Below we will apply
- * PageRepairFragmentation to the copy, so that we can
- * determine how much space will be available after removal of
- * dead tuples. But note we are NOT changing the real page
- * yet...
- */
- if (tempPage == NULL)
- {
- Size pageSize;
-
- pageSize = PageGetPageSize(page);
- tempPage = (Page) palloc(pageSize);
- memcpy(tempPage, page, pageSize);
- }
-
- /* mark it unused on the temp page */
- lpp = PageGetItemId(tempPage, offnum);
- ItemIdSetUnused(lpp);
-
- vacpage->offsets[vacpage->offsets_free++] = offnum;
- tups_vacuumed += 1;
- }
- else
- {
- num_tuples += 1;
- if (!HeapTupleIsHeapOnly(&tuple))
- num_indexed_tuples += 1;
- notup = false;
- if (tuple.t_len < min_tlen)
- min_tlen = tuple.t_len;
- if (tuple.t_len > max_tlen)
- max_tlen = tuple.t_len;
-
- /*
- * Each non-removable tuple must be checked to see if it needs
- * freezing.
- */
- if (heap_freeze_tuple(tuple.t_data, FreezeLimit,
- InvalidBuffer))
- frozen[nfrozen++] = offnum;
- }
- } /* scan along page */
-
- if (tempPage != NULL)
- {
- /* Some tuples are removable; figure free space after removal */
- PageRepairFragmentation(tempPage);
- vacpage->free = PageGetFreeSpaceWithFillFactor(onerel, tempPage);
- pfree(tempPage);
- do_reap = true;
- }
- else
- {
- /* Just use current available space */
- vacpage->free = PageGetFreeSpaceWithFillFactor(onerel, page);
- /* Need to reap the page if it has UNUSED or DEAD line pointers */
- do_reap = (vacpage->offsets_free > 0);
- }
-
- free_space += vacpage->free;
-
- /*
- * Add the page to vacuum_pages if it requires reaping, and add it to
- * fraged_pages if it has a useful amount of free space. "Useful"
- * means enough for a minimal-sized tuple. But we don't know that
- * accurately near the start of the relation, so add pages
- * unconditionally if they have >= BLCKSZ/10 free space. Also
- * forcibly add pages with no live tuples, to avoid confusing the
- * empty_end_pages logic. (In the presence of unreasonably small
- * fillfactor, it seems possible that such pages might not pass
- * the free-space test, but they had better be in the list anyway.)
- */
- do_frag = (vacpage->free >= min_tlen || vacpage->free >= BLCKSZ / 10 ||
- notup);
-
- if (do_reap || do_frag)
- {
- VacPage vacpagecopy = copy_vac_page(vacpage);
-
- if (do_reap)
- vpage_insert(vacuum_pages, vacpagecopy);
- if (do_frag)
- vpage_insert(fraged_pages, vacpagecopy);
- }
-
- /*
- * Include the page in empty_end_pages if it will be empty after
- * vacuuming; this is to keep us from using it as a move destination.
- * Note that such pages are guaranteed to be in fraged_pages.
- */
- if (notup)
- {
- empty_pages++;
- empty_end_pages++;
- }
- else
- empty_end_pages = 0;
-
- /*
- * If we froze any tuples, mark the buffer dirty, and write a WAL
- * record recording the changes. We must log the changes to be
- * crash-safe against future truncation of CLOG.
- */
- if (nfrozen > 0)
- {
- MarkBufferDirty(buf);
- /* no XLOG for temp tables, though */
- if (!onerel->rd_istemp)
- {
- XLogRecPtr recptr;
-
- recptr = log_heap_freeze(onerel, buf, FreezeLimit,
- frozen, nfrozen);
- PageSetLSN(page, recptr);
- PageSetTLI(page, ThisTimeLineID);
- }
- }
-
- UnlockReleaseBuffer(buf);
- }
-
- pfree(vacpage);
-
- /* save stats in the rel list for use later */
- vacrelstats->rel_tuples = num_tuples;
- vacrelstats->rel_indexed_tuples = num_indexed_tuples;
- vacrelstats->rel_pages = nblocks;
- if (num_tuples == 0)
- min_tlen = max_tlen = 0;
- vacrelstats->min_tlen = min_tlen;
- vacrelstats->max_tlen = max_tlen;
-
- vacuum_pages->empty_end_pages = empty_end_pages;
- fraged_pages->empty_end_pages = empty_end_pages;
-
- /*
- * Clear the fraged_pages list if we found we couldn't shrink. Else,
- * remove any "empty" end-pages from the list, and compute usable free
- * space = free space in remaining pages.
- */
- if (do_shrinking)
- {
- int i;
-
- Assert((BlockNumber) fraged_pages->num_pages >= empty_end_pages);
- fraged_pages->num_pages -= empty_end_pages;
- usable_free_space = 0;
- for (i = 0; i < fraged_pages->num_pages; i++)
- usable_free_space += fraged_pages->pagedesc[i]->free;
- }
- else
- {
- fraged_pages->num_pages = 0;
- usable_free_space = 0;
- }
-
- /* don't bother to save vtlinks if we will not call repair_frag */
- if (fraged_pages->num_pages > 0 && num_vtlinks > 0)
- {
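- /*
- * Sort by new_tid so that repair_frag() can binary-search the
- * array to find the parent of a given tuple version.
- */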
- qsort((char *) vtlinks, num_vtlinks, sizeof(VTupleLinkData),
- vac_cmp_vtlinks);
- vacrelstats->vtlinks = vtlinks;
- vacrelstats->num_vtlinks = num_vtlinks;
- }
- else
- {
- vacrelstats->vtlinks = NULL;
- vacrelstats->num_vtlinks = 0;
- pfree(vtlinks);
- }
-
- ereport(elevel,
- (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
- RelationGetRelationName(onerel),
- tups_vacuumed, num_tuples, nblocks),
- errdetail("%.0f dead row versions cannot be removed yet.\n"
- "Nonremovable row versions range from %lu to %lu bytes long.\n"
- "There were %.0f unused item pointers.\n"
- "Total free space (including removable row versions) is %.0f bytes.\n"
- "%u pages are or will become empty, including %u at the end of the table.\n"
- "%u pages containing %.0f free bytes are potential move destinations.\n"
- "%s.",
- nkeep,
- (unsigned long) min_tlen, (unsigned long) max_tlen,
- nunused,
- free_space,
- empty_pages, empty_end_pages,
- fraged_pages->num_pages, usable_free_space,
- pg_rusage_show(&ru0))));
-}
-
-
-/*
- * repair_frag() -- try to repair relation's fragmentation
- *
- * This routine marks dead tuples as unused and tries to reclaim dead
- * space by moving tuples (and inserting index entries as needed). It
- * constructs Nvacpagelist, a list of pages freed by moving tuples off
- * them, and cleans their index entries after committing the current
- * transaction (in a hackish manner --- without releasing locks or
- * freeing memory!). It truncates the relation if trailing end-blocks
- * have become empty.
- */
-static void
-repair_frag(VRelStats *vacrelstats, Relation onerel,
- VacPageList vacuum_pages, VacPageList fraged_pages,
- int nindexes, Relation *Irel)
-{
- TransactionId myXID = GetCurrentTransactionId();
- Buffer dst_buffer = InvalidBuffer;
- BlockNumber nblocks,
- blkno;
- BlockNumber last_move_dest_block = 0,
- last_vacuum_block;
- Page dst_page = NULL;
- ExecContextData ec;
- VacPageListData Nvacpagelist;
- VacPage dst_vacpage = NULL,
- last_vacuum_page,
- vacpage,
- *curpage;
- int i;
- int num_moved = 0,
- num_fraged_pages,
- vacuumed_pages;
- int keep_tuples = 0;
- int keep_indexed_tuples = 0;
- PGRUsage ru0;
-
- pg_rusage_init(&ru0);
-
- ExecContext_Init(&ec, onerel);
-
- Nvacpagelist.num_pages = 0;
- num_fraged_pages = fraged_pages->num_pages;
- Assert((BlockNumber) vacuum_pages->num_pages >= vacuum_pages->empty_end_pages);
- vacuumed_pages = vacuum_pages->num_pages - vacuum_pages->empty_end_pages;
- if (vacuumed_pages > 0)
- {
- /* get last reaped page from vacuum_pages */
- last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1];
- last_vacuum_block = last_vacuum_page->blkno;
- }
- else
- {
- last_vacuum_page = NULL;
- last_vacuum_block = InvalidBlockNumber;
- }
-
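- /* maximal-size workspace for recording each page's to-be-freed offsets */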
- vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber));
- vacpage->offsets_used = vacpage->offsets_free = 0;
-
- /*
- * Scan pages backwards from the last nonempty page, trying to move tuples
- * down to lower pages. Quit when we reach a page that we have moved any
- * tuples onto, or the first page if we haven't moved anything, or when we
- * find a page we cannot completely empty (this last condition is handled
- * by "break" statements within the loop).
- *
- * NB: this code depends on the vacuum_pages and fraged_pages lists being
- * in order by blkno.
- */
- nblocks = vacrelstats->rel_pages;
- for (blkno = nblocks - vacuum_pages->empty_end_pages - 1;
- blkno > last_move_dest_block;
- blkno--)
- {
- Buffer buf;
- Page page;
- OffsetNumber offnum,
- maxoff;
- bool isempty,
- chain_tuple_moved;
-
- vacuum_delay_point();
-
- /*
- * Forget fraged_pages pages at or after this one; they're no longer
- * useful as move targets, since we only want to move down. Note that
- * since we stop the outer loop at last_move_dest_block, pages removed
- * here cannot have had anything moved onto them already.
- *
- * Also note that we don't change the stored fraged_pages list, only
- * our local variable num_fraged_pages; so the forgotten pages are
- * still available to be loaded into the free space map later.
- */
- while (num_fraged_pages > 0 &&
- fraged_pages->pagedesc[num_fraged_pages - 1]->blkno >= blkno)
- {
- Assert(fraged_pages->pagedesc[num_fraged_pages - 1]->offsets_used == 0);
- --num_fraged_pages;
- }
-
- /*
- * Process this page of the relation.
- */
- buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy);
- page = BufferGetPage(buf);
-
- vacpage->offsets_free = 0;
-
- isempty = PageIsEmpty(page);
-
- /* Is the page in the vacuum_pages list? */
- if (blkno == last_vacuum_block)
- {
- if (last_vacuum_page->offsets_free > 0)
- {
- /* there are dead tuples on this page - clean them */
- Assert(!isempty);
- LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
- vacuum_page(onerel, buf, last_vacuum_page);
- LockBuffer(buf, BUFFER_LOCK_UNLOCK);
- }
- else
- Assert(isempty);
- --vacuumed_pages;
- if (vacuumed_pages > 0)
- {
- /* get prev reaped page from vacuum_pages */
- last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1];
- last_vacuum_block = last_vacuum_page->blkno;
- }
- else
- {
- last_vacuum_page = NULL;
- last_vacuum_block = InvalidBlockNumber;
- }
- if (isempty)
- {
- ReleaseBuffer(buf);
- continue;
- }
- }
- else
- Assert(!isempty);
-
- chain_tuple_moved = false; /* no chain tuple has been moved
- * off this page yet */
- vacpage->blkno = blkno;
- maxoff = PageGetMaxOffsetNumber(page);
- for (offnum = FirstOffsetNumber;
- offnum <= maxoff;
- offnum = OffsetNumberNext(offnum))
- {
- Size tuple_len;
- HeapTupleData tuple;
- ItemId itemid = PageGetItemId(page, offnum);
-
- if (!ItemIdIsUsed(itemid))
- continue;
-
- if (ItemIdIsDead(itemid))
- {
- /* just remember it for vacuum_page() */
- vacpage->offsets[vacpage->offsets_free++] = offnum;
- continue;
- }
-
- /* Shouldn't have any redirected items now */
- Assert(ItemIdIsNormal(itemid));
-
- tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
- tuple_len = tuple.t_len = ItemIdGetLength(itemid);
- ItemPointerSet(&(tuple.t_self), blkno, offnum);
-
- /* ---
- * VACUUM FULL has an exclusive lock on the relation. So
- * normally no other transaction can have pending INSERTs or
- * DELETEs in this relation. A tuple is either:
- * (a) live (XMIN_COMMITTED)
- * (b) known dead (XMIN_INVALID, or XMAX_COMMITTED and xmax
- * is visible to all active transactions)
- * (c) inserted and deleted (XMIN_COMMITTED+XMAX_COMMITTED)
- * but at least one active transaction does not see the
- * deleting transaction (ie, it's RECENTLY_DEAD)
- * (d) moved by the currently running VACUUM
- * (e) inserted or deleted by a not yet committed transaction,
- * or by a transaction we couldn't set XMIN_COMMITTED for.
- * In case (e) we wouldn't be in repair_frag() at all, because
- * scan_heap() detects those cases and shuts off shrinking.
- * We can't see case (b) here either, because such tuples were
- * already removed by vacuum_page(). Cases (a) and (c) are
- * normal and will have XMIN_COMMITTED set. Case (d) is only
- * possible if a whole tuple chain has been moved while
- * processing this or a higher numbered block.
- * ---
- */
- if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
- {
- if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
- elog(ERROR, "HEAP_MOVED_IN was not expected");
- if (!(tuple.t_data->t_infomask & HEAP_MOVED_OFF))
- elog(ERROR, "HEAP_MOVED_OFF was expected");
-
- /*
- * MOVED_OFF by another VACUUM would have caused the
- * visibility check to set XMIN_COMMITTED or XMIN_INVALID.
- */
- if (HeapTupleHeaderGetXvac(tuple.t_data) != myXID)
- elog(ERROR, "invalid XVAC in tuple header");
-
- /*
- * If this (chain) tuple has already been moved by this VACUUM,
- * we have to check whether it is recorded in vacpage or not ---
- * i.e., whether it was moved while cleaning this page or some
- * previous one.
- */
-
- /* Can't we Assert(keep_tuples > 0) here? */
- if (keep_tuples == 0)
- continue;
- if (chain_tuple_moved)
- {
- /* some chains were moved while cleaning this page */
- Assert(vacpage->offsets_free > 0);
- for (i = 0; i < vacpage->offsets_free; i++)
- {
- if (vacpage->offsets[i] == offnum)
- break;
- }
- if (i >= vacpage->offsets_free) /* not found */
- {
- vacpage->offsets[vacpage->offsets_free++] = offnum;
-
- /*
- * If this is not a heap-only tuple, there must be an
- * index entry for this item which will be removed in
- * the index cleanup. Decrement the
- * keep_indexed_tuples count to remember this.
- */
- if (!HeapTupleHeaderIsHeapOnly(tuple.t_data))
- keep_indexed_tuples--;
- keep_tuples--;
- }
- }
- else
- {
- vacpage->offsets[vacpage->offsets_free++] = offnum;
-
- /*
- * If this is not a heap-only tuple, there must be an
- * index entry for this item which will be removed in the
- * index cleanup. Decrement the keep_indexed_tuples count
- * to remember this.
- */
- if (!HeapTupleHeaderIsHeapOnly(tuple.t_data))
- keep_indexed_tuples--;
- keep_tuples--;
- }
- continue;
- }
-
- /*
- * If this tuple is in a chain of tuples created in updates by
- * "recent" transactions then we have to move the whole chain of
- * tuples to other places, so that we can write new t_ctid links
- * that preserve the chain relationship.
- *
- * This test is complicated. Read it as "if tuple is a recently
- * created updated version, OR if it is an obsoleted version". (In
- * the second half of the test, we needn't make any check on XMAX
- * --- it must be recently obsoleted, else scan_heap would have
- * deemed it removable.)
- *
- * NOTE: this test is not 100% accurate: it is possible for a
- * tuple to be an updated one with recent xmin, and yet not match
- * any new_tid entry in the vtlinks list. Presumably there was
- * once a parent tuple with xmax matching the xmin, but it's
- * possible that that tuple has been removed --- for example, if
- * it had xmin = xmax and wasn't itself an updated version, then
- * HeapTupleSatisfiesVacuum would deem it removable as soon as the
- * xmin xact completes.
- *
- * To be on the safe side, we abandon the repair_frag process if
- * we cannot find the parent tuple in vtlinks. This may be overly
- * conservative; AFAICS it would be safe to move the chain.
- *
- * Also, because we distinguish DEAD and RECENTLY_DEAD tuples
- * using OldestXmin, which is a rather coarse test, it is quite
- * possible to have an update chain in which a tuple we think is
- * RECENTLY_DEAD links forward to one that is definitely DEAD. In
- * such a case the RECENTLY_DEAD tuple must actually be dead, but
- * it seems too complicated to try to make VACUUM remove it. We
- * treat each contiguous set of RECENTLY_DEAD tuples as a
- * separately movable chain, ignoring any intervening DEAD ones.
- */
- if (((tuple.t_data->t_infomask & HEAP_UPDATED) &&
- !TransactionIdPrecedes(HeapTupleHeaderGetXmin(tuple.t_data),
- OldestXmin)) ||
- (!(tuple.t_data->t_infomask & (HEAP_XMAX_INVALID |
- HEAP_IS_LOCKED)) &&
- !(ItemPointerEquals(&(tuple.t_self),
- &(tuple.t_data->t_ctid)))))
- {
- Buffer Cbuf = buf;
- bool freeCbuf = false;
- bool chain_move_failed = false;
- bool moved_target = false;
- ItemPointerData Ctid;
- HeapTupleData tp = tuple;
- Size tlen = tuple_len;
- VTupleMove vtmove;
- int num_vtmove;
- int free_vtmove;
- VacPage to_vacpage = NULL;
- int to_item = 0;
- int ti;
-
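- /* release any cached destination page; chain moves pick their own targets */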
- if (dst_buffer != InvalidBuffer)
- {
- ReleaseBuffer(dst_buffer);
- dst_buffer = InvalidBuffer;
- }
-
- /* Quick exit if we have no vtlinks to search in */
- if (vacrelstats->vtlinks == NULL)
- {
- elog(DEBUG2, "parent item in update-chain not found --- cannot continue repair_frag");
- break; /* out of walk-along-page loop */
- }
-
- /*
- * If this tuple is in the begin/middle of the chain then we
- * have to move to the end of chain. As with any t_ctid
- * chase, we have to verify that each new tuple is really the
- * descendant of the tuple we came from; however, here we need
- * even more than the normal amount of paranoia. If t_ctid
- * links forward to a tuple determined to be DEAD, then
- * depending on where that tuple is, it might already have
- * been removed, and perhaps even replaced by a MOVED_IN
- * tuple. We don't want to include any DEAD tuples in the
- * chain, so we have to recheck HeapTupleSatisfiesVacuum.
- */
- while (!(tp.t_data->t_infomask & (HEAP_XMAX_INVALID |
- HEAP_IS_LOCKED)) &&
- !(ItemPointerEquals(&(tp.t_self),
- &(tp.t_data->t_ctid))))
- {
- ItemPointerData nextTid;
- TransactionId priorXmax;
- Buffer nextBuf;
- Page nextPage;
- OffsetNumber nextOffnum;
- ItemId nextItemid;
- HeapTupleHeader nextTdata;
- HTSV_Result nextTstatus;
-
- nextTid = tp.t_data->t_ctid;
- priorXmax = HeapTupleHeaderGetXmax(tp.t_data);
- /* assume block# is OK (see heap_fetch comments) */
- nextBuf = ReadBufferWithStrategy(onerel,
- ItemPointerGetBlockNumber(&nextTid),
- vac_strategy);
- nextPage = BufferGetPage(nextBuf);
- /* If bogus or unused slot, assume tp is end of chain */
- nextOffnum = ItemPointerGetOffsetNumber(&nextTid);
- if (nextOffnum < FirstOffsetNumber ||
- nextOffnum > PageGetMaxOffsetNumber(nextPage))
- {
- ReleaseBuffer(nextBuf);
- break;
- }
- nextItemid = PageGetItemId(nextPage, nextOffnum);
- if (!ItemIdIsNormal(nextItemid))
- {
- ReleaseBuffer(nextBuf);
- break;
- }
- /* if not matching XMIN, assume tp is end of chain */
- nextTdata = (HeapTupleHeader) PageGetItem(nextPage,
- nextItemid);
- if (!TransactionIdEquals(HeapTupleHeaderGetXmin(nextTdata),
- priorXmax))
- {
- ReleaseBuffer(nextBuf);
- break;
- }
-
- /*
- * Must check for DEAD or MOVED_IN tuple, too. This could
- * potentially update hint bits, so we'd better hold the
- * buffer content lock.
- */
- LockBuffer(nextBuf, BUFFER_LOCK_SHARE);
- nextTstatus = HeapTupleSatisfiesVacuum(nextTdata,
- OldestXmin,
- nextBuf);
- if (nextTstatus == HEAPTUPLE_DEAD ||
- nextTstatus == HEAPTUPLE_INSERT_IN_PROGRESS)
- {
- UnlockReleaseBuffer(nextBuf);
- break;
- }
- LockBuffer(nextBuf, BUFFER_LOCK_UNLOCK);
- /* if it's MOVED_OFF we should have moved this one with it */
- if (nextTstatus == HEAPTUPLE_DELETE_IN_PROGRESS)
- elog(ERROR, "updated tuple is already HEAP_MOVED_OFF");
- /* OK, switch our attention to the next tuple in chain */
- tp.t_data = nextTdata;
- tp.t_self = nextTid;
- tlen = tp.t_len = ItemIdGetLength(nextItemid);
- if (freeCbuf)
- ReleaseBuffer(Cbuf);
- Cbuf = nextBuf;
- freeCbuf = true;
- }
-
- /* Set up workspace for planning the chain move */
- vtmove = (VTupleMove) palloc(100 * sizeof(VTupleMoveData));
- num_vtmove = 0;
- free_vtmove = 100;
-
- /*
- * Now, walk backwards up the chain (towards older tuples) and
- * check whether all items in the chain can be moved. We record
- * all the moves that need to be made in the vtmove array.
- */
- for (;;)
- {
- Buffer Pbuf;
- Page Ppage;
- ItemId Pitemid;
- HeapTupleHeader PTdata;
- VTupleLinkData vtld,
- *vtlp;
-
- /* Identify a target page to move this tuple to */
- if (to_vacpage == NULL ||
- !enough_space(to_vacpage, tlen))
- {
- for (i = 0; i < num_fraged_pages; i++)
- {
- if (enough_space(fraged_pages->pagedesc[i], tlen))
- break;
- }
-
- if (i == num_fraged_pages)
- {
- /* can't move item anywhere */
- chain_move_failed = true;
- break; /* out of check-all-items loop */
- }
- to_item = i;
- to_vacpage = fraged_pages->pagedesc[to_item];
- }
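- /*
- * Reserve space on the chosen target page: the tuple body itself
- * (MAXALIGN'd), plus a new line pointer if the page has no more
- * to-be-freed line pointers left to recycle.
- */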
- to_vacpage->free -= MAXALIGN(tlen);
- if (to_vacpage->offsets_used >= to_vacpage->offsets_free)
- to_vacpage->free -= sizeof(ItemIdData);
- (to_vacpage->offsets_used)++;
-
- /* Add an entry to vtmove list */
- if (free_vtmove == 0)
- {
- free_vtmove = 1000;
- vtmove = (VTupleMove)
- repalloc(vtmove,
- (free_vtmove + num_vtmove) *
- sizeof(VTupleMoveData));
- }
- vtmove[num_vtmove].tid = tp.t_self;
- vtmove[num_vtmove].vacpage = to_vacpage;
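- /* the first tuple moved onto a page requires cleaning it beforehand */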
- if (to_vacpage->offsets_used == 1)
- vtmove[num_vtmove].cleanVpd = true;
- else
- vtmove[num_vtmove].cleanVpd = false;
- free_vtmove--;
- num_vtmove++;
-
- /* Remember if we reached the original target tuple */
- if (ItemPointerGetBlockNumber(&tp.t_self) == blkno &&
- ItemPointerGetOffsetNumber(&tp.t_self) == offnum)
- moved_target = true;
-
- /* Done if at beginning of chain */
- if (!(tp.t_data->t_infomask & HEAP_UPDATED) ||
- TransactionIdPrecedes(HeapTupleHeaderGetXmin(tp.t_data),
- OldestXmin))
- break; /* out of check-all-items loop */
-
- /* Move to tuple with prior row version */
- vtld.new_tid = tp.t_self;
- vtlp = (VTupleLink)
- vac_bsearch((void *) &vtld,
- (void *) (vacrelstats->vtlinks),
- vacrelstats->num_vtlinks,
- sizeof(VTupleLinkData),
- vac_cmp_vtlinks);
- if (vtlp == NULL)
- {
- /* see discussion above */
- elog(DEBUG2, "parent item in update-chain not found --- cannot continue repair_frag");
- chain_move_failed = true;
- break; /* out of check-all-items loop */
- }
- tp.t_self = vtlp->this_tid;
- Pbuf = ReadBufferWithStrategy(onerel,
- ItemPointerGetBlockNumber(&(tp.t_self)),
- vac_strategy);
- Ppage = BufferGetPage(Pbuf);
- Pitemid = PageGetItemId(Ppage,
- ItemPointerGetOffsetNumber(&(tp.t_self)));
- /* this can't happen since we saw the tuple earlier: */
- if (!ItemIdIsNormal(Pitemid))
- elog(ERROR, "parent itemid marked as unused");
- PTdata = (HeapTupleHeader) PageGetItem(Ppage, Pitemid);
-
- /* ctid should not have changed since we saved it */
- Assert(ItemPointerEquals(&(vtld.new_tid),
- &(PTdata->t_ctid)));
-
- /*
- * See the notes above about the case where
- * !ItemIdIsUsed(nextItemid) (i.e., the child item has been
- * removed). Because we don't currently remove the useless
- * parts of an update chain, it's possible to find a
- * non-matching parent row here. As in the case that
- * originally exposed this problem, we just stop shrinking.
- * We could try to find the real parent row, but it's not
- * worth doing: a proper solution will be implemented later
- * anyway, and we are too close to the 6.5 release.
- * - vadim 06/11/99
- */
- if ((PTdata->t_infomask & HEAP_XMAX_IS_MULTI) ||
- !(TransactionIdEquals(HeapTupleHeaderGetXmax(PTdata),
- HeapTupleHeaderGetXmin(tp.t_data))))
- {
- ReleaseBuffer(Pbuf);
- elog(DEBUG2, "too old parent tuple found --- cannot continue repair_frag");
- chain_move_failed = true;
- break; /* out of check-all-items loop */
- }
- tp.t_data = PTdata;
- tlen = tp.t_len = ItemIdGetLength(Pitemid);
- if (freeCbuf)
- ReleaseBuffer(Cbuf);
- Cbuf = Pbuf;
- freeCbuf = true;
- } /* end of check-all-items loop */
-
- if (freeCbuf)
- ReleaseBuffer(Cbuf);
- freeCbuf = false;
-
- /* Double-check that we will move the current target tuple */
- if (!moved_target && !chain_move_failed)
- {
- elog(DEBUG2, "failed to chain back to target --- cannot continue repair_frag");
- chain_move_failed = true;
- }
-
- if (chain_move_failed)
- {
- /*
- * Undo changes to offsets_used state. We don't bother
- * cleaning up the amount-free state, since we're not
- * going to do any further tuple motion.
- */
- for (i = 0; i < num_vtmove; i++)
- {
- Assert(vtmove[i].vacpage->offsets_used > 0);
- (vtmove[i].vacpage->offsets_used)--;
- }
- pfree(vtmove);
- break; /* out of walk-along-page loop */
- }
-
- /*
- * Okay, move the whole tuple chain in reverse order.
- *
- * Ctid tracks the new location of the previously-moved tuple.
- */
- ItemPointerSetInvalid(&Ctid);
- for (ti = 0; ti < num_vtmove; ti++)
- {
- VacPage destvacpage = vtmove[ti].vacpage;
- Page Cpage;
- ItemId Citemid;
-
- /* Get page to move from */
- tuple.t_self = vtmove[ti].tid;
- Cbuf = ReadBufferWithStrategy(onerel,
- ItemPointerGetBlockNumber(&(tuple.t_self)),
- vac_strategy);
-
- /* Get page to move to */
- dst_buffer = ReadBufferWithStrategy(onerel,
- destvacpage->blkno,
- vac_strategy);
-
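- /* lock both buffers (only one lock if it is an intra-page move) */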
- LockBuffer(dst_buffer, BUFFER_LOCK_EXCLUSIVE);
- if (dst_buffer != Cbuf)
- LockBuffer(Cbuf, BUFFER_LOCK_EXCLUSIVE);
-
- dst_page = BufferGetPage(dst_buffer);
- Cpage = BufferGetPage(Cbuf);
-
- Citemid = PageGetItemId(Cpage,
- ItemPointerGetOffsetNumber(&(tuple.t_self)));
- tuple.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
- tuple_len = tuple.t_len = ItemIdGetLength(Citemid);
-
- move_chain_tuple(onerel, Cbuf, Cpage, &tuple,
- dst_buffer, dst_page, destvacpage,
- &ec, &Ctid, vtmove[ti].cleanVpd);
-
- /*
- * If the tuple we are moving is a heap-only tuple, this
- * move will generate an additional index entry, so
- * increment the rel_indexed_tuples count.
- */
- if (HeapTupleHeaderIsHeapOnly(tuple.t_data))
- vacrelstats->rel_indexed_tuples++;
-
- num_moved++;
- if (destvacpage->blkno > last_move_dest_block)
- last_move_dest_block = destvacpage->blkno;
-
- /*
- * Remember that we moved a tuple from the current page
- * (the corresponding index entry will be cleaned up).
- */
- if (Cbuf == buf)
- vacpage->offsets[vacpage->offsets_free++] =
- ItemPointerGetOffsetNumber(&(tuple.t_self));
- else
- {
- /*
- * When we move tuple chains, we may need to move
- * tuples from a block that we haven't yet scanned in
- * the outer walk-along-the-relation loop. Note that
- * we can't be moving a tuple from a block that we
- * have already scanned because if such a tuple
- * exists, then we must have moved the chain along
- * with that tuple when we scanned that block. IOW the
- * test of (Cbuf != buf) guarantees that the tuple we
- * are looking at right now is in a block which is yet
- * to be scanned.
- *
- * We maintain two counters to correctly count the
- * moved-off tuples from blocks that are not yet
- * scanned (keep_tuples) and how many of them have
- * index pointers (keep_indexed_tuples). The main
- * reason to track the latter is to help verify that
- * indexes have the expected number of entries when
- * all the dust settles.
- */
- if (!HeapTupleHeaderIsHeapOnly(tuple.t_data))
- keep_indexed_tuples++;
- keep_tuples++;
- }
-
- ReleaseBuffer(dst_buffer);
- ReleaseBuffer(Cbuf);
- } /* end of move-the-tuple-chain loop */
-
- dst_buffer = InvalidBuffer;
- pfree(vtmove);
- chain_tuple_moved = true;
-
- /* advance to next tuple in walk-along-page loop */
- continue;
- } /* end of is-tuple-in-chain test */
-
- /* try to find a new page for this tuple */
- if (dst_buffer == InvalidBuffer ||
- !enough_space(dst_vacpage, tuple_len))
- {
- if (dst_buffer != InvalidBuffer)
- {
- ReleaseBuffer(dst_buffer);
- dst_buffer = InvalidBuffer;
- }
- for (i = 0; i < num_fraged_pages; i++)
- {
- if (enough_space(fraged_pages->pagedesc[i], tuple_len))
- break;
- }
- if (i == num_fraged_pages)
- break; /* can't move item anywhere */
- dst_vacpage = fraged_pages->pagedesc[i];
- dst_buffer = ReadBufferWithStrategy(onerel,
- dst_vacpage->blkno,
- vac_strategy);
- LockBuffer(dst_buffer, BUFFER_LOCK_EXCLUSIVE);
- dst_page = BufferGetPage(dst_buffer);
- /* if this page has not been used as a move target yet, clean it first */
- if (!PageIsEmpty(dst_page) && dst_vacpage->offsets_used == 0)
- vacuum_page(onerel, dst_buffer, dst_vacpage);
- }
- else
- LockBuffer(dst_buffer, BUFFER_LOCK_EXCLUSIVE);
-
- LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
-
- move_plain_tuple(onerel, buf, page, &tuple,
- dst_buffer, dst_page, dst_vacpage, &ec);
-
- /*
- * If the tuple we are moving is a heap-only tuple, this move will
- * generate an additional index entry, so increment the
- * rel_indexed_tuples count.
- */
- if (HeapTupleHeaderIsHeapOnly(tuple.t_data))
- vacrelstats->rel_indexed_tuples++;
-
- num_moved++;
- if (dst_vacpage->blkno > last_move_dest_block)
- last_move_dest_block = dst_vacpage->blkno;
-
- /*
- * Remember that we moved a tuple from the current page
- * (the corresponding index entry will be cleaned up).
- */
- vacpage->offsets[vacpage->offsets_free++] = offnum;
- } /* walk along page */
-
- /*
- * If we broke out of the walk-along-page loop early (ie, still have
- * offnum <= maxoff), then we failed to move some tuple off this page.
- * No point in shrinking any more, so clean up and exit the per-page
- * loop.
- */
- if (offnum < maxoff && keep_tuples > 0)
- {
- OffsetNumber off;
-
- /*
- * Fix vacpage state for any unvisited tuples remaining on page
- */
- for (off = OffsetNumberNext(offnum);
- off <= maxoff;
- off = OffsetNumberNext(off))
- {
- ItemId itemid = PageGetItemId(page, off);
- HeapTupleHeader htup;
-
- if (!ItemIdIsUsed(itemid))
- continue;
- /* Shouldn't be any DEAD or REDIRECT items anymore */
- Assert(ItemIdIsNormal(itemid));
-
- htup = (HeapTupleHeader) PageGetItem(page, itemid);
- if (htup->t_infomask & HEAP_XMIN_COMMITTED)
- continue;
-
- /*
- * See comments in the walk-along-page loop above about why
- * only MOVED_OFF tuples should be found here.
- */
- if (htup->t_infomask & HEAP_MOVED_IN)
- elog(ERROR, "HEAP_MOVED_IN was not expected");
- if (!(htup->t_infomask & HEAP_MOVED_OFF))
- elog(ERROR, "HEAP_MOVED_OFF was expected");
- if (HeapTupleHeaderGetXvac(htup) != myXID)
- elog(ERROR, "invalid XVAC in tuple header");
-
- if (chain_tuple_moved)
- {
- /* some chains were moved while cleaning this page */
- Assert(vacpage->offsets_free > 0);
- for (i = 0; i < vacpage->offsets_free; i++)
- {
- if (vacpage->offsets[i] == off)
- break;
- }
- if (i >= vacpage->offsets_free) /* not found */
- {
- vacpage->offsets[vacpage->offsets_free++] = off;
- Assert(keep_tuples > 0);
-
- /*
- * If this is not a heap-only tuple, there must be an
- * index entry for this item which will be removed in
- * the index cleanup. Decrement the
- * keep_indexed_tuples count to remember this.
- */
- if (!HeapTupleHeaderIsHeapOnly(htup))
- keep_indexed_tuples--;
- keep_tuples--;
- }
- }
- else
- {
- vacpage->offsets[vacpage->offsets_free++] = off;
- Assert(keep_tuples > 0);
- if (!HeapTupleHeaderIsHeapOnly(htup))
- keep_indexed_tuples--;
- keep_tuples--;
- }
- }
- }
-
- if (vacpage->offsets_free > 0) /* some tuples were moved */
- {
- if (chain_tuple_moved) /* else they are already in order */
- {
- qsort((char *) (vacpage->offsets), vacpage->offsets_free,
- sizeof(OffsetNumber), vac_cmp_offno);
- }
- vpage_insert(&Nvacpagelist, copy_vac_page(vacpage));
- }
-
- ReleaseBuffer(buf);
-
- if (offnum <= maxoff)
- break; /* had to quit early, see above note */
-
- } /* walk along relation */
-
- blkno++; /* new number of blocks */
-
- if (dst_buffer != InvalidBuffer)
- {
- Assert(num_moved > 0);
- ReleaseBuffer(dst_buffer);
- }
-
- if (num_moved > 0)
- {
- /*
- * We have to commit our tuple movings before we truncate the
- * relation. Ideally we should do Commit/StartTransactionCommand
- * here, relying on the session-level table lock to protect our
- * exclusive access to the relation. However, that would require a
- * lot of extra code to close and re-open the relation, indexes, etc.
- * For now, a quick hack: record status of current transaction as
- * committed, and continue. We force the commit to be synchronous so
- * that it's down to disk before we truncate. (Note: tqual.c knows
- * that VACUUM FULL always uses sync commit, too.) The transaction
- * continues to be shown as running in the ProcArray.
- *
- * XXX This desperately needs to be revisited. Any failure after this
- * point will result in a PANIC "cannot abort transaction nnn, it was
- * already committed"!
- */
- ForceSyncCommit();
- (void) RecordTransactionCommit();
- }
-
- /*
- * We are not going to move any more tuples across pages, but we still
- * need to apply vacuum_page to compact free space in the remaining pages
- * in vacuum_pages list. Note that some of these pages may also be in the
- * fraged_pages list, and may have had tuples moved onto them; if so, we
- * already did vacuum_page and needn't do it again.
- */
- for (i = 0, curpage = vacuum_pages->pagedesc;
- i < vacuumed_pages;
- i++, curpage++)
- {
- vacuum_delay_point();
-
- Assert((*curpage)->blkno < blkno);
- if ((*curpage)->offsets_used == 0)
- {
- Buffer buf;
- Page page;
-
- /* this page was not used as a move target, so we must clean it */
- buf = ReadBufferWithStrategy(onerel,
- (*curpage)->blkno,
- vac_strategy);
- LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
- page = BufferGetPage(buf);
- if (!PageIsEmpty(page))
- vacuum_page(onerel, buf, *curpage);
- UnlockReleaseBuffer(buf);
- }
- }
-
- /*
- * Now scan all the pages that we moved tuples onto and update tuple
- * status bits. This is not really necessary, but will save time for
- * future transactions examining these tuples.
- */
- update_hint_bits(onerel, fraged_pages, num_fraged_pages,
- last_move_dest_block, num_moved);
-
- /*
- * It'd be cleaner to make this report at the bottom of this routine, but
- * then the rusage would double-count the second pass of index vacuuming.
- * So do it here and ignore the relatively small amount of processing that
- * occurs below.
- */
- ereport(elevel,
- (errmsg("\"%s\": moved %u row versions, truncated %u to %u pages",
- RelationGetRelationName(onerel),
- num_moved, nblocks, blkno),
- errdetail("%s.",
- pg_rusage_show(&ru0))));
-
- /*
- * Reflect the motion of system tuples in the catalog caches here.
- */
- CommandCounterIncrement();
-
- if (Nvacpagelist.num_pages > 0)
- {
- /* vacuum indexes again if needed */
- if (Irel != NULL)
- {
- VacPage *vpleft,
- *vpright,
- vpsave;
-
- /* reverse Nvacpagelist.pagedesc into ascending block-number order */
- for (vpleft = Nvacpagelist.pagedesc,
- vpright = Nvacpagelist.pagedesc + Nvacpagelist.num_pages - 1;
- vpleft < vpright; vpleft++, vpright--)
- {
- vpsave = *vpleft;
- *vpleft = *vpright;
- *vpright = vpsave;
- }
-
- /*
- * keep_tuples is the number of tuples that have been moved off a
- * page during chain moves but not been scanned over subsequently.
- * The tuple ids of these tuples are not recorded as free offsets
- * for any VacPage, so they will not be cleared from the indexes.
- * keep_indexed_tuples is the portion of these that are expected
- * to have index entries.
- */
- Assert(keep_tuples >= 0);
- for (i = 0; i < nindexes; i++)
- vacuum_index(&Nvacpagelist, Irel[i],
- vacrelstats->rel_indexed_tuples,
- keep_indexed_tuples);
- }
-
- /*
- * Clean moved-off tuples from last page in Nvacpagelist list.
- *
- * We need only do this in this one page, because higher-numbered
- * pages are going to be truncated from the relation entirely. But see
- * comments for update_hint_bits().
- */
- if (vacpage->blkno == (blkno - 1) &&
- vacpage->offsets_free > 0)
- {
- Buffer buf;
- Page page;
- OffsetNumber unused[MaxOffsetNumber];
- OffsetNumber offnum,
- maxoff;
- int uncnt = 0;
- int num_tuples = 0;
-
- buf = ReadBufferWithStrategy(onerel, vacpage->blkno, vac_strategy);
- LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
- page = BufferGetPage(buf);
- maxoff = PageGetMaxOffsetNumber(page);
- for (offnum = FirstOffsetNumber;
- offnum <= maxoff;
- offnum = OffsetNumberNext(offnum))
- {
- ItemId itemid = PageGetItemId(page, offnum);
- HeapTupleHeader htup;
-
- if (!ItemIdIsUsed(itemid))
- continue;
- /* Shouldn't be any DEAD or REDIRECT items anymore */
- Assert(ItemIdIsNormal(itemid));
-
- htup = (HeapTupleHeader) PageGetItem(page, itemid);
- if (htup->t_infomask & HEAP_XMIN_COMMITTED)
- continue;
-
- /*
- * See comments in the walk-along-page loop above about why
- * only MOVED_OFF tuples should be found here.
- */
- if (htup->t_infomask & HEAP_MOVED_IN)
- elog(ERROR, "HEAP_MOVED_IN was not expected");
- if (!(htup->t_infomask & HEAP_MOVED_OFF))
- elog(ERROR, "HEAP_MOVED_OFF was expected");
- if (HeapTupleHeaderGetXvac(htup) != myXID)
- elog(ERROR, "invalid XVAC in tuple header");
-
- ItemIdSetUnused(itemid);
- num_tuples++;
-
- unused[uncnt++] = offnum;
- }
- Assert(vacpage->offsets_free == num_tuples);
-
- START_CRIT_SECTION();
-
- PageRepairFragmentation(page);
-
- MarkBufferDirty(buf);
-
- /* XLOG stuff */
- if (!onerel->rd_istemp)
- {
- XLogRecPtr recptr;
-
- recptr = log_heap_clean(onerel, buf,
- NULL, 0, NULL, 0,
- unused, uncnt,
- false);
- PageSetLSN(page, recptr);
- PageSetTLI(page, ThisTimeLineID);
- }
-
- END_CRIT_SECTION();
-
- UnlockReleaseBuffer(buf);
- }
-
- /* now free the new list of reaped pages */
- curpage = Nvacpagelist.pagedesc;
- for (i = 0; i < Nvacpagelist.num_pages; i++, curpage++)
- pfree(*curpage);
- pfree(Nvacpagelist.pagedesc);
- }
-
- /* Truncate relation, if needed */
- if (blkno < nblocks)
- {
- RelationTruncate(onerel, blkno);
- vacrelstats->rel_pages = blkno; /* set new number of blocks */
- }
-
- /* clean up */
- pfree(vacpage);
- if (vacrelstats->vtlinks != NULL)
- pfree(vacrelstats->vtlinks);
-
- ExecContext_Finish(&ec);
-}
-
-/*
- * move_chain_tuple() -- move one tuple that is part of a tuple chain
- *
- * This routine moves old_tup from old_page to dst_page.
- * old_page and dst_page might be the same page.
- * On entry old_buf and dst_buf are locked exclusively, both locks (or
- * the single lock, if this is an intra-page move) are released before
- * exit.
- *
- * Yes, a routine with ten parameters is ugly, but it's still better
- * than having these 120 lines of code in repair_frag(), which is
- * already too long and almost unreadable.
- */
-static void
-move_chain_tuple(Relation rel,
- Buffer old_buf, Page old_page, HeapTuple old_tup,
- Buffer dst_buf, Page dst_page, VacPage dst_vacpage,
- ExecContext ec, ItemPointer ctid, bool cleanVpd)
-{
- TransactionId myXID = GetCurrentTransactionId();
- HeapTupleData newtup;
- OffsetNumber newoff;
- ItemId newitemid;
- Size tuple_len = old_tup->t_len;
-
- /*
- * make a modifiable copy of the source tuple.
- */
- heap_copytuple_with_tuple(old_tup, &newtup);
-
- /*
- * register invalidation of source tuple in catcaches.
- */
- CacheInvalidateHeapTuple(rel, old_tup);
-
- /* NO EREPORT(ERROR) TILL CHANGES ARE LOGGED */
- START_CRIT_SECTION();
-
- /*
- * mark the source tuple MOVED_OFF.
- */
- old_tup->t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
- HEAP_XMIN_INVALID |
- HEAP_MOVED_IN);
- old_tup->t_data->t_infomask |= HEAP_MOVED_OFF;
- HeapTupleHeaderSetXvac(old_tup->t_data, myXID);
-
- /*
- * If this page has not been used as a move target before, clean it.
- *
- * NOTE: a nasty bug used to lurk here. It is possible for the source and
- * destination pages to be the same (since this tuple-chain member can be
- * on a page lower than the one we're currently processing in the outer
- * loop). If that's true, then after vacuum_page() the source tuple will
- * have been moved, and tuple.t_data will be pointing at garbage.
- * Therefore we must do everything that uses old_tup->t_data BEFORE this
- * step!!
- *
- * This path is different from the other callers of vacuum_page, because
- * we have already incremented the vacpage's offsets_used field to account
- * for the tuple(s) we expect to move onto the page. Therefore
- * vacuum_page's check for offsets_used == 0 is wrong. But since that's a
- * good debugging check for all other callers, we work around it here
- * rather than remove it.
- */
- if (!PageIsEmpty(dst_page) && cleanVpd)
- {
- int sv_offsets_used = dst_vacpage->offsets_used;
-
- dst_vacpage->offsets_used = 0;
- vacuum_page(rel, dst_buf, dst_vacpage);
- dst_vacpage->offsets_used = sv_offsets_used;
- }
-
- /*
- * Update the state of the copied tuple, and store it on the destination
- * page. The copied tuple is never part of a HOT chain.
- */
- newtup.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
- HEAP_XMIN_INVALID |
- HEAP_MOVED_OFF);
- newtup.t_data->t_infomask |= HEAP_MOVED_IN;
- HeapTupleHeaderClearHotUpdated(newtup.t_data);
- HeapTupleHeaderClearHeapOnly(newtup.t_data);
- HeapTupleHeaderSetXvac(newtup.t_data, myXID);
- newoff = PageAddItem(dst_page, (Item) newtup.t_data, tuple_len,
- InvalidOffsetNumber, false, true);
- if (newoff == InvalidOffsetNumber)
- elog(PANIC, "failed to add item with len = %lu to page %u while moving tuple chain",
- (unsigned long) tuple_len, dst_vacpage->blkno);
- newitemid = PageGetItemId(dst_page, newoff);
- /* drop temporary copy, and point to the version on the dest page */
- pfree(newtup.t_data);
- newtup.t_data = (HeapTupleHeader) PageGetItem(dst_page, newitemid);
-
- ItemPointerSet(&(newtup.t_self), dst_vacpage->blkno, newoff);
-
- /*
- * Set new tuple's t_ctid pointing to itself if last tuple in chain, and
- * to next tuple in chain otherwise. (Since we move the chain in reverse
- * order, this is actually the previously processed tuple.)
- */
- if (!ItemPointerIsValid(ctid))
- newtup.t_data->t_ctid = newtup.t_self;
- else
- newtup.t_data->t_ctid = *ctid;
- *ctid = newtup.t_self;
-
- MarkBufferDirty(dst_buf);
- if (dst_buf != old_buf)
- MarkBufferDirty(old_buf);
-
- /* XLOG stuff */
- if (!rel->rd_istemp)
- {
- XLogRecPtr recptr = log_heap_move(rel, old_buf, old_tup->t_self,
- dst_buf, &newtup);
-
- if (old_buf != dst_buf)
- {
- PageSetLSN(old_page, recptr);
- PageSetTLI(old_page, ThisTimeLineID);
- }
- PageSetLSN(dst_page, recptr);
- PageSetTLI(dst_page, ThisTimeLineID);
- }
-
- END_CRIT_SECTION();
-
- LockBuffer(dst_buf, BUFFER_LOCK_UNLOCK);
- if (dst_buf != old_buf)
- LockBuffer(old_buf, BUFFER_LOCK_UNLOCK);
-
- /* Create index entries for the moved tuple */
- if (ec->resultRelInfo->ri_NumIndices > 0)
- {
- ExecStoreTuple(&newtup, ec->slot, InvalidBuffer, false);
- ExecInsertIndexTuples(ec->slot, &(newtup.t_self), ec->estate, true);
- ResetPerTupleExprContext(ec->estate);
- }
-}
-
-/*
- * move_plain_tuple() -- move one tuple that is not part of a chain
- *
- * This routine moves old_tup from old_page to dst_page.
- * On entry old_buf and dst_buf are locked exclusively, both locks are
- * released before exit.
- *
- * Yes, a routine with eight parameters is ugly, but it's still better
- * than having these 90 lines of code in repair_frag(), which is already
- * too long and almost unreadable.
- */
-static void
-move_plain_tuple(Relation rel,
- Buffer old_buf, Page old_page, HeapTuple old_tup,
- Buffer dst_buf, Page dst_page, VacPage dst_vacpage,
- ExecContext ec)
-{
- TransactionId myXID = GetCurrentTransactionId();
- HeapTupleData newtup;
- OffsetNumber newoff;
- ItemId newitemid;
- Size tuple_len = old_tup->t_len;
-
- /* copy tuple */
- heap_copytuple_with_tuple(old_tup, &newtup);
-
- /*
- * register invalidation of source tuple in catcaches.
- *
- * (Note: we do not need to register the copied tuple, because we are not
- * changing the tuple contents and so there cannot be any need to flush
- * negative catcache entries.)
- */
- CacheInvalidateHeapTuple(rel, old_tup);
-
- /* NO EREPORT(ERROR) TILL CHANGES ARE LOGGED */
- START_CRIT_SECTION();
-
- /*
- * Mark new tuple as MOVED_IN by me; also mark it not HOT.
- */
- newtup.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
- HEAP_XMIN_INVALID |
- HEAP_MOVED_OFF);
- newtup.t_data->t_infomask |= HEAP_MOVED_IN;
- HeapTupleHeaderClearHotUpdated(newtup.t_data);
- HeapTupleHeaderClearHeapOnly(newtup.t_data);
- HeapTupleHeaderSetXvac(newtup.t_data, myXID);
-
- /* add tuple to the page */
- newoff = PageAddItem(dst_page, (Item) newtup.t_data, tuple_len,
- InvalidOffsetNumber, false, true);
- if (newoff == InvalidOffsetNumber)
- elog(PANIC, "failed to add item with len = %lu to page %u (free space %lu, nusd %u, noff %u)",
- (unsigned long) tuple_len,
- dst_vacpage->blkno, (unsigned long) dst_vacpage->free,
- dst_vacpage->offsets_used, dst_vacpage->offsets_free);
- newitemid = PageGetItemId(dst_page, newoff);
- pfree(newtup.t_data);
- newtup.t_data = (HeapTupleHeader) PageGetItem(dst_page, newitemid);
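- /* not part of an update chain, so t_ctid just points at the new self */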
- ItemPointerSet(&(newtup.t_data->t_ctid), dst_vacpage->blkno, newoff);
- newtup.t_self = newtup.t_data->t_ctid;
-
- /*
- * Mark old tuple as MOVED_OFF by me.
- */
- old_tup->t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
- HEAP_XMIN_INVALID |
- HEAP_MOVED_IN);
- old_tup->t_data->t_infomask |= HEAP_MOVED_OFF;
- HeapTupleHeaderSetXvac(old_tup->t_data, myXID);
-
- MarkBufferDirty(dst_buf);
- MarkBufferDirty(old_buf);
-
- /* XLOG stuff */
- if (!rel->rd_istemp)
- {
- XLogRecPtr recptr = log_heap_move(rel, old_buf, old_tup->t_self,
- dst_buf, &newtup);
-
- PageSetLSN(old_page, recptr);
- PageSetTLI(old_page, ThisTimeLineID);
- PageSetLSN(dst_page, recptr);
- PageSetTLI(dst_page, ThisTimeLineID);
- }
-
- END_CRIT_SECTION();
-
- dst_vacpage->free = PageGetFreeSpaceWithFillFactor(rel, dst_page);
- LockBuffer(dst_buf, BUFFER_LOCK_UNLOCK);
- LockBuffer(old_buf, BUFFER_LOCK_UNLOCK);
-
- dst_vacpage->offsets_used++;
-
- /* insert index entries if needed */
- if (ec->resultRelInfo->ri_NumIndices > 0)
- {
- ExecStoreTuple(&newtup, ec->slot, InvalidBuffer, false);
- ExecInsertIndexTuples(ec->slot, &(newtup.t_self), ec->estate, true);
- ResetPerTupleExprContext(ec->estate);
- }
-}
-
-/*
- * update_hint_bits() -- update hint bits in destination pages
- *
- * Scan all the pages that we moved tuples onto and update tuple status bits.
- * This is not really necessary, but it will save time for future transactions
- * examining these tuples.
- *
- * This pass guarantees that all HEAP_MOVED_IN tuples are marked as
- * XMIN_COMMITTED, so that future tqual tests won't need to check their XVAC.
- *
- * BUT NOTICE that this code fails to clear HEAP_MOVED_OFF tuples from
- * pages that were move source pages but not move dest pages. The bulk
- * of the move source pages will be physically truncated from the relation,
- * and the last page remaining in the rel will be fixed separately in
- * repair_frag(), so the only cases where a MOVED_OFF tuple won't get its
- * hint bits updated are tuples that are moved as part of a chain and were
- * on pages that were neither move destinations nor at the end of the rel.
- * To completely ensure that no MOVED_OFF tuples remain unmarked, we'd have
- * to remember and revisit those pages too.
- *
- * One wonders whether it wouldn't be better to skip this work entirely,
- * and let the tuple status updates happen someplace that's not holding an
- * exclusive lock on the relation.
- */
-static void
-update_hint_bits(Relation rel, VacPageList fraged_pages, int num_fraged_pages,
- BlockNumber last_move_dest_block, int num_moved)
-{
- TransactionId myXID = GetCurrentTransactionId();
- int checked_moved = 0;
- int i;
- VacPage *curpage;
-
- for (i = 0, curpage = fraged_pages->pagedesc;
- i < num_fraged_pages;
- i++, curpage++)
- {
- Buffer buf;
- Page page;
- OffsetNumber max_offset;
- OffsetNumber off;
- int num_tuples = 0;
-
- vacuum_delay_point();
-
- if ((*curpage)->blkno > last_move_dest_block)
- break; /* no need to scan any further */
- if ((*curpage)->offsets_used == 0)
- continue; /* this page was never used as a move dest */
- buf = ReadBufferWithStrategy(rel, (*curpage)->blkno, vac_strategy);
- LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
- page = BufferGetPage(buf);
- max_offset = PageGetMaxOffsetNumber(page);
- for (off = FirstOffsetNumber;
- off <= max_offset;
- off = OffsetNumberNext(off))
- {
- ItemId itemid = PageGetItemId(page, off);
- HeapTupleHeader htup;
-
- if (!ItemIdIsUsed(itemid))
- continue;
- /* Shouldn't be any DEAD or REDIRECT items anymore */
- Assert(ItemIdIsNormal(itemid));
-
- htup = (HeapTupleHeader) PageGetItem(page, itemid);
- if (htup->t_infomask & HEAP_XMIN_COMMITTED)
- continue;
-
- /*
- * Here we may see either MOVED_OFF or MOVED_IN tuples.
- */
- if (!(htup->t_infomask & HEAP_MOVED))
- elog(ERROR, "HEAP_MOVED_OFF/HEAP_MOVED_IN was expected");
- if (HeapTupleHeaderGetXvac(htup) != myXID)
- elog(ERROR, "invalid XVAC in tuple header");
-
- if (htup->t_infomask & HEAP_MOVED_IN)
- {
- htup->t_infomask |= HEAP_XMIN_COMMITTED;
- htup->t_infomask &= ~HEAP_MOVED;
- num_tuples++;
- }
- else
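- /* MOVED_OFF: the old version is dead once this VACUUM commits */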
- htup->t_infomask |= HEAP_XMIN_INVALID;
- }
- MarkBufferDirty(buf);
- UnlockReleaseBuffer(buf);
- Assert((*curpage)->offsets_used == num_tuples);
- checked_moved += num_tuples;
- }
- Assert(num_moved == checked_moved);
-}
-
-/*
- * vacuum_heap() -- free dead tuples
- *
- * This routine marks dead tuples as unused and truncates the relation
- * if there are "empty" end-blocks.
- */
-static void
-vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages)
-{
- Buffer buf;
- VacPage *vacpage;
- BlockNumber relblocks;
- int nblocks;
- int i;
-
- nblocks = vacuum_pages->num_pages;
- nblocks -= vacuum_pages->empty_end_pages; /* those are handled by truncation below */
-
- for (i = 0, vacpage = vacuum_pages->pagedesc; i < nblocks; i++, vacpage++)
- {
- vacuum_delay_point();
-
- if ((*vacpage)->offsets_free > 0)
- {
- buf = ReadBufferWithStrategy(onerel,
- (*vacpage)->blkno,
- vac_strategy);
- LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
- vacuum_page(onerel, buf, *vacpage);
- UnlockReleaseBuffer(buf);
- }
- }
-
- /* Truncate the relation if there are some empty end-pages */
- Assert(vacrelstats->rel_pages >= vacuum_pages->empty_end_pages);
- if (vacuum_pages->empty_end_pages > 0)
- {
- relblocks = vacrelstats->rel_pages - vacuum_pages->empty_end_pages;
- ereport(elevel,
- (errmsg("\"%s\": truncated %u to %u pages",
- RelationGetRelationName(onerel),
- vacrelstats->rel_pages, relblocks)));
- RelationTruncate(onerel, relblocks);
- vacrelstats->rel_pages = relblocks; /* set new number of blocks */
- }
-}
-
-/*
- * vacuum_page() -- free dead tuples on a page
- * and repair its fragmentation.
- *
- * Caller must hold pin and lock on buffer.
- */
-static void
-vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage)
-{
- Page page = BufferGetPage(buffer);
- int i;
-
- /* There shouldn't be any tuples moved onto the page yet! */
- Assert(vacpage->offsets_used == 0);
-
- START_CRIT_SECTION();
-
- for (i = 0; i < vacpage->offsets_free; i++)
- {
- ItemId itemid = PageGetItemId(page, vacpage->offsets[i]);
-
- ItemIdSetUnused(itemid);
- }
-
- PageRepairFragmentation(page);
-
- MarkBufferDirty(buffer);
-
- /* XLOG stuff */
- if (!onerel->rd_istemp)
- {
- XLogRecPtr recptr;
-
- recptr = log_heap_clean(onerel, buffer,
- NULL, 0, NULL, 0,
- vacpage->offsets, vacpage->offsets_free,
- false);
- PageSetLSN(page, recptr);
- PageSetTLI(page, ThisTimeLineID);
- }
-
- END_CRIT_SECTION();
-}
-
-/*
- * scan_index() -- scan one index relation to update pg_class statistics.
- *
- * We use this when we have no deletions to do.
- */
-static void
-scan_index(Relation indrel, double num_tuples)
-{
- IndexBulkDeleteResult *stats;
- IndexVacuumInfo ivinfo;
- PGRUsage ru0;
-
- pg_rusage_init(&ru0);
-
- ivinfo.index = indrel;
- ivinfo.vacuum_full = true;
- ivinfo.message_level = elevel;
- ivinfo.num_heap_tuples = num_tuples;
- ivinfo.strategy = vac_strategy;
-
- stats = index_vacuum_cleanup(&ivinfo, NULL);
-
- if (!stats)
- return;
-
- /* now update statistics in pg_class */
- vac_update_relstats(RelationGetRelid(indrel),
- stats->num_pages, stats->num_index_tuples,
- false, InvalidTransactionId);
-
- ereport(elevel,
- (errmsg("index \"%s\" now contains %.0f row versions in %u pages",
- RelationGetRelationName(indrel),
- stats->num_index_tuples,
- stats->num_pages),
- errdetail("%u index pages have been deleted, %u are currently reusable.\n"
- "%s.",
- stats->pages_deleted, stats->pages_free,
- pg_rusage_show(&ru0))));
-
- /*
- * Check for tuple count mismatch. If the index is partial, then it's OK
- * for it to have fewer tuples than the heap; otherwise we have a problem.
- */
- if (stats->num_index_tuples != num_tuples)
- {
- if (stats->num_index_tuples > num_tuples ||
- !vac_is_partial_index(indrel))
- ereport(WARNING,
- (errmsg("index \"%s\" contains %.0f row versions, but table contains %.0f row versions",
- RelationGetRelationName(indrel),
- stats->num_index_tuples, num_tuples),
- errhint("Rebuild the index with REINDEX.")));
- }
-
- pfree(stats);
-}
-
-/*
- * vacuum_index() -- vacuum one index relation.
- *
- * vacpagelist is the VacPageList of the heap we're currently vacuuming;
- * the heap itself is already exclusive-locked.  indrel is an index
- * relation on the vacuumed heap.
- *
- * We don't bother to set locks on the index relation here, since
- * the parent table is exclusive-locked already.
- *
- * Finally, we arrange to update the index relation's statistics in
- * pg_class.
- */
-static void
-vacuum_index(VacPageList vacpagelist, Relation indrel,
- double num_tuples, int keep_tuples)
-{
- IndexBulkDeleteResult *stats;
- IndexVacuumInfo ivinfo;
- PGRUsage ru0;
-
- pg_rusage_init(&ru0);
-
- ivinfo.index = indrel;
- ivinfo.vacuum_full = true;
- ivinfo.message_level = elevel;
- ivinfo.num_heap_tuples = num_tuples + keep_tuples;
- ivinfo.strategy = vac_strategy;
-
- /* Do bulk deletion */
- stats = index_bulk_delete(&ivinfo, NULL, tid_reaped, (void *) vacpagelist);
-
- /* Do post-VACUUM cleanup */
- stats = index_vacuum_cleanup(&ivinfo, stats);
-
- if (!stats)
- return;
-
- /* now update statistics in pg_class */
- vac_update_relstats(RelationGetRelid(indrel),
- stats->num_pages, stats->num_index_tuples,
- false, InvalidTransactionId);
-
- ereport(elevel,
- (errmsg("index \"%s\" now contains %.0f row versions in %u pages",
- RelationGetRelationName(indrel),
- stats->num_index_tuples,
- stats->num_pages),
- errdetail("%.0f index row versions were removed.\n"
- "%u index pages have been deleted, %u are currently reusable.\n"
- "%s.",
- stats->tuples_removed,
- stats->pages_deleted, stats->pages_free,
- pg_rusage_show(&ru0))));
-
- /*
- * Check for tuple count mismatch. If the index is partial, then it's OK
- * for it to have fewer tuples than the heap; else we got trouble.
- */
- if (stats->num_index_tuples != num_tuples + keep_tuples)
- {
- if (stats->num_index_tuples > num_tuples + keep_tuples ||
- !vac_is_partial_index(indrel))
- ereport(WARNING,
- (errmsg("index \"%s\" contains %.0f row versions, but table contains %.0f row versions",
- RelationGetRelationName(indrel),
- stats->num_index_tuples, num_tuples + keep_tuples),
- errhint("Rebuild the index with REINDEX.")));
- }
-
- pfree(stats);
-}
-
-/*
- * tid_reaped() -- is a particular tid reaped?
- *
- * This has the right signature to be an IndexBulkDeleteCallback.
- *
- * vacpagelist->pagedesc is sorted in ascending block-number order.
- */
-static bool
-tid_reaped(ItemPointer itemptr, void *state)
-{
- VacPageList vacpagelist = (VacPageList) state;
- OffsetNumber ioffno;
- OffsetNumber *voff;
- VacPage vp,
- *vpp;
- VacPageData vacpage;
-
- vacpage.blkno = ItemPointerGetBlockNumber(itemptr);
- ioffno = ItemPointerGetOffsetNumber(itemptr);
-
- vp = &vacpage;
- vpp = (VacPage *) vac_bsearch((void *) &vp,
- (void *) (vacpagelist->pagedesc),
- vacpagelist->num_pages,
- sizeof(VacPage),
- vac_cmp_blk);
-
- if (vpp == NULL)
- return false;
-
- /* ok - we are on a partially or fully reaped page */
- vp = *vpp;
-
- if (vp->offsets_free == 0)
- {
-		/* an entirely empty page, so every tuple on it counts as reaped */
- return true;
- }
-
- voff = (OffsetNumber *) vac_bsearch((void *) &ioffno,
- (void *) (vp->offsets),
- vp->offsets_free,
- sizeof(OffsetNumber),
- vac_cmp_offno);
-
- if (voff == NULL)
- return false;
-
- /* tid is reaped */
- return true;
-}
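/*
 * tid_reaped() is a two-level binary search: find the page's entry by block
 * number, then probe that page's sorted offset array.  A standalone sketch
 * of the same shape, using toy types and the standard bsearch():
 */
#include <stdio.h>
#include <stdlib.h>

typedef struct
{
	unsigned	blkno;
	int			noffs;
	const unsigned *offs;
} ToyPage;

static int cmp_blk(const void *a, const void *b)
{
	unsigned	l = ((const ToyPage *) a)->blkno;
	unsigned	r = ((const ToyPage *) b)->blkno;

	return (l > r) - (l < r);
}

static int cmp_off(const void *a, const void *b)
{
	unsigned	l = *(const unsigned *) a;
	unsigned	r = *(const unsigned *) b;

	return (l > r) - (l < r);
}

/* is (blkno, off) listed as dead?  mirrors tid_reaped()'s structure */
static int toy_tid_reaped(const ToyPage *pages, int npages,
						  unsigned blkno, unsigned off)
{
	ToyPage		key = {blkno, 0, NULL};
	const ToyPage *vp = bsearch(&key, pages, npages, sizeof(ToyPage), cmp_blk);

	if (vp == NULL)
		return 0;				/* page untouched by vacuum */
	if (vp->noffs == 0)
		return 1;				/* empty page: everything on it is reaped */
	return bsearch(&off, vp->offs, vp->noffs,
				   sizeof(unsigned), cmp_off) != NULL;
}

int main(void)
{
	static const unsigned offs10[] = {2, 5, 9};
	static const ToyPage pages[] = {
		{10, 3, offs10},		/* page 10: offsets 2, 5, 9 are dead */
		{42, 0, NULL},			/* page 42: wholly empty */
	};

	printf("%d %d %d\n",
		   toy_tid_reaped(pages, 2, 10, 5),		/* 1 */
		   toy_tid_reaped(pages, 2, 10, 6),		/* 0 */
		   toy_tid_reaped(pages, 2, 42, 1));	/* 1 */
	return 0;
}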
-
-/*
- * Update the shared Free Space Map with the info we now have about
- * free space in the relation, discarding any old info the map may have.
- */
-static void
-vac_update_fsm(Relation onerel, VacPageList fraged_pages,
- BlockNumber rel_pages)
-{
- int nPages = fraged_pages->num_pages;
- VacPage *pagedesc = fraged_pages->pagedesc;
- Size threshold;
- FSMPageData *pageSpaces;
- int outPages;
- int i;
-
- /*
- * We only report pages with free space at least equal to the average
- * request size --- this avoids cluttering FSM with uselessly-small bits
- * of space. Although FSM would discard pages with little free space
- * anyway, it's important to do this prefiltering because (a) it reduces
- * the time spent holding the FSM lock in RecordRelationFreeSpace, and (b)
- * FSM uses the number of pages reported as a statistic for guiding space
- * management. If we didn't threshold our reports the same way
- * vacuumlazy.c does, we'd be skewing that statistic.
- */
- threshold = GetAvgFSMRequestSize(&onerel->rd_node);
-
- pageSpaces = (FSMPageData *) palloc(nPages * sizeof(FSMPageData));
- outPages = 0;
-
- for (i = 0; i < nPages; i++)
- {
- /*
- * fraged_pages may contain entries for pages that we later decided to
- * truncate from the relation; don't enter them into the free space
- * map!
- */
- if (pagedesc[i]->blkno >= rel_pages)
- break;
-
- if (pagedesc[i]->free >= threshold)
- {
- FSMPageSetPageNum(&pageSpaces[outPages], pagedesc[i]->blkno);
- FSMPageSetSpace(&pageSpaces[outPages], pagedesc[i]->free);
- outPages++;
- }
- }
-
- RecordRelationFreeSpace(&onerel->rd_node, outPages, outPages, pageSpaces);
-
- pfree(pageSpaces);
-}
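/*
 * Two details of the reporting loop above are easy to miss: the early
 * "break" relies on pagedesc being sorted by block number, so everything at
 * or beyond rel_pages (the truncation point) is skipped in one step; and the
 * threshold keeps uselessly small scraps out of the map.  A toy rendition
 * with made-up sizes:
 */
#include <stdio.h>

typedef struct
{
	unsigned	blkno;
	unsigned	freebytes;
} ToyFree;

int main(void)
{
	/* sorted by blkno, as fraged_pages->pagedesc is */
	ToyFree		pages[] = {{3, 900}, {7, 40}, {12, 2100}, {95, 500}};
	unsigned	rel_pages = 90;		/* blocks >= 90 were truncated away */
	unsigned	threshold = 128;	/* stand-in for GetAvgFSMRequestSize() */
	int			reported = 0;

	for (int i = 0; i < 4; i++)
	{
		if (pages[i].blkno >= rel_pages)
			break;				/* sorted, so the rest are truncated too */
		if (pages[i].freebytes >= threshold)
		{
			printf("report block %u: %u bytes free\n",
				   pages[i].blkno, pages[i].freebytes);
			reported++;
		}
	}
	printf("%d of 4 pages reported\n", reported);	/* 2 of 4 */
	return 0;
}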
-
-/* Copy a VacPage structure */
-static VacPage
-copy_vac_page(VacPage vacpage)
-{
- VacPage newvacpage;
-
- /* allocate a VacPageData entry */
- newvacpage = (VacPage) palloc(sizeof(VacPageData) +
- vacpage->offsets_free * sizeof(OffsetNumber));
-
- /* fill it in */
- if (vacpage->offsets_free > 0)
- memcpy(newvacpage->offsets, vacpage->offsets,
- vacpage->offsets_free * sizeof(OffsetNumber));
- newvacpage->blkno = vacpage->blkno;
- newvacpage->free = vacpage->free;
- newvacpage->offsets_used = vacpage->offsets_used;
- newvacpage->offsets_free = vacpage->offsets_free;
-
- return newvacpage;
-}
-
-/*
- * Add a VacPage pointer to a VacPageList.
- *
- * As a side effect of the way that scan_heap works,
- * higher pages come after lower pages in the array
- * (and highest tid on a page is last).
- */
-static void
-vpage_insert(VacPageList vacpagelist, VacPage vpnew)
-{
-#define PG_NPAGEDESC 1024
-
- /* allocate a VacPage entry if needed */
- if (vacpagelist->num_pages == 0)
- {
- vacpagelist->pagedesc = (VacPage *) palloc(PG_NPAGEDESC * sizeof(VacPage));
- vacpagelist->num_allocated_pages = PG_NPAGEDESC;
- }
- else if (vacpagelist->num_pages >= vacpagelist->num_allocated_pages)
- {
- vacpagelist->num_allocated_pages *= 2;
- vacpagelist->pagedesc = (VacPage *) repalloc(vacpagelist->pagedesc, vacpagelist->num_allocated_pages * sizeof(VacPage));
- }
- vacpagelist->pagedesc[vacpagelist->num_pages] = vpnew;
- (vacpagelist->num_pages)++;
-}
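/*
 * vpage_insert() is the classic doubling-array idiom: a fixed initial
 * allocation, then capacity doubles on overflow, keeping appends amortized
 * O(1).  A minimal standalone version, with malloc/realloc standing in for
 * palloc/repalloc (error checks omitted for brevity):
 */
#include <stdio.h>
#include <stdlib.h>

#define TOY_INITIAL 4			/* plays the role of PG_NPAGEDESC */

typedef struct
{
	int		   *items;
	int			num;
	int			allocated;
} ToyList;

static void toy_insert(ToyList *list, int value)
{
	if (list->num == 0)
	{
		list->items = malloc(TOY_INITIAL * sizeof(int));
		list->allocated = TOY_INITIAL;
	}
	else if (list->num >= list->allocated)
	{
		list->allocated *= 2;	/* double, as vpage_insert() does */
		list->items = realloc(list->items, list->allocated * sizeof(int));
	}
	list->items[list->num++] = value;
}

int main(void)
{
	ToyList		list = {NULL, 0, 0};

	for (int i = 0; i < 10; i++)
		toy_insert(&list, i);
	printf("num=%d allocated=%d\n", list.num, list.allocated);	/* 10, 16 */
	free(list.items);
	return 0;
}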
-
-/*
- * vac_bsearch: just like standard C library routine bsearch(),
- * except that we first test to see whether the target key is outside
- * the range of the table entries. This case is handled relatively slowly
- * by the normal binary search algorithm (i.e., no faster than any other key)
- * but it occurs often enough in VACUUM to be worth optimizing.
- */
-static void *
-vac_bsearch(const void *key, const void *base,
- size_t nelem, size_t size,
- int (*compar) (const void *, const void *))
-{
- int res;
- const void *last;
-
- if (nelem == 0)
- return NULL;
- res = compar(key, base);
- if (res < 0)
- return NULL;
- if (res == 0)
- return (void *) base;
- if (nelem > 1)
- {
- last = (const void *) ((const char *) base + (nelem - 1) * size);
- res = compar(key, last);
- if (res > 0)
- return NULL;
- if (res == 0)
- return (void *) last;
- }
- if (nelem <= 2)
- return NULL; /* already checked 'em all */
- return bsearch(key, base, nelem, size, compar);
-}
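/*
 * The payoff of the wrapper is that out-of-range keys (common in VACUUM's
 * workload) are rejected after at most two comparisons rather than a full
 * O(log n) descent.  A standalone demonstration against an int table:
 */
#include <stdio.h>
#include <stdlib.h>

static long ncompares = 0;

static int cmp_int(const void *a, const void *b)
{
	int			l = *(const int *) a;
	int			r = *(const int *) b;

	ncompares++;
	return (l > r) - (l < r);
}

/* same range pre-check as vac_bsearch(), then fall back to bsearch() */
static void *range_bsearch(const void *key, const void *base,
						   size_t nelem, size_t size,
						   int (*compar) (const void *, const void *))
{
	int			res;

	if (nelem == 0)
		return NULL;
	res = compar(key, base);
	if (res < 0)
		return NULL;
	if (res == 0)
		return (void *) base;
	if (nelem > 1)
	{
		const void *last = (const char *) base + (nelem - 1) * size;

		res = compar(key, last);
		if (res > 0)
			return NULL;
		if (res == 0)
			return (void *) last;
	}
	if (nelem <= 2)
		return NULL;			/* both candidates already checked */
	return bsearch(key, base, nelem, size, compar);
}

int main(void)
{
	int			table[1000];
	int			below = -5;
	int			above = 5000;

	for (int i = 0; i < 1000; i++)
		table[i] = 2 * i;		/* 0, 2, 4, ..., 1998 */

	range_bsearch(&below, table, 1000, sizeof(int), cmp_int);
	range_bsearch(&above, table, 1000, sizeof(int), cmp_int);
	printf("two out-of-range probes cost %ld compares\n", ncompares);	/* 3 */
	return 0;
}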
-
-/*
- * Comparator routines for use with qsort() and bsearch().
- */
-static int
-vac_cmp_blk(const void *left, const void *right)
-{
- BlockNumber lblk,
- rblk;
-
- lblk = (*((VacPage *) left))->blkno;
- rblk = (*((VacPage *) right))->blkno;
-
- if (lblk < rblk)
- return -1;
- if (lblk == rblk)
- return 0;
- return 1;
-}
-
-static int
-vac_cmp_offno(const void *left, const void *right)
-{
- if (*(OffsetNumber *) left < *(OffsetNumber *) right)
- return -1;
- if (*(OffsetNumber *) left == *(OffsetNumber *) right)
- return 0;
- return 1;
-}
-
-static int
-vac_cmp_vtlinks(const void *left, const void *right)
-{
- if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi <
- ((VTupleLink) right)->new_tid.ip_blkid.bi_hi)
- return -1;
- if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi >
- ((VTupleLink) right)->new_tid.ip_blkid.bi_hi)
- return 1;
- /* bi_hi-es are equal */
- if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo <
- ((VTupleLink) right)->new_tid.ip_blkid.bi_lo)
- return -1;
- if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo >
- ((VTupleLink) right)->new_tid.ip_blkid.bi_lo)
- return 1;
- /* bi_lo-es are equal */
- if (((VTupleLink) left)->new_tid.ip_posid <
- ((VTupleLink) right)->new_tid.ip_posid)
- return -1;
- if (((VTupleLink) left)->new_tid.ip_posid >
- ((VTupleLink) right)->new_tid.ip_posid)
- return 1;
- return 0;
-}
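/*
 * vac_cmp_vtlinks() is a three-field lexicographic compare over
 * (bi_hi, bi_lo, ip_posid): block number first (split into two 16-bit
 * halves), then offset within the block.  The same logic compacted into a
 * standalone sketch with a toy struct:
 */
#include <stdio.h>
#include <stdlib.h>

typedef struct
{
	unsigned short hi;			/* high half of the block number */
	unsigned short lo;			/* low half of the block number */
	unsigned short pos;			/* offset within the block */
} ToyTid;

/* (l > r) - (l < r) yields -1/0/+1 with no overflow risk */
static int cmp_u16(unsigned short l, unsigned short r)
{
	return (l > r) - (l < r);
}

static int cmp_tid(const void *a, const void *b)
{
	const ToyTid *l = (const ToyTid *) a;
	const ToyTid *r = (const ToyTid *) b;
	int			c;

	if ((c = cmp_u16(l->hi, r->hi)) != 0)
		return c;
	if ((c = cmp_u16(l->lo, r->lo)) != 0)
		return c;
	return cmp_u16(l->pos, r->pos);
}

int main(void)
{
	ToyTid		tids[] = {{0, 5, 3}, {0, 2, 9}, {1, 0, 1}, {0, 2, 4}};

	qsort(tids, 4, sizeof(ToyTid), cmp_tid);
	for (int i = 0; i < 4; i++)		/* (0,2,4) (0,2,9) (0,5,3) (1,0,1) */
		printf("(%u,%u,%u)\n", tids[i].hi, tids[i].lo, tids[i].pos);
	return 0;
}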
-
-
-/*
- * Open all the indexes of the given relation, obtaining the specified kind
- * of lock on each. Return an array of Relation pointers for the indexes
- * into *Irel, and the number of indexes into *nindexes.
- */
-void
-vac_open_indexes(Relation relation, LOCKMODE lockmode,
- int *nindexes, Relation **Irel)
-{
- List *indexoidlist;
- ListCell *indexoidscan;
- int i;
-
- Assert(lockmode != NoLock);
-
- indexoidlist = RelationGetIndexList(relation);
-
- *nindexes = list_length(indexoidlist);
-
- if (*nindexes > 0)
- *Irel = (Relation *) palloc(*nindexes * sizeof(Relation));
- else
- *Irel = NULL;
-
- i = 0;
- foreach(indexoidscan, indexoidlist)
- {
- Oid indexoid = lfirst_oid(indexoidscan);
-
- (*Irel)[i++] = index_open(indexoid, lockmode);
- }
-
- list_free(indexoidlist);
-}
-
/*
 * Release the resources acquired by vac_open_indexes.  Optionally release
 * the locks (say NoLock to keep them).
 */
void
vac_close_indexes(int nindexes, Relation *Irel, LOCKMODE lockmode)
{
	if (Irel == NULL)
		return;

	while (nindexes--)
		index_close(Irel[nindexes], lockmode);

	pfree(Irel);
}
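/*
 * A sketch of the usual pairing of these two helpers (hypothetical caller;
 * lock mode and error handling vary by call site).  Passing NoLock to
 * vac_close_indexes keeps the index locks until transaction end:
 *
 *		Relation   *Irel;
 *		int			nindexes;
 *
 *		vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel);
 *		for (int i = 0; i < nindexes; i++)
 *			... examine or clean Irel[i] ...
 *		vac_close_indexes(nindexes, Irel, NoLock);
 */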
-
-/*
- * Is an index partial (i.e., could it contain fewer tuples than the heap)?
- */
-bool
-vac_is_partial_index(Relation indrel)
-{
- /*
- * If the index's AM doesn't support nulls, it's partial for our purposes
- */
- if (!indrel->rd_am->amindexnulls)
- return true;
-
- /* Otherwise, look to see if there's a partial-index predicate */
- if (!heap_attisnull(indrel->rd_indextuple, Anum_pg_index_indpred))
- return true;
-
- return false;
-}
-
-
-static bool
-enough_space(VacPage vacpage, Size len)
-{
- len = MAXALIGN(len);
-
- if (len > vacpage->free)
- return false;
-
- /* if there are free itemid(s) and len <= free_space... */
- if (vacpage->offsets_used < vacpage->offsets_free)
- return true;
-
-	/* offsets_used >= offsets_free, so we'd have to allocate a new itemid */
- if (len + sizeof(ItemIdData) <= vacpage->free)
- return true;
-
- return false;
-}
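/*
 * The subtlety in enough_space() is the line-pointer accounting: reusing a
 * freed itemid costs only MAXALIGN(len) bytes, while consuming a fresh one
 * costs sizeof(ItemIdData) more.  A worked standalone version with toy
 * constants standing in for the PostgreSQL macros:
 */
#include <stdio.h>
#include <stddef.h>

#define TOY_MAXALIGN(x)	(((x) + 7) & ~((size_t) 7))	/* 8-byte alignment */
#define TOY_ITEMID_SIZE	4		/* plays the role of sizeof(ItemIdData) */

typedef struct
{
	size_t		free;			/* bytes free on the page */
	int			used;			/* itemids consumed by moved-in tuples */
	int			avail;			/* itemids freed by vacuuming */
} ToyPage;

static int toy_enough_space(const ToyPage *pg, size_t len)
{
	len = TOY_MAXALIGN(len);
	if (len > pg->free)
		return 0;
	if (pg->used < pg->avail)	/* a recycled itemid is still available */
		return 1;
	return len + TOY_ITEMID_SIZE <= pg->free;	/* must buy a new itemid */
}

int main(void)
{
	ToyPage		full = {40, 2, 2};	/* 40 bytes free, no spare itemids */
	ToyPage		spare = {40, 1, 2};	/* 40 bytes free, one recycled itemid */

	printf("%d\n", toy_enough_space(&full, 38));	/* 0: 40 + 4 > 40 */
	printf("%d\n", toy_enough_space(&full, 32));	/* 1: 32 + 4 <= 40 */
	printf("%d\n", toy_enough_space(&spare, 38));	/* 1: reuse, 40 <= 40 */
	return 0;
}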
-
-static Size
-PageGetFreeSpaceWithFillFactor(Relation relation, Page page)
-{
- /*
- * It is correct to use PageGetExactFreeSpace() here, *not*
- * PageGetHeapFreeSpace(). This is because (a) we do our own, exact
- * accounting for whether line pointers must be added, and (b) we will
- * recycle any LP_DEAD line pointers before starting to add rows to a
- * page, but that may not have happened yet at the time this function is
- * applied to a page, which means PageGetHeapFreeSpace()'s protection
- * against too many line pointers on a page could fire incorrectly. We do
- * not need that protection here: since VACUUM FULL always recycles all
- * dead line pointers first, it'd be physically impossible to insert more
- * than MaxHeapTuplesPerPage tuples anyway.
- */
- Size freespace = PageGetExactFreeSpace(page);
- Size targetfree;
-
- targetfree = RelationGetTargetPageFreeSpace(relation,
- HEAP_DEFAULT_FILLFACTOR);
- if (freespace > targetfree)
- return freespace - targetfree;
- else
- return 0;
-}
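/*
 * The arithmetic above is worth seeing with numbers: the fillfactor reserves
 * a fraction of every page, and only space beyond that reservation counts as
 * usable for moving tuples in.  A standalone computation assuming a
 * hypothetical 8192-byte page:
 */
#include <stdio.h>

#define TOY_BLCKSZ 8192

/* space the fillfactor reserves: BLCKSZ * (100 - fillfactor) / 100 */
static unsigned target_free(int fillfactor)
{
	return TOY_BLCKSZ * (100 - fillfactor) / 100;
}

/* usable space = exact free space minus the fillfactor reservation */
static unsigned usable(unsigned freespace, int fillfactor)
{
	unsigned	tf = target_free(fillfactor);

	return freespace > tf ? freespace - tf : 0;
}

int main(void)
{
	printf("reserved: %u\n", target_free(90));	/* 819 of every 8192 bytes */
	printf("usable:   %u\n", usable(1000, 90));	/* 181 */
	printf("usable:   %u\n", usable(500, 90));	/* 0 */
	return 0;
}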
-
/*
* vacuum_delay_point --- check for interrupts and cost-based delay.
*