pfree(isnull);
}
+static bool
+heapam_scan_analyze_next_block(TableScanDesc sscan, BlockNumber blockno,
+ BufferAccessStrategy bstrategy)
+{
+ HeapScanDesc scan = (HeapScanDesc) sscan;
+
+ /*
+ * We must maintain a pin on the target page's buffer to ensure that
+ * concurrent activity - e.g. HOT pruning - doesn't delete tuples out from
+ * under us. Hence, pin the page until we are done looking at it. We
+ * also choose to hold sharelock on the buffer throughout --- we could
+ * release and re-acquire sharelock for each tuple, but since we aren't
+ * doing much work per tuple, the extra lock traffic is probably better
+ * avoided.
+ */
+ scan->rs_cblock = blockno;
+ scan->rs_cindex = FirstOffsetNumber;
+ scan->rs_cbuf = ReadBufferExtended(scan->rs_base.rs_rd, MAIN_FORKNUM,
+ blockno, RBM_NORMAL, bstrategy);
+ LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
+
+ /* in heap all blocks can contain tuples, so always return true */
+ return true;
+}
+
+static bool
+heapam_scan_analyze_next_tuple(TableScanDesc sscan, TransactionId OldestXmin,
+ double *liverows, double *deadrows,
+ TupleTableSlot *slot)
+{
+ HeapScanDesc scan = (HeapScanDesc) sscan;
+ Page targpage;
+ OffsetNumber maxoffset;
+ BufferHeapTupleTableSlot *hslot;
+
+ Assert(TTS_IS_BUFFERTUPLE(slot));
+
+ hslot = (BufferHeapTupleTableSlot *) slot;
+ targpage = BufferGetPage(scan->rs_cbuf);
+ maxoffset = PageGetMaxOffsetNumber(targpage);
+
+ /* Inner loop over all tuples on the selected page */
+ for (; scan->rs_cindex <= maxoffset; scan->rs_cindex++)
+ {
+ ItemId itemid;
+ HeapTuple targtuple = &hslot->base.tupdata;
+ bool sample_it = false;
+
+ itemid = PageGetItemId(targpage, scan->rs_cindex);
+
+ /*
+ * We ignore unused and redirect line pointers. DEAD line pointers
+ * should be counted as dead, because we need vacuum to run to get rid
+ * of them. Note that this rule agrees with the way that
+ * heap_page_prune() counts things.
+ */
+ if (!ItemIdIsNormal(itemid))
+ {
+ if (ItemIdIsDead(itemid))
+ *deadrows += 1;
+ continue;
+ }
+
+ ItemPointerSet(&targtuple->t_self, scan->rs_cblock, scan->rs_cindex);
+
+ targtuple->t_tableOid = RelationGetRelid(scan->rs_base.rs_rd);
+ targtuple->t_data = (HeapTupleHeader) PageGetItem(targpage, itemid);
+ targtuple->t_len = ItemIdGetLength(itemid);
+
+ switch (HeapTupleSatisfiesVacuum(targtuple, OldestXmin, scan->rs_cbuf))
+ {
+ case HEAPTUPLE_LIVE:
+ sample_it = true;
+ *liverows += 1;
+ break;
+
+ case HEAPTUPLE_DEAD:
+ case HEAPTUPLE_RECENTLY_DEAD:
+ /* Count dead and recently-dead rows */
+ *deadrows += 1;
+ break;
+
+ case HEAPTUPLE_INSERT_IN_PROGRESS:
+
+ /*
+ * Insert-in-progress rows are not counted. We assume that
+ * when the inserting transaction commits or aborts, it will
+ * send a stats message to increment the proper count. This
+ * works right only if that transaction ends after we finish
+ * analyzing the table; if things happen in the other order,
+ * its stats update will be overwritten by ours. However, the
+ * error will be large only if the other transaction runs long
+ * enough to insert many tuples, so assuming it will finish
+ * after us is the safer option.
+ *
+ * A special case is that the inserting transaction might be
+ * our own. In this case we should count and sample the row,
+ * to accommodate users who load a table and analyze it in one
+ * transaction. (pgstat_report_analyze has to adjust the
+ * numbers we send to the stats collector to make this come
+ * out right.)
+ */
+ if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(targtuple->t_data)))
+ {
+ sample_it = true;
+ *liverows += 1;
+ }
+ break;
+
+ case HEAPTUPLE_DELETE_IN_PROGRESS:
+
+ /*
+ * We count and sample delete-in-progress rows the same as
+ * live ones, so that the stats counters come out right if the
+ * deleting transaction commits after us, per the same
+ * reasoning given above.
+ *
+ * If the delete was done by our own transaction, however, we
+ * must count the row as dead to make pgstat_report_analyze's
+ * stats adjustments come out right. (Note: this works out
+ * properly when the row was both inserted and deleted in our
+ * xact.)
+ *
+ * The net effect of these choices is that we act as though an
+ * IN_PROGRESS transaction hasn't happened yet, except if it
+ * is our own transaction, which we assume has happened.
+ *
+ * This approach ensures that we behave sanely if we see both
+ * the pre-image and post-image rows for a row being updated
+ * by a concurrent transaction: we will sample the pre-image
+ * but not the post-image. We also get sane results if the
+ * concurrent transaction never commits.
+ */
+ if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(targtuple->t_data)))
+ *deadrows += 1;
+ else
+ {
+ sample_it = true;
+ *liverows += 1;
+ }
+ break;
+
+ default:
+ elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
+ break;
+ }
+
+ if (sample_it)
+ {
+ ExecStoreBufferHeapTuple(targtuple, slot, scan->rs_cbuf);
+ scan->rs_cindex++;
+
+ /* note that we leave the buffer locked here! */
+ return true;
+ }
+ }
+
+ /* Now release the lock and pin on the page */
+ UnlockReleaseBuffer(scan->rs_cbuf);
+ scan->rs_cbuf = InvalidBuffer;
+
+ /* also prevent old slot contents from having pin on page */
+ ExecClearTuple(slot);
+
+ return false;
+}
+
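The heap implementation above can sample every block, so heapam_scan_analyze_next_block() always returns true. For contrast, here is a minimal sketch of the same callback for a hypothetical block-based AM that reserves block 0 as a metapage; the exampleam name, the ExampleScanDesc fields, and EXAMPLE_METAPAGE_BLKNO are illustrative assumptions, not part of this patch:

/* Hypothetical AM: reject the metapage, otherwise pin and share-lock. */
static bool
exampleam_scan_analyze_next_block(TableScanDesc sscan, BlockNumber blockno,
                                  BufferAccessStrategy bstrategy)
{
    ExampleScanDesc scan = (ExampleScanDesc) sscan;

    /* block 0 holds only AM metadata and can never contain tuples */
    if (blockno == EXAMPLE_METAPAGE_BLKNO)
        return false;

    /* keep pin and share lock until scan_analyze_next_tuple returns false */
    scan->ex_cblock = blockno;
    scan->ex_cindex = FirstOffsetNumber;
    scan->ex_cbuf = ReadBufferExtended(scan->ex_base.rs_rd, MAIN_FORKNUM,
                                       blockno, RBM_NORMAL, bstrategy);
    LockBuffer(scan->ex_cbuf, BUFFER_LOCK_SHARE);

    return true;
}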
static double
heapam_index_build_range_scan(Relation heapRelation,
Relation indexRelation,
.relation_nontransactional_truncate = heapam_relation_nontransactional_truncate,
.relation_copy_data = heapam_relation_copy_data,
.relation_copy_for_cluster = heapam_relation_copy_for_cluster,
+ .relation_vacuum = heap_vacuum_rel,
+ .scan_analyze_next_block = heapam_scan_analyze_next_block,
+ .scan_analyze_next_tuple = heapam_scan_analyze_next_tuple,
.index_build_range_scan = heapam_index_build_range_scan,
.index_validate_scan = heapam_index_validate_scan,
};
#include <math.h>
#include "access/genam.h"
-#include "access/heapam.h"
#include "access/multixact.h"
#include "access/relation.h"
#include "access/sysattr.h"
#include "access/table.h"
+#include "access/tableam.h"
#include "access/transam.h"
#include "access/tupconvert.h"
#include "access/tuptoaster.h"
TransactionId OldestXmin;
BlockSamplerData bs;
ReservoirStateData rstate;
+ TupleTableSlot *slot;
+ TableScanDesc scan;
Assert(targrows > 0);
/* Prepare for sampling rows */
reservoir_init_selection_state(&rstate, targrows);
+ scan = table_beginscan_analyze(onerel);
+ slot = table_slot_create(onerel, NULL);
+
/* Outer loop over blocks to sample */
while (BlockSampler_HasMore(&bs))
{
BlockNumber targblock = BlockSampler_Next(&bs);
- Buffer targbuffer;
- Page targpage;
- OffsetNumber targoffset,
- maxoffset;
vacuum_delay_point();
- /*
- * We must maintain a pin on the target page's buffer to ensure that
- * the maxoffset value stays good (else concurrent VACUUM might delete
- * tuples out from under us). Hence, pin the page until we are done
- * looking at it. We also choose to hold sharelock on the buffer
- * throughout --- we could release and re-acquire sharelock for each
- * tuple, but since we aren't doing much work per tuple, the extra
- * lock traffic is probably better avoided.
- */
- targbuffer = ReadBufferExtended(onerel, MAIN_FORKNUM, targblock,
- RBM_NORMAL, vac_strategy);
- LockBuffer(targbuffer, BUFFER_LOCK_SHARE);
- targpage = BufferGetPage(targbuffer);
- maxoffset = PageGetMaxOffsetNumber(targpage);
-
- /* Inner loop over all tuples on the selected page */
- for (targoffset = FirstOffsetNumber; targoffset <= maxoffset; targoffset++)
- {
- ItemId itemid;
- HeapTupleData targtuple;
- bool sample_it = false;
-
- itemid = PageGetItemId(targpage, targoffset);
+ if (!table_scan_analyze_next_block(scan, targblock, vac_strategy))
+ continue;
+ while (table_scan_analyze_next_tuple(scan, OldestXmin, &liverows, &deadrows, slot))
+ {
/*
- * We ignore unused and redirect line pointers. DEAD line
- * pointers should be counted as dead, because we need vacuum to
- * run to get rid of them. Note that this rule agrees with the
- * way that heap_page_prune() counts things.
+ * The first targrows sample rows are simply copied into the
+ * reservoir. Then we start replacing tuples in the sample until
+ * we reach the end of the relation. This algorithm is from Jeff
+ * Vitter's paper (see full citation below). It works by
+ * repeatedly computing the number of tuples to skip before
+ * selecting a tuple, which replaces a randomly chosen element of
+ * the reservoir (current set of tuples). At all times the
+ * reservoir is a true random sample of the tuples we've passed
+ * over so far, so when we fall off the end of the relation we're
+ * done.
*/
- if (!ItemIdIsNormal(itemid))
- {
- if (ItemIdIsDead(itemid))
- deadrows += 1;
- continue;
- }
-
- ItemPointerSet(&targtuple.t_self, targblock, targoffset);
-
- targtuple.t_tableOid = RelationGetRelid(onerel);
- targtuple.t_data = (HeapTupleHeader) PageGetItem(targpage, itemid);
- targtuple.t_len = ItemIdGetLength(itemid);
-
- switch (HeapTupleSatisfiesVacuum(&targtuple,
- OldestXmin,
- targbuffer))
- {
- case HEAPTUPLE_LIVE:
- sample_it = true;
- liverows += 1;
- break;
-
- case HEAPTUPLE_DEAD:
- case HEAPTUPLE_RECENTLY_DEAD:
- /* Count dead and recently-dead rows */
- deadrows += 1;
- break;
-
- case HEAPTUPLE_INSERT_IN_PROGRESS:
-
- /*
- * Insert-in-progress rows are not counted. We assume
- * that when the inserting transaction commits or aborts,
- * it will send a stats message to increment the proper
- * count. This works right only if that transaction ends
- * after we finish analyzing the table; if things happen
- * in the other order, its stats update will be
- * overwritten by ours. However, the error will be large
- * only if the other transaction runs long enough to
- * insert many tuples, so assuming it will finish after us
- * is the safer option.
- *
- * A special case is that the inserting transaction might
- * be our own. In this case we should count and sample
- * the row, to accommodate users who load a table and
- * analyze it in one transaction. (pgstat_report_analyze
- * has to adjust the numbers we send to the stats
- * collector to make this come out right.)
- */
- if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(targtuple.t_data)))
- {
- sample_it = true;
- liverows += 1;
- }
- break;
-
- case HEAPTUPLE_DELETE_IN_PROGRESS:
-
- /*
- * We count and sample delete-in-progress rows the same as
- * live ones, so that the stats counters come out right if
- * the deleting transaction commits after us, per the same
- * reasoning given above.
- *
- * If the delete was done by our own transaction, however,
- * we must count the row as dead to make
- * pgstat_report_analyze's stats adjustments come out
- * right. (Note: this works out properly when the row was
- * both inserted and deleted in our xact.)
- *
- * The net effect of these choices is that we act as
- * though an IN_PROGRESS transaction hasn't happened yet,
- * except if it is our own transaction, which we assume
- * has happened.
- *
- * This approach ensures that we behave sanely if we see
- * both the pre-image and post-image rows for a row being
- * updated by a concurrent transaction: we will sample the
- * pre-image but not the post-image. We also get sane
- * results if the concurrent transaction never commits.
- */
- if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(targtuple.t_data)))
- deadrows += 1;
- else
- {
- sample_it = true;
- liverows += 1;
- }
- break;
-
- default:
- elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
- break;
- }
-
- if (sample_it)
+ if (numrows < targrows)
+ rows[numrows++] = ExecCopySlotHeapTuple(slot);
+ else
{
/*
- * The first targrows sample rows are simply copied into the
- * reservoir. Then we start replacing tuples in the sample
- * until we reach the end of the relation. This algorithm is
- * from Jeff Vitter's paper (see full citation below). It
- * works by repeatedly computing the number of tuples to skip
- * before selecting a tuple, which replaces a randomly chosen
- * element of the reservoir (current set of tuples). At all
- * times the reservoir is a true random sample of the tuples
- * we've passed over so far, so when we fall off the end of
- * the relation we're done.
+ * t in Vitter's paper is the number of records already
+ * processed. If we need to compute a new S value, we must
+ * use the not-yet-incremented value of samplerows as t.
*/
- if (numrows < targrows)
- rows[numrows++] = heap_copytuple(&targtuple);
- else
+ if (rowstoskip < 0)
+ rowstoskip = reservoir_get_next_S(&rstate, samplerows, targrows);
+
+ if (rowstoskip <= 0)
{
/*
- * t in Vitter's paper is the number of records already
- * processed. If we need to compute a new S value, we
- * must use the not-yet-incremented value of samplerows as
- * t.
+ * Found a suitable tuple, so save it, replacing one old
+ * tuple at random
*/
- if (rowstoskip < 0)
- rowstoskip = reservoir_get_next_S(&rstate, samplerows, targrows);
-
- if (rowstoskip <= 0)
- {
- /*
- * Found a suitable tuple, so save it, replacing one
- * old tuple at random
- */
- int k = (int) (targrows * sampler_random_fract(rstate.randstate));
+ int k = (int) (targrows * sampler_random_fract(rstate.randstate));
- Assert(k >= 0 && k < targrows);
- heap_freetuple(rows[k]);
- rows[k] = heap_copytuple(&targtuple);
- }
-
- rowstoskip -= 1;
+ Assert(k >= 0 && k < targrows);
+ heap_freetuple(rows[k]);
+ rows[k] = ExecCopySlotHeapTuple(slot);
}
- samplerows += 1;
+ rowstoskip -= 1;
}
- }
- /* Now release the lock and pin on the page */
- UnlockReleaseBuffer(targbuffer);
+ samplerows += 1;
+ }
}
+ ExecDropSingleTupleTableSlot(slot);
+ table_endscan(scan);
+
/*
* If we didn't find as many tuples as we wanted then we're done. No sort
* is needed, since they're already in order.
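The reservoir comment in acquire_sample_rows() above states the sampling invariant rather tersely. As a standalone illustration, this is the basic reservoir-sampling scheme (Vitter's Algorithm R) in simplified form; the real code uses the skip-based Algorithm Z via reservoir_get_next_S(), which avoids drawing a random number for every row, and this sketch is not part of the patch:

/*
 * Simplified reservoir sampling (Algorithm R): after processing any prefix
 * of the input, the reservoir is a uniform random sample of the rows seen
 * so far, which is the invariant acquire_sample_rows() relies on.
 */
#include <stdlib.h>

static void
reservoir_sample(const int *rows, int nrows, int *reservoir, int targrows)
{
    for (int i = 0; i < nrows; i++)
    {
        if (i < targrows)
            reservoir[i] = rows[i]; /* fill the reservoir first */
        else
        {
            /* keep row i with probability targrows / (i + 1) */
            int k = rand() % (i + 1);

            if (k < targrows)
                reservoir[k] = rows[i]; /* replace a random slot */
        }
    }
}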
cluster_rel(relid, InvalidOid, cluster_options);
}
else
- heap_vacuum_rel(onerel, params, vac_strategy);
+ table_relation_vacuum(onerel, params, vac_strategy);
/* Roll back any GUC changes executed by index functions */
AtEOXact_GUC(false, save_nestlevel);
struct BulkInsertStateData;
struct IndexInfo;
struct IndexBuildCallback;
+struct VacuumParams;
struct ValidateIndexState;
/*
* This callback needs to remove all contents from `rel`'s current
- * relfilenode. No provisions for transactional behaviour need to be
- * made. Often this can be implemented by truncating the underlying
- * storage to its minimal size.
+ * relfilenode. No provisions for transactional behaviour need to be made.
+ * Often this can be implemented by truncating the underlying storage to
+ * its minimal size.
*
* See also table_relation_nontransactional_truncate().
*/
TransactionId OldestXmin, TransactionId FreezeXid, MultiXactId MultiXactCutoff,
double *num_tuples, double *tups_vacuumed, double *tups_recently_dead);
+ /*
+ * React to VACUUM command on the relation. The VACUUM can be triggered by
+ * a user or by autovacuum. The specific actions performed by the AM will
+ * depend heavily on the individual AM.
+ *
+ * On entry a transaction is already established, and the relation is
+ * locked with a ShareUpdateExclusive lock.
+ *
+ * Note that neither VACUUM FULL (and CLUSTER), nor ANALYZE go through
+ * this routine, even if (in the latter case) it is part of the same
+ * VACUUM command.
+ *
+ * In the future there will probably need to be a separate callback to
+ * integrate with autovacuum's scheduling.
+ */
+ void (*relation_vacuum) (Relation onerel, struct VacuumParams *params,
+ BufferAccessStrategy bstrategy);
+
+ /*
+ * Prepare to analyze block `blockno` of `scan`. The scan has been started
+ * with table_beginscan_analyze(). See also
+ * table_scan_analyze_next_block().
+ *
+ * The callback may acquire resources like locks that are held until
+ * table_scan_analyze_next_tuple() returns false. For example, it can make
+ * sense to hold a lock until all tuples on a block have been analyzed by
+ * scan_analyze_next_tuple.
+ *
+ * The callback can return false if the block is not suitable for
+ * sampling, e.g. because it's a metapage that could never contain tuples.
+ *
+ * XXX: This obviously is primarily suited for block-based AMs. It's not
+ * clear what a good interface for non block based AMs would be, so don't
+ * try to invent one yet.
+ */
+ bool (*scan_analyze_next_block) (TableScanDesc scan,
+ BlockNumber blockno,
+ BufferAccessStrategy bstrategy);
+
+ /*
+ * See table_scan_analyze_next_tuple().
+ *
+ * Not every AM has a meaningful concept of dead rows, in which case it's
+ * OK not to increment *deadrows - but note that that may influence
+ * autovacuum scheduling (see the comment for the relation_vacuum
+ * callback).
+ */
+ bool (*scan_analyze_next_tuple) (TableScanDesc scan,
+ TransactionId OldestXmin,
+ double *liverows,
+ double *deadrows,
+ TupleTableSlot *slot);
+
/* see table_index_build_range_scan for reference about parameters */
double (*index_build_range_scan) (Relation heap_rel,
Relation index_rel,
tups_recently_dead);
}
+/*
+ * Perform VACUUM on the relation. The VACUUM can be triggered by a user or
+ * by autovacuum. The specific actions performed by the AM will depend
+ * heavily on the individual AM.
+ *
+ * On entry a transaction needs to have been established already, and the
+ * relation is locked with a ShareUpdateExclusive lock.
+ *
+ * Note that neither VACUUM FULL (and CLUSTER), nor ANALYZE go through this
+ * routine, even if (in the latter case) it is part of the same VACUUM
+ * command.
+ */
+static inline void
+table_relation_vacuum(Relation rel, struct VacuumParams *params,
+ BufferAccessStrategy bstrategy)
+{
+ rel->rd_tableam->relation_vacuum(rel, params, bstrategy);
+}
+
+/*
+ * Prepare to analyze block `blockno` of `scan`. The scan needs to have been
+ * started with table_beginscan_analyze(). Note that this routine might
+ * acquire resources like locks that are held until
+ * table_scan_analyze_next_tuple() returns false.
+ *
+ * Returns false if the block is unsuitable for sampling, true otherwise.
+ */
+static inline bool
+table_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno,
+ BufferAccessStrategy bstrategy)
+{
+ return scan->rs_rd->rd_tableam->scan_analyze_next_block(scan, blockno,
+ bstrategy);
+}
+
+/*
+ * Iterate over tuples in the block selected with
+ * table_scan_analyze_next_block() (which needs to have returned true, and
+ * this routine may not have returned false for the same block before). If a
+ * tuple that's suitable for sampling is found, true is returned and a tuple
+ * is stored in `slot`.
+ *
+ * *liverows and *deadrows are incremented according to the encountered
+ * tuples.
+ */
+static inline bool
+table_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
+ double *liverows, double *deadrows,
+ TupleTableSlot *slot)
+{
+ return scan->rs_rd->rd_tableam->scan_analyze_next_tuple(scan, OldestXmin,
+ liverows, deadrows,
+ slot);
+}
+
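In caller terms, the protocol described by the comments above is the one acquire_sample_rows() now follows: one analyze scan, an outer block loop that may skip unsuitable blocks, and an inner tuple loop whose termination also releases whatever per-block resources the AM holds. A condensed sketch, with the sampling logic, error handling, and the surrounding declarations from acquire_sample_rows() omitted:

    scan = table_beginscan_analyze(onerel);
    slot = table_slot_create(onerel, NULL);

    while (BlockSampler_HasMore(&bs))
    {
        BlockNumber targblock = BlockSampler_Next(&bs);

        vacuum_delay_point();

        /* the AM may reject blocks that can never contain tuples */
        if (!table_scan_analyze_next_block(scan, targblock, vac_strategy))
            continue;

        /* per-block pins/locks are held until this returns false */
        while (table_scan_analyze_next_tuple(scan, OldestXmin,
                                             &liverows, &deadrows, slot))
        {
            /* the sampled tuple is available in `slot` */
        }
    }

    ExecDropSingleTupleTableSlot(slot);
    table_endscan(scan);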
/*
* table_index_build_range_scan - scan the table to find tuples to be indexed
*