*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/heap/pruneheap.c,v 1.6 2008/01/01 19:45:46 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/heap/pruneheap.c,v 1.7 2008/03/08 21:57:59 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "utils/inval.h"
+/* Working data for heap_page_prune and subroutines */
+typedef struct
+{
+ TransactionId new_prune_xid; /* new prune hint value for page */
+ int nredirected; /* numbers of entries in arrays below */
+ int ndead;
+ int nunused;
+ /* arrays that accumulate indexes of items to be changed */
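+ /* (redirected[] holds pairs: entry 2*i is the "from" item, 2*i+1 the "to") */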
+ OffsetNumber redirected[MaxHeapTuplesPerPage * 2];
+ OffsetNumber nowdead[MaxHeapTuplesPerPage];
+ OffsetNumber nowunused[MaxHeapTuplesPerPage];
+ /* marked[i] is TRUE if item i is entered in one of the above arrays */
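+ /* (indexed by 1-based OffsetNumber, hence the extra element) */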
+ bool marked[MaxHeapTuplesPerPage + 1];
+} PruneState;
+
/* Local functions */
static int heap_prune_chain(Relation relation, Buffer buffer,
OffsetNumber rootoffnum,
TransactionId OldestXmin,
- OffsetNumber *redirected, int *nredirected,
- OffsetNumber *nowdead, int *ndead,
- OffsetNumber *nowunused, int *nunused,
+ PruneState *prstate,
bool redirect_move);
-static void heap_prune_record_redirect(OffsetNumber *redirected,
- int *nredirected,
- OffsetNumber offnum,
- OffsetNumber rdoffnum);
-static void heap_prune_record_dead(OffsetNumber *nowdead, int *ndead,
- OffsetNumber offnum);
-static void heap_prune_record_unused(OffsetNumber *nowunused, int *nunused,
- OffsetNumber offnum);
+static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid);
+static void heap_prune_record_redirect(PruneState *prstate,
+ OffsetNumber offnum, OffsetNumber rdoffnum);
+static void heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum);
+static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum);
/*
* Checking free space here is questionable since we aren't holding any
* lock on the buffer; in the worst case we could get a bogus answer. It's
* unlikely to be *seriously* wrong, though, since reading either pd_lower
- * or pd_upper is probably atomic. Avoiding taking a lock seems better
- * than sometimes getting a wrong answer in what is after all just a
- * heuristic estimate.
+ * or pd_upper is probably atomic. Avoiding taking a lock seems more
+ * important than sometimes getting a wrong answer in what is after all
+ * just a heuristic estimate.
*/
minfree = RelationGetTargetPageFreeSpace(relation,
HEAP_DEFAULT_FILLFACTOR);
Page page = BufferGetPage(buffer);
OffsetNumber offnum,
maxoff;
- OffsetNumber redirected[MaxHeapTuplesPerPage * 2];
- OffsetNumber nowdead[MaxHeapTuplesPerPage];
- OffsetNumber nowunused[MaxHeapTuplesPerPage];
- int nredirected = 0;
- int ndead = 0;
- int nunused = 0;
- bool page_was_full = false;
- TransactionId save_prune_xid;
-
- START_CRIT_SECTION();
-
- /*
- * Save the current pd_prune_xid and mark the page as clear of prunable
- * tuples. If we find a tuple which may soon become prunable, we shall set
- * the hint again.
- */
- save_prune_xid = ((PageHeader) page)->pd_prune_xid;
- PageClearPrunable(page);
+ PruneState prstate;
/*
- * Also clear the "page is full" flag if it is set, since there's no point
- * in repeating the prune/defrag process until something else happens to
- * the page.
+ * Our strategy is to scan the page and make lists of items to change,
+ * then apply the changes within a critical section. This keeps as
+ * much logic as possible out of the critical section, and also ensures
+ * that WAL replay will work the same as the normal case.
+ *
+ * First, initialize the new pd_prune_xid value to zero (indicating no
+ * prunable tuples). If we find any tuples which may soon become
+ * prunable, we will save the lowest relevant XID in new_prune_xid.
+ * Also initialize the rest of our working state.
*/
- if (PageIsFull(page))
- {
- PageClearFull(page);
- page_was_full = true;
- }
+ prstate.new_prune_xid = InvalidTransactionId;
+ prstate.nredirected = prstate.ndead = prstate.nunused = 0;
+ memset(prstate.marked, 0, sizeof(prstate.marked));
/* Scan the page */
maxoff = PageGetMaxOffsetNumber(page);
offnum <= maxoff;
offnum = OffsetNumberNext(offnum))
{
- ItemId itemid = PageGetItemId(page, offnum);
+ ItemId itemid;
+
+ /* Ignore items already processed as part of an earlier chain */
+ if (prstate.marked[offnum])
+ continue;
/* Nothing to do if slot is empty or already dead */
+ itemid = PageGetItemId(page, offnum);
if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid))
continue;
/* Process this item or chain of items */
ndeleted += heap_prune_chain(relation, buffer, offnum,
OldestXmin,
- redirected, &nredirected,
- nowdead, &ndead,
- nowunused, &nunused,
+ &prstate,
redirect_move);
}
- /* Have we pruned any items? */
- if (nredirected > 0 || ndead > 0 || nunused > 0)
+ /* Any error while applying the changes is critical */
+ START_CRIT_SECTION();
+
+ /* Have we found any prunable items? */
+ if (prstate.nredirected > 0 || prstate.ndead > 0 || prstate.nunused > 0)
{
/*
- * Repair page fragmentation, and update the page's hint bit about
- * whether it has free line pointers.
+ * Apply the planned item changes, then repair page fragmentation,
+ * and update the page's hint bit about whether it has free line
+ * pointers.
+ */
+ heap_page_prune_execute(relation, buffer,
+ prstate.redirected, prstate.nredirected,
+ prstate.nowdead, prstate.ndead,
+ prstate.nowunused, prstate.nunused,
+ redirect_move);
+
+ /*
+ * Update the page's pd_prune_xid field to either zero, or the lowest
+ * XID of any soon-prunable tuple.
*/
- PageRepairFragmentation((Page) page);
+ ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
+
+ /*
+ * Also clear the "page is full" flag, since there's no point in
+ * repeating the prune/defrag process until something else happens to
+ * the page.
+ */
+ PageClearFull(page);
MarkBufferDirty(buffer);
XLogRecPtr recptr;
recptr = log_heap_clean(relation, buffer,
- redirected, nredirected,
- nowdead, ndead,
- nowunused, nunused,
+ prstate.redirected, prstate.nredirected,
+ prstate.nowdead, prstate.ndead,
+ prstate.nowunused, prstate.nunused,
redirect_move);
- PageSetTLI(BufferGetPage(buffer), ThisTimeLineID);
+
PageSetLSN(BufferGetPage(buffer), recptr);
+ PageSetTLI(BufferGetPage(buffer), ThisTimeLineID);
}
}
else
{
/*
- * If we didn't prune anything, but have updated either the
- * pd_prune_xid field or the "page is full" flag, mark the buffer
- * dirty. This is treated as a non-WAL-logged hint.
+ * If we didn't prune anything, but have found a new value for the
+ * pd_prune_xid field, update it and mark the buffer dirty.
+ * This is treated as a non-WAL-logged hint.
+ *
+ * Also clear the "page is full" flag if it is set, since there's no
+ * point in repeating the prune/defrag process until something else
+ * happens to the page.
*/
- if (((PageHeader) page)->pd_prune_xid != save_prune_xid ||
- page_was_full)
+ if (((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid ||
+ PageIsFull(page))
+ {
+ ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
+ PageClearFull(page);
SetBufferCommitInfoNeedsSave(buffer);
+ }
}
END_CRIT_SECTION();
* ndeleted minus ndead, because we don't want to count a now-DEAD root
* item as a deletion for this purpose.
*/
- if (report_stats && ndeleted > ndead)
- pgstat_update_heap_dead_tuples(relation, ndeleted - ndead);
+ if (report_stats && ndeleted > prstate.ndead)
+ pgstat_update_heap_dead_tuples(relation, ndeleted - prstate.ndead);
/*
* XXX Should we update the FSM information of this page?
*
* OldestXmin is the cutoff XID used to identify dead tuples.
*
- * Redirected items are added to the redirected[] array (two entries per
- * redirection); items set to LP_DEAD state are added to nowdead[]; and
- * items set to LP_UNUSED state are added to nowunused[]. (These arrays
- * will be used to generate a WAL record after all chains are pruned.)
+ * We don't actually change the page here, except perhaps for hint-bit updates
+ * caused by HeapTupleSatisfiesVacuum. We just add entries to the arrays in
+ * prstate showing the changes to be made. Items to be redirected are added
+ * to the redirected[] array (two entries per redirection); items to be set to
+ * LP_DEAD state are added to nowdead[]; and items to be set to LP_UNUSED
+ * state are added to nowunused[].
*
- * If redirect_move is true, we get rid of redirecting line pointers.
+ * If redirect_move is true, we intend to get rid of redirecting line pointers,
+ * not just make redirection entries.
*
- * Returns the number of tuples deleted from the page.
+ * Returns the number of tuples (to be) deleted from the page.
*/
static int
heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
TransactionId OldestXmin,
- OffsetNumber *redirected, int *nredirected,
- OffsetNumber *nowdead, int *ndead,
- OffsetNumber *nowunused, int *nunused,
+ PruneState *prstate,
bool redirect_move)
{
int ndeleted = 0;
ItemId rootlp;
HeapTupleHeader htup;
OffsetNumber latestdead = InvalidOffsetNumber,
+ redirect_target = InvalidOffsetNumber,
maxoff = PageGetMaxOffsetNumber(dp),
offnum;
OffsetNumber chainitems[MaxHeapTuplesPerPage];
* XMIN_INVALID heap-only tuples. Those might not be linked to by
* any chain, since the parent tuple might be re-updated before
* any pruning occurs. So we have to be able to reap them
- * separately from chain-pruning.
+ * separately from chain-pruning. (Note that
+ * HeapTupleHeaderIsHotUpdated will never return true for an
+ * XMIN_INVALID tuple, so this code will work even when there were
+ * sequential updates within the aborted transaction.)
*
* Note that we might first arrive at a dead heap-only tuple
* either here or while following a chain below. Whichever path
if (HeapTupleSatisfiesVacuum(htup, OldestXmin, buffer)
== HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup))
{
- ItemIdSetUnused(rootlp);
- heap_prune_record_unused(nowunused, nunused, rootoffnum);
+ heap_prune_record_unused(prstate, rootoffnum);
ndeleted++;
}
if (offnum < FirstOffsetNumber || offnum > maxoff)
break;
+ /* If item is already processed, stop --- it must not be the same chain */
+ if (prstate->marked[offnum])
+ break;
+
lp = PageGetItemId(dp, offnum);
+ /* Unused item obviously isn't part of the chain */
if (!ItemIdIsUsed(lp))
break;
* This tuple may soon become DEAD. Update the hint field so
* that the page is reconsidered for pruning in future.
*/
- PageSetPrunable(dp, HeapTupleHeaderGetXmax(htup));
+ heap_prune_record_prunable(prstate,
+ HeapTupleHeaderGetXmax(htup));
break;
case HEAPTUPLE_DELETE_IN_PROGRESS:
* This tuple may soon become DEAD. Update the hint field so
* that the page is reconsidered for pruning in future.
*/
- PageSetPrunable(dp, HeapTupleHeaderGetXmax(htup));
+ heap_prune_record_prunable(prstate,
+ HeapTupleHeaderGetXmax(htup));
break;
case HEAPTUPLE_LIVE:
*/
for (i = 1; (i < nchain) && (chainitems[i - 1] != latestdead); i++)
{
- ItemId lp = PageGetItemId(dp, chainitems[i]);
-
- ItemIdSetUnused(lp);
- heap_prune_record_unused(nowunused, nunused, chainitems[i]);
+ heap_prune_record_unused(prstate, chainitems[i]);
ndeleted++;
}
* redirect the root to the correct chain member.
*/
if (i >= nchain)
- {
- ItemIdSetDead(rootlp);
- heap_prune_record_dead(nowdead, ndead, rootoffnum);
- }
+ heap_prune_record_dead(prstate, rootoffnum);
else
{
- ItemIdSetRedirect(rootlp, chainitems[i]);
- heap_prune_record_redirect(redirected, nredirected,
- rootoffnum,
- chainitems[i]);
+ heap_prune_record_redirect(prstate, rootoffnum, chainitems[i]);
+ /* If the redirection will be a move, need more processing */
+ if (redirect_move)
+ redirect_target = chainitems[i];
}
}
else if (nchain < 2 && ItemIdIsRedirected(rootlp))
* redirect item. We can clean up by setting the redirect item to
* DEAD state.
*/
- ItemIdSetDead(rootlp);
- heap_prune_record_dead(nowdead, ndead, rootoffnum);
+ heap_prune_record_dead(prstate, rootoffnum);
+ }
+ else if (redirect_move && ItemIdIsRedirected(rootlp))
+ {
+ /*
+ * If we desire to eliminate LP_REDIRECT items by moving tuples,
+ * make a redirection entry for each redirected root item; this
+ * will cause heap_page_prune_execute to actually do the move.
+ * (We get here only when there are no DEAD tuples in the chain;
+ * otherwise the redirection entry was made above.)
+ */
+ heap_prune_record_redirect(prstate, rootoffnum, chainitems[1]);
+ redirect_target = chainitems[1];
}
/*
- * If requested, eliminate LP_REDIRECT items by moving tuples. Note that
- * if the root item is LP_REDIRECT and doesn't point to a valid follow-on
- * item, we already killed it above.
+ * If we are going to implement a redirect by moving tuples, we have
+ * to issue a cache invalidation against the redirection target tuple,
+ * because its CTID will be effectively changed by the move. It is
+ * safe to do this now, even though we might fail before reaching the
+ * actual page update, because a useless cache invalidation doesn't
+ * hurt anything. Furthermore we really, really don't want to issue
+ * the inval inside the critical section, since it has a nonzero
+ * chance of causing an error (eg, due to running out of memory).
+ *
+ * XXX this is not really right because CacheInvalidateHeapTuple
+ * expects transactional semantics, and our move isn't transactional.
+ * FIXME!!
*/
- if (redirect_move && ItemIdIsRedirected(rootlp))
+ if (OffsetNumberIsValid(redirect_target))
{
- OffsetNumber firstoffnum = ItemIdGetRedirect(rootlp);
- ItemId firstlp = PageGetItemId(dp, firstoffnum);
+ ItemId firstlp = PageGetItemId(dp, redirect_target);
HeapTupleData firsttup;
Assert(ItemIdIsNormal(firstlp));
firsttup.t_len = ItemIdGetLength(firstlp);
ItemPointerSet(&firsttup.t_self,
BufferGetBlockNumber(buffer),
- firstoffnum);
+ redirect_target);
firsttup.t_tableOid = RelationGetRelid(relation);
-
- /*
- * Mark the tuple for invalidation. Needed because we're changing its
- * CTID.
- */
CacheInvalidateHeapTuple(relation, &firsttup);
-
- /*
- * Change heap-only status of the tuple because after the line pointer
- * manipulation, it's no longer a heap-only tuple, but is directly
- * pointed to by index entries.
- */
- Assert(HeapTupleIsHeapOnly(&firsttup));
- HeapTupleClearHeapOnly(&firsttup);
-
- /* Now move the item pointer */
- *rootlp = *firstlp;
- ItemIdSetUnused(firstlp);
-
- /*
- * If latestdead is valid, we have already recorded the redirection
- * above. Otherwise, do it now.
- *
- * We don't record firstlp in the nowunused[] array, since the
- * redirection entry is enough to tell heap_xlog_clean what to do.
- */
- if (!OffsetNumberIsValid(latestdead))
- heap_prune_record_redirect(redirected, nredirected, rootoffnum,
- firstoffnum);
}
return ndeleted;
}
+/* Record lowest soon-prunable XID */
+static void
+heap_prune_record_prunable(PruneState *prstate, TransactionId xid)
+{
+ /*
+ * This should exactly match the PageSetPrunable macro. We can't store
+ * directly into the page header yet, so we keep the value in our
+ * working state instead.
+ */
+ Assert(TransactionIdIsNormal(xid));
+ if (!TransactionIdIsValid(prstate->new_prune_xid) ||
+ TransactionIdPrecedes(xid, prstate->new_prune_xid))
+ prstate->new_prune_xid = xid;
+}
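+
+/*
+ * For reference, a paraphrased sketch (not the authoritative definition) of
+ * the bufpage.h macro that the function above is meant to mirror:
+ *
+ *	#define PageSetPrunable(page, xid) \
+ *	do { \
+ *		Assert(TransactionIdIsNormal(xid)); \
+ *		if (!TransactionIdIsValid(((PageHeader) (page))->pd_prune_xid) || \
+ *			TransactionIdPrecedes(xid, ((PageHeader) (page))->pd_prune_xid)) \
+ *			((PageHeader) (page))->pd_prune_xid = (xid); \
+ *	} while (0)
+ */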
-/* Record newly-redirected item pointer */
+/* Record item pointer to be redirected */
static void
-heap_prune_record_redirect(OffsetNumber *redirected, int *nredirected,
+heap_prune_record_redirect(PruneState *prstate,
OffsetNumber offnum, OffsetNumber rdoffnum)
{
- Assert(*nredirected < MaxHeapTuplesPerPage);
- redirected[*nredirected * 2] = offnum;
- redirected[*nredirected * 2 + 1] = rdoffnum;
- (*nredirected)++;
+ Assert(prstate->nredirected < MaxHeapTuplesPerPage);
+ prstate->redirected[prstate->nredirected * 2] = offnum;
+ prstate->redirected[prstate->nredirected * 2 + 1] = rdoffnum;
+ prstate->nredirected++;
+ Assert(!prstate->marked[offnum]);
+ prstate->marked[offnum] = true;
+ Assert(!prstate->marked[rdoffnum]);
+ prstate->marked[rdoffnum] = true;
}
-/* Record newly-dead item pointer */
+/* Record item pointer to be marked dead */
static void
-heap_prune_record_dead(OffsetNumber *nowdead, int *ndead,
- OffsetNumber offnum)
+heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum)
{
- Assert(*ndead < MaxHeapTuplesPerPage);
- nowdead[*ndead] = offnum;
- (*ndead)++;
+ Assert(prstate->ndead < MaxHeapTuplesPerPage);
+ prstate->nowdead[prstate->ndead] = offnum;
+ prstate->ndead++;
+ Assert(!prstate->marked[offnum]);
+ prstate->marked[offnum] = true;
}
-/* Record newly-unused item pointer */
+/* Record item pointer to be marked unused */
static void
-heap_prune_record_unused(OffsetNumber *nowunused, int *nunused,
- OffsetNumber offnum)
+heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum)
{
- Assert(*nunused < MaxHeapTuplesPerPage);
- nowunused[*nunused] = offnum;
- (*nunused)++;
+ Assert(prstate->nunused < MaxHeapTuplesPerPage);
+ prstate->nowunused[prstate->nunused] = offnum;
+ prstate->nunused++;
+ Assert(!prstate->marked[offnum]);
+ prstate->marked[offnum] = true;
+}
+
+
+/*
+ * Perform the actual page changes needed by heap_page_prune.
+ * It is expected that the caller has suitable pin and lock on the
+ * buffer, and is inside a critical section.
+ *
+ * This is split out because it is also used by heap_xlog_clean()
+ * to replay the WAL record when needed after a crash. Note that the
+ * arguments are identical to those of log_heap_clean().
+ */
+void
+heap_page_prune_execute(Relation reln, Buffer buffer,
+ OffsetNumber *redirected, int nredirected,
+ OffsetNumber *nowdead, int ndead,
+ OffsetNumber *nowunused, int nunused,
+ bool redirect_move)
+{
+ Page page = (Page) BufferGetPage(buffer);
+ OffsetNumber *offnum;
+ int i;
+
+ /* Update all redirected or moved line pointers */
+ offnum = redirected;
+ for (i = 0; i < nredirected; i++)
+ {
+ OffsetNumber fromoff = *offnum++;
+ OffsetNumber tooff = *offnum++;
+ ItemId fromlp = PageGetItemId(page, fromoff);
+
+ if (redirect_move)
+ {
+ /* Physically move the "to" item to the "from" slot */
+ ItemId tolp = PageGetItemId(page, tooff);
+ HeapTupleHeader htup;
+
+ *fromlp = *tolp;
+ ItemIdSetUnused(tolp);
+
+ /*
+ * Change heap-only status of the tuple because after the line
+ * pointer manipulation, it's no longer a heap-only tuple, but is
+ * directly pointed to by index entries.
+ */
+ Assert(ItemIdIsNormal(fromlp));
+ htup = (HeapTupleHeader) PageGetItem(page, fromlp);
+ Assert(HeapTupleHeaderIsHeapOnly(htup));
+ HeapTupleHeaderClearHeapOnly(htup);
+ }
+ else
+ {
+ /* Just insert a REDIRECT link at fromoff */
+ ItemIdSetRedirect(fromlp, tooff);
+ }
+ }
+
+ /* Update all now-dead line pointers */
+ offnum = nowdead;
+ for (i = 0; i < ndead; i++)
+ {
+ OffsetNumber off = *offnum++;
+ ItemId lp = PageGetItemId(page, off);
+
+ ItemIdSetDead(lp);
+ }
+
+ /* Update all now-unused line pointers */
+ offnum = nowunused;
+ for (i = 0; i < nunused; i++)
+ {
+ OffsetNumber off = *offnum++;
+ ItemId lp = PageGetItemId(page, off);
+
+ ItemIdSetUnused(lp);
+ }
+
+ /*
+ * Finally, repair any fragmentation, and update the page's hint bit about
+ * whether it has free line pointers.
+ */
+ PageRepairFragmentation(page);
}
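+
+/*
+ * For orientation, a rough sketch (approximate names, not the authoritative
+ * code) of how the replay side consumes this: heap_xlog_clean() decodes the
+ * three offset arrays from the WAL record payload, whose layout matches what
+ * log_heap_clean() wrote, and passes them straight through:
+ *
+ *	end = (OffsetNumber *) ((char *) xlrec + record->xl_len);
+ *	redirected = (OffsetNumber *) ((char *) xlrec + SizeOfHeapClean);
+ *	nowdead = redirected + (nredirected * 2);
+ *	nowunused = nowdead + ndead;
+ *	nunused = end - nowunused;
+ *
+ *	heap_page_prune_execute(reln, buffer,
+ *							redirected, nredirected,
+ *							nowdead, ndead,
+ *							nowunused, nunused,
+ *							clean_move);
+ */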