From: Tom Lane Date: Fri, 26 Aug 2005 20:07:17 +0000 (+0000) Subject: Back-patch fixes for problems with VACUUM destroying t_ctid chains too soon, X-Git-Tag: REL7_3_11~3 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=26f1202ca318459753a39b2ced5cb6ea9cd8ab8d;p=postgresql Back-patch fixes for problems with VACUUM destroying t_ctid chains too soon, and with insufficient paranoia in code that follows t_ctid links. This patch covers the 7.3 branch. --- diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index b4b20c679b..f12a76f497 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/heap/heapam.c,v 1.149.2.1 2004/10/13 22:22:21 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/heap/heapam.c,v 1.149.2.2 2005/08/26 20:07:15 tgl Exp $ * * * INTERFACE ROUTINES @@ -1014,89 +1014,136 @@ heap_fetch(Relation relation, /* * heap_get_latest_tid - get the latest tid of a specified tuple + * + * Actually, this gets the latest version that is visible according to + * the passed snapshot. You can pass SnapshotDirty to get the very latest, + * possibly uncommitted version. + * + * *tid is both an input and an output parameter: it is updated to + * show the latest version of the row. Note that it will not be changed + * if no version of the row passes the snapshot test. */ -ItemPointer +void heap_get_latest_tid(Relation relation, Snapshot snapshot, ItemPointer tid) { - ItemId lp = NULL; - Buffer buffer; - PageHeader dp; - OffsetNumber offnum; - HeapTupleData tp; - HeapTupleHeader t_data; + BlockNumber blk; ItemPointerData ctid; - bool invalidBlock, - linkend, - valid; + TransactionId priorXmax; + + /* this is to avoid Assert failures on bad input */ + if (!ItemPointerIsValid(tid)) + return; /* - * get the buffer from the relation descriptor Note that this does a - * buffer pin. + * Since this can be called with user-supplied TID, don't trust the + * input too much. (RelationGetNumberOfBlocks is an expensive check, + * so we don't check t_ctid links again this way. Note that it would + * not do to call it just once and save the result, either.) */ - - buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); - - if (!BufferIsValid(buffer)) - elog(ERROR, "heap_get_latest_tid: %s relation: ReadBuffer(%lx) failed", - RelationGetRelationName(relation), (long) tid); - - LockBuffer(buffer, BUFFER_LOCK_SHARE); + blk = ItemPointerGetBlockNumber(tid); + if (blk >= RelationGetNumberOfBlocks(relation)) + elog(ERROR, "block number %u is out of range for relation \"%s\"", + blk, RelationGetRelationName(relation)); /* - * get the item line pointer corresponding to the requested tid + * Loop to chase down t_ctid links. At top of loop, ctid is the + * tuple we need to examine, and *tid is the TID we will return if + * ctid turns out to be bogus. + * + * Note that we will loop until we reach the end of the t_ctid chain. + * Depending on the snapshot passed, there might be at most one visible + * version of the row, but we don't try to optimize for that. */ - dp = (PageHeader) BufferGetPage(buffer); - offnum = ItemPointerGetOffsetNumber(tid); - invalidBlock = true; - if (!PageIsNew(dp)) - { - lp = PageGetItemId(dp, offnum); - if (ItemIdIsUsed(lp)) - invalidBlock = false; - } - if (invalidBlock) + ctid = *tid; + priorXmax = InvalidTransactionId; /* cannot check first XMIN */ + for (;;) { - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - ReleaseBuffer(buffer); - return NULL; - } + Buffer buffer; + PageHeader dp; + OffsetNumber offnum; + ItemId lp; + HeapTupleData tp; + bool valid; - /* - * more sanity checks - */ + /* + * Read, pin, and lock the page. + */ + buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid)); - tp.t_datamcxt = NULL; - t_data = tp.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp); - tp.t_len = ItemIdGetLength(lp); - tp.t_self = *tid; - ctid = tp.t_data->t_ctid; + if (!BufferIsValid(buffer)) + elog(ERROR, "ReadBuffer(\"%s\", %lu) failed", + RelationGetRelationName(relation), + (unsigned long) ItemPointerGetBlockNumber(&ctid)); - /* - * check time qualification of tid - */ + LockBuffer(buffer, BUFFER_LOCK_SHARE); + dp = (PageHeader) BufferGetPage(buffer); - HeapTupleSatisfies(&tp, relation, buffer, dp, - snapshot, 0, (ScanKey) NULL, valid); + /* + * Check for bogus item number. This is not treated as an error + * condition because it can happen while following a t_ctid link. + * We just assume that the prior tid is OK and return it unchanged. + */ + offnum = ItemPointerGetOffsetNumber(&ctid); + if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp)) + { + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + break; + } + lp = PageGetItemId(dp, offnum); + if (!ItemIdIsUsed(lp)) + { + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + break; + } - linkend = true; - if ((t_data->t_infomask & HEAP_XMIN_COMMITTED) != 0 && - !ItemPointerEquals(tid, &ctid)) - linkend = false; + /* OK to access the tuple */ + tp.t_self = ctid; + tp.t_datamcxt = NULL; + tp.t_data = (HeapTupleHeader) PageGetItem(dp, lp); + tp.t_len = ItemIdGetLength(lp); - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - ReleaseBuffer(buffer); + /* + * After following a t_ctid link, we might arrive at an unrelated + * tuple. Check for XMIN match. + */ + if (TransactionIdIsValid(priorXmax) && + !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data))) + { + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + break; + } - if (!valid) - { - if (linkend) - return NULL; - heap_get_latest_tid(relation, snapshot, &ctid); - *tid = ctid; - } + /* + * Check time qualification of tuple; if visible, set it as the new + * result candidate. + */ + HeapTupleSatisfies(&tp, relation, buffer, dp, + snapshot, 0, NULL, valid); + if (valid) + *tid = ctid; + + /* + * If there's a valid t_ctid link, follow it, else we're done. + */ + if ((tp.t_data->t_infomask & (HEAP_XMAX_INVALID | + HEAP_MARKED_FOR_UPDATE)) || + ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid)) + { + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + break; + } - return tid; + ctid = tp.t_data->t_ctid; + priorXmax = HeapTupleHeaderGetXmax(tp.t_data); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + } /* end of loop */ } /* @@ -1264,7 +1311,8 @@ simple_heap_insert(Relation relation, HeapTuple tup) */ int heap_delete(Relation relation, ItemPointer tid, - ItemPointer ctid, CommandId cid) + ItemPointer ctid, TransactionId *update_xmax, + CommandId cid) { ItemId lp; HeapTupleData tp; @@ -1288,11 +1336,11 @@ heap_delete(Relation relation, ItemPointer tid, dp = (PageHeader) BufferGetPage(buffer); lp = PageGetItemId(dp, ItemPointerGetOffsetNumber(tid)); + tp.t_datamcxt = NULL; - tp.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp); + tp.t_data = (HeapTupleHeader) PageGetItem(dp, lp); tp.t_len = ItemIdGetLength(lp); tp.t_self = *tid; - tp.t_tableOid = relation->rd_id; l1: sv_infomask = tp.t_data->t_infomask; @@ -1339,7 +1387,9 @@ l1: if (result != HeapTupleMayBeUpdated) { Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated); + Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID)); *ctid = tp.t_data->t_ctid; + *update_xmax = HeapTupleHeaderGetXmax(tp.t_data); LockBuffer(buffer, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buffer); return result; @@ -1429,10 +1479,13 @@ l1: void simple_heap_delete(Relation relation, ItemPointer tid) { - ItemPointerData ctid; int result; + ItemPointerData update_ctid; + TransactionId update_xmax; - result = heap_delete(relation, tid, &ctid, GetCurrentCommandId()); + result = heap_delete(relation, tid, + &update_ctid, &update_xmax, + GetCurrentCommandId()); switch (result) { case HeapTupleSelfUpdated: @@ -1462,7 +1515,8 @@ simple_heap_delete(Relation relation, ItemPointer tid) */ int heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, - ItemPointer ctid, CommandId cid) + ItemPointer ctid, TransactionId *update_xmax, + CommandId cid) { ItemId lp; HeapTupleData oldtup; @@ -1547,7 +1601,9 @@ l2: if (result != HeapTupleMayBeUpdated) { Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated); + Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)); *ctid = oldtup.t_data->t_ctid; + *update_xmax = HeapTupleHeaderGetXmax(oldtup.t_data); LockBuffer(buffer, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buffer); return result; @@ -1767,10 +1823,13 @@ l2: void simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup) { - ItemPointerData ctid; int result; + ItemPointerData update_ctid; + TransactionId update_xmax; - result = heap_update(relation, otid, tup, &ctid, GetCurrentCommandId()); + result = heap_update(relation, otid, tup, + &update_ctid, &update_xmax, + GetCurrentCommandId()); switch (result) { case HeapTupleSelfUpdated: @@ -1794,9 +1853,34 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup) /* * heap_mark4update - mark a tuple for update + * + * Note that this acquires a buffer pin, which the caller must release. + * + * Input parameters: + * relation: relation containing tuple (caller must hold suitable lock) + * tuple->t_self: TID of tuple to lock (rest of struct need not be valid) + * cid: current command ID (used for visibility test, and stored into + * tuple's cmax if lock is successful) + * + * Output parameters: + * *tuple: all fields filled in + * *buffer: set to buffer holding tuple (pinned but not locked at exit) + * *ctid: set to tuple's t_ctid, but only in failure cases + * *update_xmax: set to tuple's xmax, but only in failure cases + * + * Function result may be: + * HeapTupleMayBeUpdated: lock was successfully acquired + * HeapTupleSelfUpdated: lock failed because tuple updated by self + * HeapTupleUpdated: lock failed because tuple updated by other xact + * + * In the failure cases, the routine returns the tuple's t_ctid and t_xmax. + * If t_ctid is the same as t_self, the tuple was deleted; if different, the + * tuple was updated, and t_ctid is the location of the replacement tuple. + * (t_xmax is needed to verify that the replacement tuple matches.) */ int heap_mark4update(Relation relation, HeapTuple tuple, Buffer *buffer, + ItemPointer ctid, TransactionId *update_xmax, CommandId cid) { ItemPointer tid = &(tuple->t_self); @@ -1818,9 +1902,12 @@ heap_mark4update(Relation relation, HeapTuple tuple, Buffer *buffer, dp = (PageHeader) BufferGetPage(*buffer); lp = PageGetItemId(dp, ItemPointerGetOffsetNumber(tid)); + Assert(ItemIdIsUsed(lp)); + tuple->t_datamcxt = NULL; tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp); tuple->t_len = ItemIdGetLength(lp); + tuple->t_tableOid = RelationGetRelid(relation); l3: sv_infomask = tuple->t_data->t_infomask; @@ -1867,7 +1954,9 @@ l3: if (result != HeapTupleMayBeUpdated) { Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated); - tuple->t_self = tuple->t_data->t_ctid; + Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID)); + *ctid = tuple->t_data->t_ctid; + *update_xmax = HeapTupleHeaderGetXmax(tuple->t_data); LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); return result; } diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 09c593db37..125a9d2063 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/commands/trigger.c,v 1.136.2.2 2003/05/19 17:23:54 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/commands/trigger.c,v 1.136.2.3 2005/08/26 20:07:15 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1351,14 +1351,18 @@ GetTupleForTrigger(EState *estate, ResultRelInfo *relinfo, if (newSlot != NULL) { int test; + ItemPointerData update_ctid; + TransactionId update_xmax; + + *newSlot = NULL; /* * mark tuple for update */ - *newSlot = NULL; - tuple.t_self = *tid; ltrmark:; - test = heap_mark4update(relation, &tuple, &buffer, cid); + tuple.t_self = *tid; + test = heap_mark4update(relation, &tuple, &buffer, + &update_ctid, &update_xmax, cid); switch (test) { case HeapTupleSelfUpdated: @@ -1373,15 +1377,18 @@ ltrmark:; ReleaseBuffer(buffer); if (XactIsoLevel == XACT_SERIALIZABLE) elog(ERROR, "Can't serialize access due to concurrent update"); - else if (!(ItemPointerEquals(&(tuple.t_self), tid))) + else if (!ItemPointerEquals(&update_ctid, &tuple.t_self)) { - TupleTableSlot *epqslot = EvalPlanQual(estate, - relinfo->ri_RangeTableIndex, - &(tuple.t_self)); - - if (!(TupIsNull(epqslot))) + /* it was updated, so look at the updated version */ + TupleTableSlot *epqslot; + + epqslot = EvalPlanQual(estate, + relinfo->ri_RangeTableIndex, + &update_ctid, + update_xmax); + if (!TupIsNull(epqslot)) { - *tid = tuple.t_self; + *tid = update_ctid; *newSlot = epqslot; goto ltrmark; } @@ -1418,6 +1425,7 @@ ltrmark:; tuple.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp); tuple.t_len = ItemIdGetLength(lp); tuple.t_self = *tid; + tuple.t_tableOid = RelationGetRelid(relation); } result = heap_copytuple(&tuple); diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 5108268925..0b40fbde61 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -13,7 +13,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.244 2002/10/31 19:25:29 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.244.2.1 2005/08/26 20:07:16 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1646,8 +1646,6 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, Buffer Cbuf = buf; bool freeCbuf = false; bool chain_move_failed = false; - Page Cpage; - ItemId Citemid; ItemPointerData Ctid; HeapTupleData tp = tuple; Size tlen = tuple_len; @@ -1671,68 +1669,85 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, break; /* out of walk-along-page loop */ } - vtmove = (VTupleMove) palloc(100 * sizeof(VTupleMoveData)); - num_vtmove = 0; - free_vtmove = 100; - /* * If this tuple is in the begin/middle of the chain then - * we have to move to the end of chain. + * we have to move to the end of chain. As with any + * t_ctid chase, we have to verify that each new tuple + * is really the descendant of the tuple we came from. */ while (!(tp.t_data->t_infomask & (HEAP_XMAX_INVALID | HEAP_MARKED_FOR_UPDATE)) && !(ItemPointerEquals(&(tp.t_self), &(tp.t_data->t_ctid)))) { - Ctid = tp.t_data->t_ctid; - if (freeCbuf) - ReleaseBuffer(Cbuf); - freeCbuf = true; - Cbuf = ReadBuffer(onerel, - ItemPointerGetBlockNumber(&Ctid)); - Cpage = BufferGetPage(Cbuf); - Citemid = PageGetItemId(Cpage, - ItemPointerGetOffsetNumber(&Ctid)); - if (!ItemIdIsUsed(Citemid)) + ItemPointerData nextTid; + TransactionId priorXmax; + Buffer nextBuf; + Page nextPage; + OffsetNumber nextOffnum; + ItemId nextItemid; + HeapTupleHeader nextTdata; + + nextTid = tp.t_data->t_ctid; + priorXmax = HeapTupleHeaderGetXmax(tp.t_data); + /* assume block# is OK (see heap_fetch comments) */ + nextBuf = ReadBuffer(onerel, + ItemPointerGetBlockNumber(&nextTid)); + nextPage = BufferGetPage(nextBuf); + /* If bogus or unused slot, assume tp is end of chain */ + nextOffnum = ItemPointerGetOffsetNumber(&nextTid); + if (nextOffnum < FirstOffsetNumber || + nextOffnum > PageGetMaxOffsetNumber(nextPage)) { - /* - * This means that in the middle of chain there - * was tuple updated by older (than OldestXmin) - * xaction and this tuple is already deleted by - * me. Actually, upper part of chain should be - * removed and seems that this should be handled - * in scan_heap(), but it's not implemented at the - * moment and so we just stop shrinking here. - */ - elog(DEBUG1, "Child itemid in update-chain marked as unused - can't continue repair_frag"); - chain_move_failed = true; - break; /* out of loop to move to chain end */ + ReleaseBuffer(nextBuf); + break; } + nextItemid = PageGetItemId(nextPage, nextOffnum); + if (!ItemIdIsUsed(nextItemid)) + { + ReleaseBuffer(nextBuf); + break; + } + /* if not matching XMIN, assume tp is end of chain */ + nextTdata = (HeapTupleHeader) PageGetItem(nextPage, + nextItemid); + if (!TransactionIdEquals(HeapTupleHeaderGetXmin(nextTdata), + priorXmax)) + { + ReleaseBuffer(nextBuf); + break; + } + /* OK, switch our attention to the next tuple in chain */ tp.t_datamcxt = NULL; - tp.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid); - tp.t_self = Ctid; - tlen = tp.t_len = ItemIdGetLength(Citemid); - } - if (chain_move_failed) - { + tp.t_data = nextTdata; + tp.t_self = nextTid; + tlen = tp.t_len = ItemIdGetLength(nextItemid); if (freeCbuf) ReleaseBuffer(Cbuf); - pfree(vtmove); - break; /* out of walk-along-page loop */ + Cbuf = nextBuf; + freeCbuf = true; } + /* Set up workspace for planning the chain move */ + vtmove = (VTupleMove) palloc(100 * sizeof(VTupleMoveData)); + num_vtmove = 0; + free_vtmove = 100; + /* - * Check if all items in chain can be moved + * Now, walk backwards up the chain (towards older tuples) + * and check if all items in chain can be moved. We record + * all the moves that need to be made in the vtmove array. */ for (;;) { Buffer Pbuf; Page Ppage; ItemId Pitemid; - HeapTupleData Ptp; + HeapTupleHeader PTdata; VTupleLinkData vtld, *vtlp; + /* Identify a target page to move this tuple to */ if (to_vacpage == NULL || !enough_space(to_vacpage, tlen)) { @@ -1802,18 +1817,17 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, /* this can't happen since we saw tuple earlier: */ if (!ItemIdIsUsed(Pitemid)) elog(ERROR, "Parent itemid marked as unused"); - Ptp.t_datamcxt = NULL; - Ptp.t_data = (HeapTupleHeader) PageGetItem(Ppage, Pitemid); + PTdata = (HeapTupleHeader) PageGetItem(Ppage, Pitemid); /* ctid should not have changed since we saved it */ Assert(ItemPointerEquals(&(vtld.new_tid), - &(Ptp.t_data->t_ctid))); + &(PTdata->t_ctid))); /* - * Read above about cases when !ItemIdIsUsed(Citemid) + * Read above about cases when !ItemIdIsUsed(nextItemid) * (child item is removed)... Due to the fact that at * the moment we don't remove unuseful part of - * update-chain, it's possible to get too old parent + * update-chain, it's possible to get non-matching parent * row here. Like as in the case which caused this * problem, we stop shrinking here. I could try to * find real parent row but want not to do it because @@ -1821,7 +1835,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, * and we are too close to 6.5 release. - vadim * 06/11/99 */ - if (!(TransactionIdEquals(HeapTupleHeaderGetXmax(Ptp.t_data), + if (!(TransactionIdEquals(HeapTupleHeaderGetXmax(PTdata), HeapTupleHeaderGetXmin(tp.t_data)))) { ReleaseBuffer(Pbuf); @@ -1829,8 +1843,8 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, chain_move_failed = true; break; /* out of check-all-items loop */ } - tp.t_datamcxt = Ptp.t_datamcxt; - tp.t_data = Ptp.t_data; + tp.t_datamcxt = NULL; + tp.t_data = PTdata; tlen = tp.t_len = ItemIdGetLength(Pitemid); if (freeCbuf) ReleaseBuffer(Cbuf); @@ -1865,6 +1879,8 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, for (ti = 0; ti < num_vtmove; ti++) { VacPage destvacpage = vtmove[ti].vacpage; + Page Cpage; + ItemId Citemid; /* Get page to move from */ tuple.t_self = vtmove[ti].tid; @@ -1953,16 +1969,27 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, InvalidOffsetNumber, LP_USED); if (newoff == InvalidOffsetNumber) - { elog(PANIC, "moving chain: failed to add item with len = %lu to page %u", (unsigned long) tuple_len, destvacpage->blkno); - } newitemid = PageGetItemId(ToPage, newoff); + /* drop temporary copy, and point to the version on the dest page */ pfree(newtup.t_data); newtup.t_datamcxt = NULL; newtup.t_data = (HeapTupleHeader) PageGetItem(ToPage, newitemid); + ItemPointerSet(&(newtup.t_self), destvacpage->blkno, newoff); + /* + * Set new tuple's t_ctid pointing to itself for last + * tuple in chain, and to next tuple in chain + * otherwise. + */ + if (!ItemPointerIsValid(&Ctid)) + newtup.t_data->t_ctid = newtup.t_self; + else + newtup.t_data->t_ctid = Ctid; + Ctid = newtup.t_self; + /* XLOG stuff */ if (!onerel->rd_istemp) { @@ -1992,17 +2019,6 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, if (destvacpage->blkno > last_move_dest_block) last_move_dest_block = destvacpage->blkno; - /* - * Set new tuple's t_ctid pointing to itself for last - * tuple in chain, and to next tuple in chain - * otherwise. - */ - if (!ItemPointerIsValid(&Ctid)) - newtup.t_data->t_ctid = newtup.t_self; - else - newtup.t_data->t_ctid = Ctid; - Ctid = newtup.t_self; - num_moved++; /* diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 622cfa9371..b6684579f1 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -27,7 +27,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/executor/execMain.c,v 1.180.2.2 2003/03/27 14:33:21 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/executor/execMain.c,v 1.180.2.3 2005/08/26 20:07:17 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1012,8 +1012,10 @@ lnext: ; foreach(l, estate->es_rowMark) { execRowMark *erm = lfirst(l); - Buffer buffer; HeapTupleData tuple; + Buffer buffer; + ItemPointerData update_ctid; + TransactionId update_xmax; TupleTableSlot *newSlot; int test; @@ -1032,6 +1034,7 @@ lnext: ; tuple.t_self = *((ItemPointer) DatumGetPointer(datum)); test = heap_mark4update(erm->relation, &tuple, &buffer, + &update_ctid, &update_xmax, estate->es_snapshot->curcid); ReleaseBuffer(buffer); switch (test) @@ -1046,11 +1049,15 @@ lnext: ; case HeapTupleUpdated: if (XactIsoLevel == XACT_SERIALIZABLE) elog(ERROR, "Can't serialize access due to concurrent update"); - if (!(ItemPointerEquals(&(tuple.t_self), - (ItemPointer) DatumGetPointer(datum)))) + if (!ItemPointerEquals(&update_ctid, + &tuple.t_self)) { - newSlot = EvalPlanQual(estate, erm->rti, &(tuple.t_self)); - if (!(TupIsNull(newSlot))) + /* updated, so look at updated version */ + newSlot = EvalPlanQual(estate, + erm->rti, + &update_ctid, + update_xmax); + if (!TupIsNull(newSlot)) { slot = newSlot; estate->es_useEvalPlan = true; @@ -1280,8 +1287,9 @@ ExecDelete(TupleTableSlot *slot, { ResultRelInfo *resultRelInfo; Relation resultRelationDesc; - ItemPointerData ctid; int result; + ItemPointerData update_ctid; + TransactionId update_xmax; /* * get information on the (current) result relation @@ -1307,7 +1315,7 @@ ExecDelete(TupleTableSlot *slot, */ ldelete:; result = heap_delete(resultRelationDesc, tupleid, - &ctid, + &update_ctid, &update_xmax, estate->es_snapshot->curcid); switch (result) { @@ -1321,14 +1329,17 @@ ldelete:; case HeapTupleUpdated: if (XactIsoLevel == XACT_SERIALIZABLE) elog(ERROR, "Can't serialize access due to concurrent update"); - else if (!(ItemPointerEquals(tupleid, &ctid))) + else if (!ItemPointerEquals(tupleid, &update_ctid)) { - TupleTableSlot *epqslot = EvalPlanQual(estate, - resultRelInfo->ri_RangeTableIndex, &ctid); + TupleTableSlot *epqslot; + epqslot = EvalPlanQual(estate, + resultRelInfo->ri_RangeTableIndex, + &update_ctid, + update_xmax); if (!TupIsNull(epqslot)) { - *tupleid = ctid; + *tupleid = update_ctid; goto ldelete; } } @@ -1376,8 +1387,9 @@ ExecUpdate(TupleTableSlot *slot, HeapTuple tuple; ResultRelInfo *resultRelInfo; Relation resultRelationDesc; - ItemPointerData ctid; int result; + ItemPointerData update_ctid; + TransactionId update_xmax; int numIndices; /* @@ -1443,7 +1455,7 @@ lreplace:; * replace the heap tuple */ result = heap_update(resultRelationDesc, tupleid, tuple, - &ctid, + &update_ctid, &update_xmax, estate->es_snapshot->curcid); switch (result) { @@ -1457,14 +1469,17 @@ lreplace:; case HeapTupleUpdated: if (XactIsoLevel == XACT_SERIALIZABLE) elog(ERROR, "Can't serialize access due to concurrent update"); - else if (!(ItemPointerEquals(tupleid, &ctid))) + else if (!(ItemPointerEquals(tupleid, &update_ctid))) { - TupleTableSlot *epqslot = EvalPlanQual(estate, - resultRelInfo->ri_RangeTableIndex, &ctid); + TupleTableSlot *epqslot; + epqslot = EvalPlanQual(estate, + resultRelInfo->ri_RangeTableIndex, + &update_ctid, + update_xmax); if (!TupIsNull(epqslot)) { - *tupleid = ctid; + *tupleid = update_ctid; tuple = ExecRemoveJunk(estate->es_junkFilter, epqslot); slot = ExecStoreTuple(tuple, estate->es_junkFilter->jf_resultSlot, @@ -1606,9 +1621,21 @@ ExecConstraints(const char *caller, ResultRelInfo *resultRelInfo, * under READ COMMITTED rules. * * See backend/executor/README for some info about how this works. + * + * estate - executor state data + * rti - rangetable index of table containing tuple + * *tid - t_ctid from the outdated tuple (ie, next updated version) + * priorXmax - t_xmax from the outdated tuple + * + * *tid is also an output parameter: it's modified to hold the TID of the + * latest version of the tuple (note this may be changed even on failure) + * + * Returns a slot containing the new candidate update/delete tuple, or + * NULL if we determine we shouldn't process the row. */ TupleTableSlot * -EvalPlanQual(EState *estate, Index rti, ItemPointer tid) +EvalPlanQual(EState *estate, Index rti, + ItemPointer tid, TransactionId priorXmax) { evalPlanQual *epq; EState *epqstate; @@ -1653,10 +1680,24 @@ EvalPlanQual(EState *estate, Index rti, ItemPointer tid) { Buffer buffer; - if (heap_fetch(relation, SnapshotDirty, &tuple, &buffer, false, NULL)) + if (heap_fetch(relation, SnapshotDirty, &tuple, &buffer, true, NULL)) { - TransactionId xwait = SnapshotDirty->xmax; + /* + * If xmin isn't what we're expecting, the slot must have been + * recycled and reused for an unrelated tuple. This implies + * that the latest version of the row was deleted, so we need + * do nothing. (Should be safe to examine xmin without getting + * buffer's content lock, since xmin never changes in an existing + * tuple.) + */ + if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple.t_data), + priorXmax)) + { + ReleaseBuffer(buffer); + return NULL; + } + /* otherwise xmin should not be dirty... */ if (TransactionIdIsValid(SnapshotDirty->xmin)) elog(ERROR, "EvalPlanQual: t_xmin is uncommitted ?!"); @@ -1664,11 +1705,11 @@ EvalPlanQual(EState *estate, Index rti, ItemPointer tid) * If tuple is being updated by other transaction then we have * to wait for its commit/abort. */ - if (TransactionIdIsValid(xwait)) + if (TransactionIdIsValid(SnapshotDirty->xmax)) { ReleaseBuffer(buffer); - XactLockTableWait(xwait); - continue; + XactLockTableWait(SnapshotDirty->xmax); + continue; /* loop back to repeat heap_fetch */ } /* @@ -1680,22 +1721,50 @@ EvalPlanQual(EState *estate, Index rti, ItemPointer tid) } /* - * Oops! Invalid tuple. Have to check is it updated or deleted. - * Note that it's possible to get invalid SnapshotDirty->tid if - * tuple updated by this transaction. Have we to check this ? + * If the referenced slot was actually empty, the latest version + * of the row must have been deleted, so we need do nothing. + */ + if (tuple.t_data == NULL) + { + ReleaseBuffer(buffer); + return NULL; + } + + /* + * As above, if xmin isn't what we're expecting, do nothing. */ - if (ItemPointerIsValid(&(SnapshotDirty->tid)) && - !(ItemPointerEquals(&(tuple.t_self), &(SnapshotDirty->tid)))) + if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple.t_data), + priorXmax)) { - /* updated, so look at the updated copy */ - tuple.t_self = SnapshotDirty->tid; - continue; + ReleaseBuffer(buffer); + return NULL; } /* - * Deleted or updated by this transaction; forget it. + * If we get here, the tuple was found but failed SnapshotDirty. + * Assuming the xmin is either a committed xact or our own xact + * (as it certainly should be if we're trying to modify the tuple), + * this must mean that the row was updated or deleted by either + * a committed xact or our own xact. If it was deleted, we can + * ignore it; if it was updated then chain up to the next version + * and repeat the whole test. + * + * As above, it should be safe to examine xmax and t_ctid without + * the buffer content lock, because they can't be changing. */ - return NULL; + if (ItemPointerEquals(&tuple.t_self, &tuple.t_data->t_ctid)) + { + /* deleted, so forget about it */ + ReleaseBuffer(buffer); + return NULL; + } + + /* updated, so look at the updated row */ + tuple.t_self = tuple.t_data->t_ctid; + /* updated row should have xmin matching this xmax */ + priorXmax = HeapTupleHeaderGetXmax(tuple.t_data); + ReleaseBuffer(buffer); + /* loop back to fetch next in chain */ } /* diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c index fb9323d9e8..25e7af0872 100644 --- a/src/backend/utils/time/tqual.c +++ b/src/backend/utils/time/tqual.c @@ -30,7 +30,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/utils/time/tqual.c,v 1.61.2.1 2005/05/07 21:23:24 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/utils/time/tqual.c,v 1.61.2.2 2005/08/26 20:07:17 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -931,10 +931,13 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin) HeapTupleHeaderGetXmax(tuple))) { /* - * inserter also deleted it, so it was never visible to anyone - * else + * Inserter also deleted it, so it was never visible to anyone + * else. However, we can only remove it early if it's not an + * updated tuple; else its parent tuple is linking to it via t_ctid, + * and this tuple mustn't go away before the parent does. */ - return HEAPTUPLE_DEAD; + if (!(tuple->t_infomask & HEAP_UPDATED)) + return HEAPTUPLE_DEAD; } if (!TransactionIdPrecedes(HeapTupleHeaderGetXmax(tuple), OldestXmin)) diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 61c9526808..9241afff8a 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $Id: heapam.h,v 1.79 2002/09/04 20:31:36 momjian Exp $ + * $Id: heapam.h,v 1.79.2.1 2005/08/26 20:07:17 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -156,17 +156,21 @@ extern bool heap_fetch(Relation relation, Snapshot snapshot, HeapTuple tuple, Buffer *userbuf, bool keep_buf, PgStat_Info *pgstat_info); -extern ItemPointer heap_get_latest_tid(Relation relation, Snapshot snapshot, +extern void heap_get_latest_tid(Relation relation, Snapshot snapshot, ItemPointer tid); extern void setLastTid(const ItemPointer tid); extern Oid heap_insert(Relation relation, HeapTuple tup, CommandId cid); -extern int heap_delete(Relation relation, ItemPointer tid, ItemPointer ctid, - CommandId cid); -extern int heap_update(Relation relation, ItemPointer otid, HeapTuple tup, - ItemPointer ctid, CommandId cid); -extern int heap_mark4update(Relation relation, HeapTuple tup, - Buffer *userbuf, CommandId cid); +extern int heap_delete(Relation relation, ItemPointer tid, + ItemPointer ctid, TransactionId *update_xmax, + CommandId cid); +extern int heap_update(Relation relation, ItemPointer otid, + HeapTuple newtup, + ItemPointer ctid, TransactionId *update_xmax, + CommandId cid); +extern int heap_mark4update(Relation relation, HeapTuple tuple, + Buffer *buffer, ItemPointer ctid, + TransactionId *update_xmax, CommandId cid); extern Oid simple_heap_insert(Relation relation, HeapTuple tup); extern void simple_heap_delete(Relation relation, ItemPointer tid); diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index a12c31bff7..734edc1ced 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $Id: executor.h,v 1.78 2002/09/04 20:31:42 momjian Exp $ + * $Id: executor.h,v 1.78.2.1 2005/08/26 20:07:17 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -55,7 +55,7 @@ extern void ExecutorEnd(QueryDesc *queryDesc, EState *estate); extern void ExecConstraints(const char *caller, ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate); extern TupleTableSlot *EvalPlanQual(EState *estate, Index rti, - ItemPointer tid); + ItemPointer tid, TransactionId priorXmax); /* * prototypes from functions in execProcnode.c