/*-------------------------------------------------------------------------
 *
 * heap access method code
 *
 * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.189 2005/04/30 19:03:32 tgl Exp $
 *
 * INTERFACE ROUTINES
 *      relation_open   - open any relation by relation OID
 *      relation_openrv - open any relation specified by a RangeVar
 *      relation_close  - close any relation
 *      heap_open       - open a heap relation by relation OID
 *      heap_openrv     - open a heap relation specified by a RangeVar
 *      heap_close      - (now just a macro for relation_close)
 *      heap_beginscan  - begin relation scan
 *      heap_rescan     - restart a relation scan
 *      heap_endscan    - end relation scan
 *      heap_getnext    - retrieve next tuple in scan
 *      heap_fetch      - retrieve tuple with tid
 *      heap_insert     - insert tuple into a relation
 *      heap_delete     - delete a tuple from a relation
 *      heap_update     - replace a tuple in a relation with another tuple
 *      heap_markpos    - mark scan position
 *      heap_restrpos   - restore position to marked location
 *
 * NOTES
 *    This file contains the heap_ routines which implement
 *    the POSTGRES heap access method used for all POSTGRES
 *    relations.
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/heapam.h"
#include "access/hio.h"
#include "access/multixact.h"
#include "access/tuptoaster.h"
#include "access/valid.h"
#include "access/xlogutils.h"
#include "catalog/catalog.h"
#include "catalog/namespace.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/sinval.h"
#include "utils/inval.h"
#include "utils/relcache.h"


static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
                ItemPointerData from, Buffer newbuf, HeapTuple newtup,
                bool move);
/* ----------------------------------------------------------------
 *                       heap support routines
 * ----------------------------------------------------------------
 */

/* ----------------
 *      initscan - scan code common to heap_beginscan and heap_rescan
 * ----------------
 */
static void
initscan(HeapScanDesc scan, ScanKey key)
{
    /*
     * Determine the number of blocks we have to scan.
     *
     * It is sufficient to do this once at scan start, since any tuples added
     * while the scan is in progress will be invisible to my transaction
     * anyway.
     */
    scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd);

    scan->rs_ctup.t_datamcxt = NULL;
    scan->rs_ctup.t_data = NULL;
    scan->rs_cbuf = InvalidBuffer;

    /* we don't have a marked position... */
    ItemPointerSetInvalid(&(scan->rs_mctid));

    /*
     * copy the scan key, if appropriate
     */
    if (key != NULL)
        memcpy(scan->rs_key, key, scan->rs_nkeys * sizeof(ScanKeyData));
}
/* ----------------
 *      heapgettup - fetch next heap tuple
 *
 *      routine used by heap_getnext() which does most of the
 *      real work in scanning tuples.
 *
 *      The passed-in *buffer must be either InvalidBuffer or the pinned
 *      current page of the scan.  If we have to move to another page,
 *      we will unpin this buffer (if valid).  On return, *buffer is either
 *      InvalidBuffer or the ID of a pinned buffer.
 * ----------------
 */
static void
heapgettup(Relation relation,
           int dir,
           HeapTuple tuple,
           Buffer *buffer,
           Snapshot snapshot,
           int nkeys,
           ScanKey key,
           BlockNumber pages)
{
    ItemId      lpp;
    Page        dp;
    BlockNumber page;
    int         lines;
    OffsetNumber lineoff;
    int         linesleft;
    ItemPointer tid;

    tid = (tuple->t_data == NULL) ? NULL : &(tuple->t_self);

    /*
     * debugging stuff
     *
     * check validity of arguments, here and for other functions too.
     * Note: no locking manipulations needed -- this is a local function.
     */
#ifdef HEAPDEBUGALL
    if (ItemPointerIsValid(tid))
        elog(DEBUG2, "heapgettup(%s, tid=0x%x[%d,%d], dir=%d, ...)",
             RelationGetRelationName(relation), tid, tid->ip_blkid,
             tid->ip_posid, dir);
    else
        elog(DEBUG2, "heapgettup(%s, tid=0x%x, dir=%d, ...)",
             RelationGetRelationName(relation), tid, dir);
    elog(DEBUG2, "heapgettup(..., b=0x%x, nkeys=%d, key=0x%x", buffer, nkeys, key);

    elog(DEBUG2, "heapgettup: relation(%c)=`%s', %p",
         relation->rd_rel->relkind, RelationGetRelationName(relation),
         snapshot);
#endif   /* HEAPDEBUGALL */
    if (!ItemPointerIsValid(tid))
    {
        Assert(!PointerIsValid(tid));
        tid = NULL;
    }

    tuple->t_tableOid = relation->rd_id;

    /*
     * return null immediately if relation is empty
     */
    if (pages == 0)
    {
        if (BufferIsValid(*buffer))
            ReleaseBuffer(*buffer);
        *buffer = InvalidBuffer;
        tuple->t_datamcxt = NULL;
        tuple->t_data = NULL;
        return;
    }
    /*
     * calculate next starting lineoff, given scan direction
     */
    if (dir == 0)
    {
        /*
         * ``no movement'' scan direction: refetch same tuple
         */
        if (tid == NULL)
        {
            if (BufferIsValid(*buffer))
                ReleaseBuffer(*buffer);
            *buffer = InvalidBuffer;
            tuple->t_datamcxt = NULL;
            tuple->t_data = NULL;
            return;
        }

        *buffer = ReleaseAndReadBuffer(*buffer,
                                       relation,
                                       ItemPointerGetBlockNumber(tid));

        LockBuffer(*buffer, BUFFER_LOCK_SHARE);

        dp = (Page) BufferGetPage(*buffer);
        lineoff = ItemPointerGetOffsetNumber(tid);
        lpp = PageGetItemId(dp, lineoff);

        tuple->t_datamcxt = NULL;
        tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
        tuple->t_len = ItemIdGetLength(lpp);
        LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);

        return;
    }
    else if (dir < 0)
    {
        /*
         * reverse scan direction
         */
        if (tid == NULL)
            page = pages - 1;   /* final page */
        else
            page = ItemPointerGetBlockNumber(tid);      /* current page */

        Assert(page < pages);

        *buffer = ReleaseAndReadBuffer(*buffer,
                                       relation,
                                       page);

        LockBuffer(*buffer, BUFFER_LOCK_SHARE);

        dp = (Page) BufferGetPage(*buffer);
        lines = PageGetMaxOffsetNumber(dp);
        if (tid == NULL)
            lineoff = lines;    /* final offnum */
        else
            lineoff =           /* previous offnum */
                OffsetNumberPrev(ItemPointerGetOffsetNumber(tid));
        /* page and lineoff now reference the physically previous tid */
    }
    else
    {
        /*
         * forward scan direction
         */
        if (tid == NULL)
        {
            page = 0;           /* first page */
            lineoff = FirstOffsetNumber;        /* first offnum */
        }
        else
        {
            page = ItemPointerGetBlockNumber(tid);      /* current page */
            lineoff =           /* next offnum */
                OffsetNumberNext(ItemPointerGetOffsetNumber(tid));
        }

        Assert(page < pages);

        *buffer = ReleaseAndReadBuffer(*buffer,
                                       relation,
                                       page);

        LockBuffer(*buffer, BUFFER_LOCK_SHARE);

        dp = (Page) BufferGetPage(*buffer);
        lines = PageGetMaxOffsetNumber(dp);
        /* page and lineoff now reference the physically next tid */
    }

    /* 'dir' is now non-zero */
    /*
     * calculate line pointer and number of remaining items to check on
     * this page.
     */
    lpp = PageGetItemId(dp, lineoff);
    if (dir < 0)
        linesleft = lineoff - 1;
    else
        linesleft = lines - lineoff;

    /*
     * advance the scan until we find a qualifying tuple or run out of
     * stuff to scan
     */
    for (;;)
    {
        while (linesleft >= 0)
        {
            if (ItemIdIsUsed(lpp))
            {
                bool        valid;

                tuple->t_datamcxt = NULL;
                tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
                tuple->t_len = ItemIdGetLength(lpp);
                ItemPointerSet(&(tuple->t_self), page, lineoff);

                /*
                 * if current tuple qualifies, return it.
                 */
                HeapTupleSatisfies(tuple, relation, *buffer, (PageHeader) dp,
                                   snapshot, nkeys, key, valid);
                if (valid)
                {
                    LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
                    return;
                }
            }

            /*
             * otherwise move to the next item on the page
             */
            --linesleft;
            if (dir < 0)
            {
                --lpp;          /* move back in this page's ItemId array */
                --lineoff;
            }
            else
            {
                ++lpp;          /* move forward in this page's ItemId array */
                ++lineoff;
            }
        }

        /*
         * if we get here, it means we've exhausted the items on this page
         * and it's time to move to the next.
         */
        LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
        /*
         * return NULL if we've exhausted all the pages
         */
        if ((dir < 0) ? (page == 0) : (page + 1 >= pages))
        {
            if (BufferIsValid(*buffer))
                ReleaseBuffer(*buffer);
            *buffer = InvalidBuffer;
            tuple->t_datamcxt = NULL;
            tuple->t_data = NULL;
            return;
        }

        page = (dir < 0) ? (page - 1) : (page + 1);

        Assert(page < pages);

        *buffer = ReleaseAndReadBuffer(*buffer,
                                       relation,
                                       page);

        LockBuffer(*buffer, BUFFER_LOCK_SHARE);
        dp = (Page) BufferGetPage(*buffer);
        lines = PageGetMaxOffsetNumber((Page) dp);
        linesleft = lines - 1;
        if (dir < 0)
        {
            lineoff = lines;    /* final offnum */
            lpp = PageGetItemId(dp, lines);
        }
        else
        {
            lineoff = FirstOffsetNumber;        /* first offnum */
            lpp = PageGetItemId(dp, FirstOffsetNumber);
        }
    }
}
#if defined(DISABLE_COMPLEX_MACRO)
/*
 * This is formatted so oddly so that the correspondence to the macro
 * definition in access/heapam.h is maintained.
 */
Datum
fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
            bool *isnull)
{
    return (
            (attnum) > 0 ?
            (
             ((isnull) ? (*(isnull) = false) : (dummyret) NULL),
             HeapTupleNoNulls(tup) ?
             (
              (tupleDesc)->attrs[(attnum) - 1]->attcacheoff >= 0 ?
              fetchatt((tupleDesc)->attrs[(attnum) - 1],
                       (char *) (tup)->t_data + (tup)->t_data->t_hoff +
                       (tupleDesc)->attrs[(attnum) - 1]->attcacheoff)
              :
              nocachegetattr((tup), (attnum), (tupleDesc), (isnull))
             )
             :
             (
              att_isnull((attnum) - 1, (tup)->t_data->t_bits) ?
              (
               ((isnull) ? (*(isnull) = true) : (dummyret) NULL),
               (Datum) NULL
              )
              :
              nocachegetattr((tup), (attnum), (tupleDesc), (isnull))
             )
            )
            :
            (
             (Datum) NULL
            )
           );
}
#endif   /* defined(DISABLE_COMPLEX_MACRO) */
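/*
 * Illustrative sketch only (not part of the original file, compiled out via
 * NOT_USED): how a caller typically extracts one attribute from a fetched
 * tuple.  The heap_getattr() macro falls through to fastgetattr() for
 * ordinary user attributes; "rel", "tup" and "attnum" are hypothetical
 * caller-supplied values.
 */
#ifdef NOT_USED
static Datum
example_get_attribute(Relation rel, HeapTuple tup, int attnum, bool *isnull)
{
    /* the tuple descriptor comes from the relation; *isnull reports SQL NULL */
    return heap_getattr(tup, attnum, RelationGetDescr(rel), isnull);
}
#endif   /* NOT_USED */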
/* ----------------------------------------------------------------
 *                   heap access method interface
 * ----------------------------------------------------------------
 */

/* ----------------
 *      relation_open - open any relation by relation OID
 *
 *      If lockmode is not "NoLock", the specified kind of lock is
 *      obtained on the relation.  (Generally, NoLock should only be
 *      used if the caller knows it has some appropriate lock on the
 *      relation already.)
 *
 *      An error is raised if the relation does not exist.
 *
 *      NB: a "relation" is anything with a pg_class entry.  The caller is
 *      expected to check whether the relkind is something it can handle.
 * ----------------
 */
Relation
relation_open(Oid relationId, LOCKMODE lockmode)
{
    Relation    r;

    Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);

    /* The relcache does all the real work... */
    r = RelationIdGetRelation(relationId);

    if (!RelationIsValid(r))
        elog(ERROR, "could not open relation with OID %u", relationId);

    if (lockmode != NoLock)
        LockRelation(r, lockmode);

    return r;
}
/* ----------------
 *      conditional_relation_open - open with option not to wait
 *
 *      As above, but if nowait is true, then throw an error rather than
 *      waiting when the lock is not immediately obtainable.
 * ----------------
 */
Relation
conditional_relation_open(Oid relationId, LOCKMODE lockmode, bool nowait)
{
    Relation    r;

    Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);

    /* The relcache does all the real work... */
    r = RelationIdGetRelation(relationId);

    if (!RelationIsValid(r))
        elog(ERROR, "could not open relation with OID %u", relationId);

    if (lockmode != NoLock)
    {
        if (nowait)
        {
            if (!ConditionalLockRelation(r, lockmode))
                ereport(ERROR,
                        (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
                         errmsg("could not obtain lock on relation \"%s\"",
                                RelationGetRelationName(r))));
        }
        else
            LockRelation(r, lockmode);
    }

    return r;
}
/* ----------------
 *      relation_openrv - open any relation specified by a RangeVar
 *
 *      As above, but the relation is specified by a RangeVar.
 * ----------------
 */
Relation
relation_openrv(const RangeVar *relation, LOCKMODE lockmode)
{
    Oid         relOid;

    /*
     * Check for shared-cache-inval messages before trying to open the
     * relation.  This is needed to cover the case where the name
     * identifies a rel that has been dropped and recreated since the
     * start of our transaction: if we don't flush the old syscache entry
     * then we'll latch onto that entry and suffer an error when we do
     * LockRelation.  Note that relation_open does not need to do this,
     * since a relation's OID never changes.
     *
     * We skip this if asked for NoLock, on the assumption that the caller
     * has already ensured some appropriate lock is held.
     */
    if (lockmode != NoLock)
        AcceptInvalidationMessages();

    /* Look up the appropriate relation using namespace search */
    relOid = RangeVarGetRelid(relation, false);

    /* Let relation_open do the rest */
    return relation_open(relOid, lockmode);
}
/* ----------------
 *      relation_close - close any relation
 *
 *      If lockmode is not "NoLock", we first release the specified lock.
 *
 *      Note that it is often sensible to hold a lock beyond relation_close;
 *      in that case, the lock is released automatically at xact end.
 * ----------------
 */
void
relation_close(Relation relation, LOCKMODE lockmode)
{
    Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);

    if (lockmode != NoLock)
        UnlockRelation(relation, lockmode);

    /* The relcache does the real work... */
    RelationClose(relation);
}
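/*
 * Illustrative sketch only (not part of the original file, compiled out via
 * NOT_USED): the usual open/work/close pattern.  "some_relation_oid" is a
 * hypothetical caller-supplied OID.  Closing with NoLock instead would keep
 * the lock until transaction end, as the comments above describe.
 */
#ifdef NOT_USED
static void
example_open_close(Oid some_relation_oid)
{
    Relation    rel;

    rel = relation_open(some_relation_oid, AccessShareLock);
    /* ... examine rel, e.g. rel->rd_rel->relkind ... */
    relation_close(rel, AccessShareLock);
}
#endif   /* NOT_USED */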
/* ----------------
 *      heap_open - open a heap relation by relation OID
 *
 *      This is essentially relation_open plus check that the relation
 *      is not an index or special relation.  (The caller should also check
 *      that it's not a view before assuming it has storage.)
 * ----------------
 */
Relation
heap_open(Oid relationId, LOCKMODE lockmode)
{
    Relation    r;

    r = relation_open(relationId, lockmode);

    if (r->rd_rel->relkind == RELKIND_INDEX)
        ereport(ERROR,
                (errcode(ERRCODE_WRONG_OBJECT_TYPE),
                 errmsg("\"%s\" is an index",
                        RelationGetRelationName(r))));
    else if (r->rd_rel->relkind == RELKIND_SPECIAL)
        ereport(ERROR,
                (errcode(ERRCODE_WRONG_OBJECT_TYPE),
                 errmsg("\"%s\" is a special relation",
                        RelationGetRelationName(r))));
    else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
        ereport(ERROR,
                (errcode(ERRCODE_WRONG_OBJECT_TYPE),
                 errmsg("\"%s\" is a composite type",
                        RelationGetRelationName(r))));

    pgstat_initstats(&r->pgstat_info, r);

    return r;
}
/* ----------------
 *      heap_openrv - open a heap relation specified by a RangeVar
 *
 *      As above, but relation is specified by a RangeVar.
 * ----------------
 */
Relation
heap_openrv(const RangeVar *relation, LOCKMODE lockmode)
{
    Relation    r;

    r = relation_openrv(relation, lockmode);

    if (r->rd_rel->relkind == RELKIND_INDEX)
        ereport(ERROR,
                (errcode(ERRCODE_WRONG_OBJECT_TYPE),
                 errmsg("\"%s\" is an index",
                        RelationGetRelationName(r))));
    else if (r->rd_rel->relkind == RELKIND_SPECIAL)
        ereport(ERROR,
                (errcode(ERRCODE_WRONG_OBJECT_TYPE),
                 errmsg("\"%s\" is a special relation",
                        RelationGetRelationName(r))));
    else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
        ereport(ERROR,
                (errcode(ERRCODE_WRONG_OBJECT_TYPE),
                 errmsg("\"%s\" is a composite type",
                        RelationGetRelationName(r))));

    pgstat_initstats(&r->pgstat_info, r);

    return r;
}
/* ----------------
 *      heap_beginscan - begin relation scan
 * ----------------
 */
HeapScanDesc
heap_beginscan(Relation relation, Snapshot snapshot,
               int nkeys, ScanKey key)
{
    HeapScanDesc scan;

    /*
     * increment relation ref count while scanning relation
     *
     * This is just to make really sure the relcache entry won't go away
     * while the scan has a pointer to it.  Caller should be holding the
     * rel open anyway, so this is redundant in all normal scenarios...
     */
    RelationIncrementReferenceCount(relation);

    /*
     * allocate and initialize scan descriptor
     */
    scan = (HeapScanDesc) palloc(sizeof(HeapScanDescData));

    scan->rs_rd = relation;
    scan->rs_snapshot = snapshot;
    scan->rs_nkeys = nkeys;

    /*
     * we do this here instead of in initscan() because heap_rescan also
     * calls initscan() and we don't want to allocate memory again
     */
    if (nkeys > 0)
        scan->rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
    else
        scan->rs_key = NULL;

    pgstat_initstats(&scan->rs_pgstat_info, relation);

    initscan(scan, key);

    return scan;
}
/* ----------------
 *      heap_rescan - restart a relation scan
 * ----------------
 */
void
heap_rescan(HeapScanDesc scan,
            ScanKey key)
{
    /*
     * unpin scan buffers
     */
    if (BufferIsValid(scan->rs_cbuf))
        ReleaseBuffer(scan->rs_cbuf);

    /*
     * reinitialize scan descriptor
     */
    initscan(scan, key);

    pgstat_reset_heap_scan(&scan->rs_pgstat_info);
}
/* ----------------
 *      heap_endscan - end relation scan
 *
 *      See how to integrate with index scans.
 *      Check handling of reldesc caching.
 * ----------------
 */
void
heap_endscan(HeapScanDesc scan)
{
    /* Note: no locking manipulations needed */

    /*
     * unpin scan buffers
     */
    if (BufferIsValid(scan->rs_cbuf))
        ReleaseBuffer(scan->rs_cbuf);

    /*
     * decrement relation reference count and free scan descriptor storage
     */
    RelationDecrementReferenceCount(scan->rs_rd);

    if (scan->rs_key)
        pfree(scan->rs_key);

    pfree(scan);
}
/* ----------------
 *      heap_getnext - retrieve next tuple in scan
 *
 *      Fix to work with index relations.
 *      We don't return the buffer anymore, but you can get it from the
 *      returned HeapTuple.
 * ----------------
 */
#ifdef HEAPDEBUGALL
#define HEAPDEBUG_1 \
    elog(DEBUG2, "heap_getnext([%s,nkeys=%d],dir=%d) called", \
         RelationGetRelationName(scan->rs_rd), scan->rs_nkeys, (int) direction)
#define HEAPDEBUG_2 \
    elog(DEBUG2, "heap_getnext returning EOS")
#define HEAPDEBUG_3 \
    elog(DEBUG2, "heap_getnext returning tuple")
#else
#define HEAPDEBUG_1
#define HEAPDEBUG_2
#define HEAPDEBUG_3
#endif   /* !defined(HEAPDEBUGALL) */
HeapTuple
heap_getnext(HeapScanDesc scan, ScanDirection direction)
{
    /* Note: no locking manipulations needed */

    HEAPDEBUG_1;                /* heap_getnext( info ) */

    /*
     * Note: we depend here on the -1/0/1 encoding of ScanDirection.
     */
    heapgettup(scan->rs_rd,
               (int) direction,
               &(scan->rs_ctup),
               &(scan->rs_cbuf),
               scan->rs_snapshot,
               scan->rs_nkeys,
               scan->rs_key,
               scan->rs_nblocks);

    if (scan->rs_ctup.t_data == NULL && !BufferIsValid(scan->rs_cbuf))
    {
        HEAPDEBUG_2;            /* heap_getnext returning EOS */
        return NULL;
    }

    pgstat_count_heap_scan(&scan->rs_pgstat_info);

    /*
     * if we get here it means we have a new current scan tuple, so point
     * to the proper return buffer and return the tuple.
     */
    HEAPDEBUG_3;                /* heap_getnext returning tuple */

    if (scan->rs_ctup.t_data != NULL)
        pgstat_count_heap_getnext(&scan->rs_pgstat_info);

    return ((scan->rs_ctup.t_data == NULL) ? NULL : &(scan->rs_ctup));
}
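/*
 * Illustrative sketch only (not part of the original file, compiled out via
 * NOT_USED): the standard beginscan/getnext/endscan loop over a heap.
 * "process_tuple" is a hypothetical callback; everything else matches the
 * routines above.
 */
#ifdef NOT_USED
static void
example_scan_all(Relation rel, Snapshot snap,
                 void (*process_tuple) (HeapTuple))
{
    HeapScanDesc scan;
    HeapTuple   tup;

    scan = heap_beginscan(rel, snap, 0, NULL);
    while ((tup = heap_getnext(scan, ForwardScanDirection)) != NULL)
    {
        /* tup is only valid until the next heap_getnext/heap_endscan call */
        process_tuple(tup);
    }
    heap_endscan(scan);
}
#endif   /* NOT_USED */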
/* ----------------
 *      heap_fetch - retrieve tuple with given tid
 *
 * On entry, tuple->t_self is the TID to fetch.  We pin the buffer holding
 * the tuple, fill in the remaining fields of *tuple, and check the tuple
 * against the specified snapshot.
 *
 * If successful (tuple found and passes snapshot time qual), then *userbuf
 * is set to the buffer holding the tuple and TRUE is returned.  The caller
 * must unpin the buffer when done with the tuple.
 *
 * If the tuple is not found (ie, item number references a deleted slot),
 * then tuple->t_data is set to NULL and FALSE is returned.
 *
 * If the tuple is found but fails the time qual check, then FALSE is returned
 * but tuple->t_data is left pointing to the tuple.
 *
 * keep_buf determines what is done with the buffer in the FALSE-result cases.
 * When the caller specifies keep_buf = true, we retain the pin on the buffer
 * and return it in *userbuf (so the caller must eventually unpin it); when
 * keep_buf = false, the pin is released and *userbuf is set to InvalidBuffer.
 *
 * It is somewhat inconsistent that we ereport() on invalid block number but
 * return false on invalid item number.  This is historical.  The only
 * justification I can see is that the caller can relatively easily check the
 * block number for validity, but cannot check the item number without reading
 * the page itself.
 * ----------------
 */
bool
heap_fetch(Relation relation,
           Snapshot snapshot,
           HeapTuple tuple,
           Buffer *userbuf,
           bool keep_buf,
           PgStat_Info *pgstat_info)
{
    /* Assume *userbuf is undefined on entry */
    *userbuf = InvalidBuffer;
    return heap_release_fetch(relation, snapshot, tuple,
                              userbuf, keep_buf, pgstat_info);
}
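/*
 * Illustrative sketch only (not part of the original file, compiled out via
 * NOT_USED): fetching one tuple by TID per the contract described above.
 * "tid" comes from a hypothetical caller (e.g. an index lookup), and
 * SnapshotNow is just one possible snapshot choice.
 */
#ifdef NOT_USED
static bool
example_fetch_by_tid(Relation rel, ItemPointer tid)
{
    HeapTupleData tup;
    Buffer      buf;

    tup.t_self = *tid;
    if (heap_fetch(rel, SnapshotNow, &tup, &buf, false, NULL))
    {
        /* tuple passed the time qual; we now hold a pin on buf */
        /* ... examine tup.t_data ... */
        ReleaseBuffer(buf);
        return true;
    }
    /* not found, or failed the snapshot check; no pin kept (keep_buf=false) */
    return false;
}
#endif   /* NOT_USED */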
/* ----------------
 *      heap_release_fetch - retrieve tuple with given tid
 *
 * This has the same API as heap_fetch except that if *userbuf is not
 * InvalidBuffer on entry, that buffer will be released before reading
 * the new page.  This saves a separate ReleaseBuffer step and hence
 * one entry into the bufmgr when looping through multiple fetches.
 * Also, if *userbuf is the same buffer that holds the target tuple,
 * we avoid bufmgr manipulation altogether.
 * ----------------
 */
bool
heap_release_fetch(Relation relation,
                   Snapshot snapshot,
                   HeapTuple tuple,
                   Buffer *userbuf,
                   bool keep_buf,
                   PgStat_Info *pgstat_info)
{
    ItemPointer tid = &(tuple->t_self);
    ItemId      lp;
    Buffer      buffer;
    PageHeader  dp;
    OffsetNumber offnum;
    bool        valid;

    /*
     * get the buffer from the relation descriptor.  Note that this does a
     * buffer pin, and releases the old *userbuf if not InvalidBuffer.
     */
    buffer = ReleaseAndReadBuffer(*userbuf, relation,
                                  ItemPointerGetBlockNumber(tid));
    /*
     * Need share lock on buffer to examine tuple commit status.
     */
    LockBuffer(buffer, BUFFER_LOCK_SHARE);
    dp = (PageHeader) BufferGetPage(buffer);

    /*
     * We'd better check for an out-of-range offnum, in case VACUUM has run
     * since the TID was obtained.
     */
    offnum = ItemPointerGetOffsetNumber(tid);
    if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
    {
        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
        if (keep_buf)
            *userbuf = buffer;
        else
        {
            ReleaseBuffer(buffer);
            *userbuf = InvalidBuffer;
        }
        tuple->t_datamcxt = NULL;
        tuple->t_data = NULL;
        return false;
    }

    /*
     * get the item line pointer corresponding to the requested tid
     */
    lp = PageGetItemId(dp, offnum);

    /*
     * Must check for deleted tuple.
     */
    if (!ItemIdIsUsed(lp))
    {
        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
        if (keep_buf)
            *userbuf = buffer;
        else
        {
            ReleaseBuffer(buffer);
            *userbuf = InvalidBuffer;
        }
        tuple->t_datamcxt = NULL;
        tuple->t_data = NULL;
        return false;
    }
    /*
     * fill in *tuple fields
     */
    tuple->t_datamcxt = NULL;
    tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
    tuple->t_len = ItemIdGetLength(lp);
    tuple->t_tableOid = relation->rd_id;

    /*
     * check time qualification of tuple, then release lock
     */
    HeapTupleSatisfies(tuple, relation, buffer, dp,
                       snapshot, 0, NULL, valid);

    LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

    if (valid)
    {
        /*
         * All checks passed, so return the tuple as valid.  Caller is now
         * responsible for releasing the buffer.
         */
        *userbuf = buffer;

        /*
         * Count the successful fetch in *pgstat_info if given, otherwise
         * in the relation's default statistics area.
         */
        if (pgstat_info != NULL)
            pgstat_count_heap_fetch(pgstat_info);
        else
            pgstat_count_heap_fetch(&relation->pgstat_info);

        return true;
    }

    /* Tuple failed time qual, but maybe caller wants to see it anyway. */
    if (keep_buf)
        *userbuf = buffer;
    else
    {
        ReleaseBuffer(buffer);
        *userbuf = InvalidBuffer;
    }

    return false;
}
/*
 *  heap_get_latest_tid - get the latest tid of a specified tuple
 */
ItemPointer
heap_get_latest_tid(Relation relation,
                    Snapshot snapshot,
                    ItemPointer tid)
{
    ItemId      lp = NULL;
    Buffer      buffer;
    PageHeader  dp;
    OffsetNumber offnum;
    HeapTupleData tp;
    HeapTupleHeader t_data;
    ItemPointerData ctid;
    bool        invalidBlock,
                linkend,
                valid;

    /*
     * get the buffer from the relation descriptor.  Note that this does a
     * buffer pin.
     */
    buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
    LockBuffer(buffer, BUFFER_LOCK_SHARE);

    /*
     * get the item line pointer corresponding to the requested tid
     */
    dp = (PageHeader) BufferGetPage(buffer);
    offnum = ItemPointerGetOffsetNumber(tid);
    invalidBlock = true;
    if (!PageIsNew(dp))
    {
        lp = PageGetItemId(dp, offnum);
        if (ItemIdIsUsed(lp))
            invalidBlock = false;
    }
    if (invalidBlock)
    {
        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
        ReleaseBuffer(buffer);
        return NULL;
    }

    tp.t_datamcxt = NULL;
    t_data = tp.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
    tp.t_len = ItemIdGetLength(lp);
    tp.t_self = *tid;
    ctid = tp.t_data->t_ctid;

    /*
     * check time qualification of tid
     */
    HeapTupleSatisfies(&tp, relation, buffer, dp,
                       snapshot, 0, NULL, valid);

    linkend = true;
    if ((t_data->t_infomask & HEAP_XMIN_COMMITTED) != 0 &&
        !ItemPointerEquals(tid, &ctid))
        linkend = false;

    LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    ReleaseBuffer(buffer);

    if (!valid)
    {
        if (linkend)
            return NULL;
        heap_get_latest_tid(relation, snapshot, &ctid);
        *tid = ctid;
    }

    return tid;
}
/*
 *  heap_insert     - insert tuple into a heap
 *
 * The new tuple is stamped with current transaction ID and the specified
 * command ID.
 */
Oid
heap_insert(Relation relation, HeapTuple tup, CommandId cid)
{
    TransactionId xid = GetCurrentTransactionId();
    Buffer      buffer;

    if (relation->rd_rel->relhasoids)
    {
#ifdef NOT_USED
        /* this is redundant with an Assert in HeapTupleSetOid */
        Assert(tup->t_data->t_infomask & HEAP_HASOID);
#endif

        /*
         * If the object id of this tuple has already been assigned, trust
         * the caller.  There are a couple of ways this can happen.  At
         * initial db creation, the backend program sets oids for tuples.
         * When we define an index, we set the oid.  Finally, in the
         * future, we may allow users to set their own object ids in order
         * to support a persistent object store (objects need to contain
         * pointers to one another).
         */
        if (!OidIsValid(HeapTupleGetOid(tup)))
            HeapTupleSetOid(tup, newoid());
        else
            CheckMaxObjectId(HeapTupleGetOid(tup));
    }
    else
    {
        /* check there is no space for an OID */
        Assert(!(tup->t_data->t_infomask & HEAP_HASOID));
    }

    tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
    tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
    HeapTupleHeaderSetXmin(tup->t_data, xid);
    HeapTupleHeaderSetCmin(tup->t_data, cid);
    HeapTupleHeaderSetXmax(tup->t_data, 0);     /* zero out Datum fields */
    HeapTupleHeaderSetCmax(tup->t_data, 0);     /* for cleanliness */
    tup->t_tableOid = relation->rd_id;
    /*
     * If the new tuple is too big for storage or contains already toasted
     * out-of-line attributes from some other relation, invoke the toaster.
     */
    if (HeapTupleHasExternal(tup) ||
        (MAXALIGN(tup->t_len) > TOAST_TUPLE_THRESHOLD))
        heap_tuple_toast_attrs(relation, tup, NULL);

    /* Find buffer to insert this tuple into */
    buffer = RelationGetBufferForTuple(relation, tup->t_len, InvalidBuffer);

    /* NO EREPORT(ERROR) from here till changes are logged */
    START_CRIT_SECTION();

    RelationPutHeapTuple(relation, buffer, tup);

    pgstat_count_heap_insert(&relation->pgstat_info);
    /* XLOG stuff */
    if (!relation->rd_istemp)
    {
        xl_heap_insert xlrec;
        xl_heap_header xlhdr;
        XLogRecPtr  recptr;
        XLogRecData rdata[3];
        Page        page = BufferGetPage(buffer);
        uint8       info = XLOG_HEAP_INSERT;

        xlrec.target.node = relation->rd_node;
        xlrec.target.tid = tup->t_self;
        rdata[0].buffer = InvalidBuffer;
        rdata[0].data = (char *) &xlrec;
        rdata[0].len = SizeOfHeapInsert;
        rdata[0].next = &(rdata[1]);

        xlhdr.t_natts = tup->t_data->t_natts;
        xlhdr.t_infomask = tup->t_data->t_infomask;
        xlhdr.t_hoff = tup->t_data->t_hoff;

        /*
         * note we mark rdata[1] as belonging to buffer; if XLogInsert
         * decides to write the whole page to the xlog, we don't need to
         * store xl_heap_header in the xlog.
         */
        rdata[1].buffer = buffer;
        rdata[1].data = (char *) &xlhdr;
        rdata[1].len = SizeOfHeapHeader;
        rdata[1].next = &(rdata[2]);

        rdata[2].buffer = buffer;
        /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
        rdata[2].data = (char *) tup->t_data + offsetof(HeapTupleHeaderData, t_bits);
        rdata[2].len = tup->t_len - offsetof(HeapTupleHeaderData, t_bits);
        rdata[2].next = NULL;

        /*
         * If this is the single and first tuple on the page, we can reinit
         * the page instead of restoring the whole thing.  Set flag, and
         * hide buffer references from XLogInsert.
         */
        if (ItemPointerGetOffsetNumber(&(tup->t_self)) == FirstOffsetNumber &&
            PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
        {
            info |= XLOG_HEAP_INIT_PAGE;
            rdata[1].buffer = rdata[2].buffer = InvalidBuffer;
        }

        recptr = XLogInsert(RM_HEAP_ID, info, rdata);

        PageSetLSN(page, recptr);
        PageSetTLI(page, ThisTimeLineID);
    }
    else
    {
        /* No XLOG record, but still need to flag that XID exists on disk */
        MyXactMadeTempRelUpdate = true;
    }

    END_CRIT_SECTION();
    LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    WriteBuffer(buffer);

    /*
     * If tuple is cachable, mark it for invalidation from the caches in
     * case we abort.  Note it is OK to do this after WriteBuffer releases
     * the buffer, because the "tup" data structure is all in local
     * memory, not in the shared buffer.
     */
    CacheInvalidateHeapTuple(relation, tup);

    return HeapTupleGetOid(tup);
}
/*
 *  simple_heap_insert - insert a tuple
 *
 * Currently, this routine differs from heap_insert only in supplying
 * a default command ID.  But it should be used rather than using
 * heap_insert directly in most places where we are modifying system catalogs.
 */
Oid
simple_heap_insert(Relation relation, HeapTuple tup)
{
    return heap_insert(relation, tup, GetCurrentCommandId());
}
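/*
 * Illustrative sketch only (not part of the original file, compiled out via
 * NOT_USED): the usual form/insert/free sequence for a catalog-style
 * insertion.  The contents of "values" and "nulls" are hypothetical
 * caller-supplied column data (' ' = not null, 'n' = null); catalog callers
 * would also update the catalog's indexes after the insert.
 */
#ifdef NOT_USED
static Oid
example_insert_row(Relation rel, Datum *values, char *nulls)
{
    HeapTuple   tup;
    Oid         oid;

    tup = heap_formtuple(RelationGetDescr(rel), values, nulls);
    oid = simple_heap_insert(rel, tup);
    heap_freetuple(tup);
    return oid;
}
#endif   /* NOT_USED */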
/*
 *  heap_delete     - delete a tuple
 *
 * NB: do not call this directly unless you are prepared to deal with
 * concurrent-update conditions.  Use simple_heap_delete instead.
 *
 *  relation - table to be modified
 *  tid - TID of tuple to be deleted
 *  ctid - output parameter, used only for failure case (see below)
 *  cid - delete command ID to use in verifying tuple visibility
 *  crosscheck - if not InvalidSnapshot, also check tuple against this
 *  wait - true if should wait for any conflicting update to commit/abort
 *
 * Normal, successful return value is HeapTupleMayBeUpdated, which
 * actually means we did delete it.  Failure return codes are
 * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated
 * (the last only possible if wait == false).  On a failure return,
 * *ctid is set to the ctid link of the target tuple (possibly a later
 * version of the row).
 */
HTSU_Result
heap_delete(Relation relation, ItemPointer tid,
            ItemPointer ctid, CommandId cid,
            Snapshot crosscheck, bool wait)
{
    HTSU_Result result;
    TransactionId xid = GetCurrentTransactionId();
    ItemId      lp;
    HeapTupleData tp;
    PageHeader  dp;
    Buffer      buffer;
    bool        have_tuple_lock = false;

    Assert(ItemPointerIsValid(tid));

    buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
    LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

    dp = (PageHeader) BufferGetPage(buffer);
    lp = PageGetItemId(dp, ItemPointerGetOffsetNumber(tid));
    tp.t_datamcxt = NULL;
    tp.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
    tp.t_len = ItemIdGetLength(lp);
    tp.t_self = *tid;
    tp.t_tableOid = relation->rd_id;

l1:
    result = HeapTupleSatisfiesUpdate(tp.t_data, cid, buffer);

    if (result == HeapTupleInvisible)
    {
        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
        ReleaseBuffer(buffer);
        elog(ERROR, "attempted to delete invisible tuple");
    }
    else if (result == HeapTupleBeingUpdated && wait)
    {
        TransactionId xwait;
        uint16      infomask;

        /* must copy state data before unlocking buffer */
        xwait = HeapTupleHeaderGetXmax(tp.t_data);
        infomask = tp.t_data->t_infomask;

        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

        /*
         * Acquire tuple lock to establish our priority for the tuple
         * (see heap_lock_tuple).  LockTuple will release us when we are
         * next-in-line for the tuple.
         *
         * If we are forced to "start over" below, we keep the tuple lock;
         * this arranges that we stay at the head of the line while
         * rechecking tuple state.
         */
        if (!have_tuple_lock)
        {
            LockTuple(relation, &(tp.t_self), ExclusiveLock);
            have_tuple_lock = true;
        }

        /*
         * Sleep until concurrent transaction ends.  Note that we don't care
         * if the locker has an exclusive or shared lock, because we need
         * exclusive.
         */
        if (infomask & HEAP_XMAX_IS_MULTI)
        {
            /* wait for multixact */
            MultiXactIdWait((MultiXactId) xwait);
            LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

            /*
             * If xwait had just locked the tuple then some other xact could
             * update this tuple before we get to this point.  Check for xmax
             * change, and start over if so.
             */
            if (!(tp.t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
                !TransactionIdEquals(HeapTupleHeaderGetXmax(tp.t_data),
                                     xwait))
                goto l1;

            /*
             * You might think the multixact is necessarily done here, but
             * not so: it could have surviving members, namely our own xact
             * or other subxacts of this backend.  It is legal for us to
             * delete the tuple in either case, however (the latter case is
             * essentially a situation of upgrading our former shared lock
             * to exclusive).  We don't bother changing the on-disk hint bits
             * since we are about to overwrite the xmax altogether.
             */
        }
        else
        {
            /* wait for regular transaction to end */
            XactLockTableWait(xwait);
            LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

            /*
             * xwait is done, but if xwait had just locked the tuple then some
             * other xact could update this tuple before we get to this point.
             * Check for xmax change, and start over if so.
             */
            if ((tp.t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
                !TransactionIdEquals(HeapTupleHeaderGetXmax(tp.t_data),
                                     xwait))
                goto l1;
        }
        /* Otherwise we can mark it committed or aborted */
        if (!(tp.t_data->t_infomask & (HEAP_XMAX_COMMITTED |
                                       HEAP_XMAX_INVALID)))
        {
            if (TransactionIdDidCommit(xwait))
                tp.t_data->t_infomask |= HEAP_XMAX_COMMITTED;
            else
                tp.t_data->t_infomask |= HEAP_XMAX_INVALID;
            SetBufferCommitInfoNeedsSave(buffer);
        }

        /*
         * We may overwrite if previous xmax aborted, or if it committed
         * but only locked the tuple without updating it.
         */
        if (tp.t_data->t_infomask & (HEAP_XMAX_INVALID |
                                     HEAP_IS_LOCKED))
            result = HeapTupleMayBeUpdated;
        else
            result = HeapTupleUpdated;
    }

    if (crosscheck != InvalidSnapshot && result == HeapTupleMayBeUpdated)
    {
        /* Perform additional check for serializable RI updates */
        if (!HeapTupleSatisfiesSnapshot(tp.t_data, crosscheck, buffer))
            result = HeapTupleUpdated;
    }

    if (result != HeapTupleMayBeUpdated)
    {
        Assert(result == HeapTupleSelfUpdated ||
               result == HeapTupleUpdated ||
               result == HeapTupleBeingUpdated);
        *ctid = tp.t_data->t_ctid;
        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
        ReleaseBuffer(buffer);
        if (have_tuple_lock)
            UnlockTuple(relation, &(tp.t_self), ExclusiveLock);
        return result;
    }
    START_CRIT_SECTION();

    /* store transaction information of xact deleting the tuple */
    tp.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
                               HEAP_XMAX_INVALID |
                               HEAP_XMAX_IS_MULTI |
                               HEAP_IS_LOCKED |
                               HEAP_MOVED);
    HeapTupleHeaderSetXmax(tp.t_data, xid);
    HeapTupleHeaderSetCmax(tp.t_data, cid);
    /* Make sure there is no forward chain link in t_ctid */
    tp.t_data->t_ctid = tp.t_self;

    /* XLOG stuff */
    if (!relation->rd_istemp)
    {
        xl_heap_delete xlrec;
        XLogRecPtr  recptr;
        XLogRecData rdata[2];

        xlrec.target.node = relation->rd_node;
        xlrec.target.tid = tp.t_self;
        rdata[0].buffer = InvalidBuffer;
        rdata[0].data = (char *) &xlrec;
        rdata[0].len = SizeOfHeapDelete;
        rdata[0].next = &(rdata[1]);

        rdata[1].buffer = buffer;
        rdata[1].data = NULL;
        rdata[1].len = 0;
        rdata[1].next = NULL;

        recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE, rdata);

        PageSetLSN(dp, recptr);
        PageSetTLI(dp, ThisTimeLineID);
    }
    else
    {
        /* No XLOG record, but still need to flag that XID exists on disk */
        MyXactMadeTempRelUpdate = true;
    }

    END_CRIT_SECTION();
    LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

    /*
     * If the tuple has toasted out-of-line attributes, we need to delete
     * those items too.  We have to do this before WriteBuffer because we
     * need to look at the contents of the tuple, but it's OK to release
     * the context lock on the buffer first.
     */
    if (HeapTupleHasExternal(&tp))
        heap_tuple_toast_attrs(relation, NULL, &tp);

    pgstat_count_heap_delete(&relation->pgstat_info);

    /*
     * Mark tuple for invalidation from system caches at next command
     * boundary.  We have to do this before WriteBuffer because we need to
     * look at the contents of the tuple, so we need to hold our refcount
     * on the buffer.
     */
    CacheInvalidateHeapTuple(relation, &tp);

    WriteBuffer(buffer);

    /*
     * Release the lmgr tuple lock, if we had it.
     */
    if (have_tuple_lock)
        UnlockTuple(relation, &(tp.t_self), ExclusiveLock);

    return HeapTupleMayBeUpdated;
}
/*
 *  simple_heap_delete - delete a tuple
 *
 * This routine may be used to delete a tuple when concurrent updates of
 * the target tuple are not expected (for example, because we have a lock
 * on the relation associated with the tuple).  Any failure is reported
 * via ereport().
 */
void
simple_heap_delete(Relation relation, ItemPointer tid)
{
    ItemPointerData ctid;
    HTSU_Result result;

    result = heap_delete(relation, tid,
                         &ctid,
                         GetCurrentCommandId(), InvalidSnapshot,
                         true /* wait for commit */ );
    switch (result)
    {
        case HeapTupleSelfUpdated:
            /* Tuple was already updated in current command? */
            elog(ERROR, "tuple already updated by self");
            break;

        case HeapTupleMayBeUpdated:
            /* done successfully */
            break;

        case HeapTupleUpdated:
            elog(ERROR, "tuple concurrently updated");
            break;

        default:
            elog(ERROR, "unrecognized heap_delete status: %u", result);
            break;
    }
}
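/*
 * Illustrative sketch only (not part of the original file, compiled out via
 * NOT_USED): deleting a row we just located, relying on simple_heap_delete
 * to ereport on any concurrent update.  "tup" is a hypothetical tuple
 * returned by a scan over "rel".
 */
#ifdef NOT_USED
static void
example_delete_row(Relation rel, HeapTuple tup)
{
    simple_heap_delete(rel, &tup->t_self);
}
#endif   /* NOT_USED */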
/*
 *  heap_update - replace a tuple
 *
 * NB: do not call this directly unless you are prepared to deal with
 * concurrent-update conditions.  Use simple_heap_update instead.
 *
 *  relation - table to be modified
 *  otid - TID of old tuple to be replaced
 *  newtup - newly constructed tuple data to store
 *  ctid - output parameter, used only for failure case (see below)
 *  cid - update command ID to use in verifying old tuple visibility
 *  crosscheck - if not InvalidSnapshot, also check old tuple against this
 *  wait - true if should wait for any conflicting update to commit/abort
 *
 * Normal, successful return value is HeapTupleMayBeUpdated, which
 * actually means we *did* update it.  Failure return codes are
 * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated
 * (the last only possible if wait == false).  On a failure return,
 * *ctid is set to the ctid link of the old tuple (possibly a later
 * version of the row).
 * On success, newtup->t_self is set to the TID where the new tuple
 * was inserted.
 */
HTSU_Result
heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
            ItemPointer ctid, CommandId cid,
            Snapshot crosscheck, bool wait)
{
    HTSU_Result result;
    TransactionId xid = GetCurrentTransactionId();
    ItemId      lp;
    HeapTupleData oldtup;
    PageHeader  dp;
    Buffer      buffer,
                newbuf;
    bool        need_toast,
                already_marked;
    Size        newtupsize,
                pagefree;
    bool        have_tuple_lock = false;

    Assert(ItemPointerIsValid(otid));

    buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(otid));
    LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

    dp = (PageHeader) BufferGetPage(buffer);
    lp = PageGetItemId(dp, ItemPointerGetOffsetNumber(otid));

    oldtup.t_datamcxt = NULL;
    oldtup.t_data = (HeapTupleHeader) PageGetItem(dp, lp);
    oldtup.t_len = ItemIdGetLength(lp);
    oldtup.t_self = *otid;

    /*
     * Note: beyond this point, use oldtup not otid to refer to old tuple.
     * otid may very well point at newtup->t_self, which we will overwrite
     * with the new tuple's location, so there's great risk of confusion
     * if we use otid anymore.
     */

l2:
    result = HeapTupleSatisfiesUpdate(oldtup.t_data, cid, buffer);

    if (result == HeapTupleInvisible)
    {
        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
        ReleaseBuffer(buffer);
        elog(ERROR, "attempted to update invisible tuple");
    }
    else if (result == HeapTupleBeingUpdated && wait)
    {
        TransactionId xwait;
        uint16      infomask;

        /* must copy state data before unlocking buffer */
        xwait = HeapTupleHeaderGetXmax(oldtup.t_data);
        infomask = oldtup.t_data->t_infomask;

        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

        /*
         * Acquire tuple lock to establish our priority for the tuple
         * (see heap_lock_tuple).  LockTuple will release us when we are
         * next-in-line for the tuple.
         *
         * If we are forced to "start over" below, we keep the tuple lock;
         * this arranges that we stay at the head of the line while
         * rechecking tuple state.
         */
        if (!have_tuple_lock)
        {
            LockTuple(relation, &(oldtup.t_self), ExclusiveLock);
            have_tuple_lock = true;
        }

        /*
         * Sleep until concurrent transaction ends.  Note that we don't care
         * if the locker has an exclusive or shared lock, because we need
         * exclusive.
         */
        if (infomask & HEAP_XMAX_IS_MULTI)
        {
            /* wait for multixact */
            MultiXactIdWait((MultiXactId) xwait);
            LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

            /*
             * If xwait had just locked the tuple then some other xact could
             * update this tuple before we get to this point.  Check for xmax
             * change, and start over if so.
             */
            if (!(oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
                !TransactionIdEquals(HeapTupleHeaderGetXmax(oldtup.t_data),
                                     xwait))
                goto l2;

            /*
             * You might think the multixact is necessarily done here, but
             * not so: it could have surviving members, namely our own xact
             * or other subxacts of this backend.  It is legal for us to
             * update the tuple in either case, however (the latter case is
             * essentially a situation of upgrading our former shared lock
             * to exclusive).  We don't bother changing the on-disk hint bits
             * since we are about to overwrite the xmax altogether.
             */
        }
        else
        {
            /* wait for regular transaction to end */
            XactLockTableWait(xwait);
            LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

            /*
             * xwait is done, but if xwait had just locked the tuple then some
             * other xact could update this tuple before we get to this point.
             * Check for xmax change, and start over if so.
             */
            if ((oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
                !TransactionIdEquals(HeapTupleHeaderGetXmax(oldtup.t_data),
                                     xwait))
                goto l2;
        }
        /* Otherwise we can mark it committed or aborted */
        if (!(oldtup.t_data->t_infomask & (HEAP_XMAX_COMMITTED |
                                           HEAP_XMAX_INVALID)))
        {
            if (TransactionIdDidCommit(xwait))
                oldtup.t_data->t_infomask |= HEAP_XMAX_COMMITTED;
            else
                oldtup.t_data->t_infomask |= HEAP_XMAX_INVALID;
            SetBufferCommitInfoNeedsSave(buffer);
        }

        /*
         * We may overwrite if previous xmax aborted, or if it committed
         * but only locked the tuple without updating it.
         */
        if (oldtup.t_data->t_infomask & (HEAP_XMAX_INVALID |
                                         HEAP_IS_LOCKED))
            result = HeapTupleMayBeUpdated;
        else
            result = HeapTupleUpdated;
    }

    if (crosscheck != InvalidSnapshot && result == HeapTupleMayBeUpdated)
    {
        /* Perform additional check for serializable RI updates */
        if (!HeapTupleSatisfiesSnapshot(oldtup.t_data, crosscheck, buffer))
            result = HeapTupleUpdated;
    }

    if (result != HeapTupleMayBeUpdated)
    {
        Assert(result == HeapTupleSelfUpdated ||
               result == HeapTupleUpdated ||
               result == HeapTupleBeingUpdated);
        *ctid = oldtup.t_data->t_ctid;
        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
        ReleaseBuffer(buffer);
        if (have_tuple_lock)
            UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock);
        return result;
    }
    /* Fill in OID and transaction status data for newtup */
    if (relation->rd_rel->relhasoids)
    {
#ifdef NOT_USED
        /* this is redundant with an Assert in HeapTupleSetOid */
        Assert(newtup->t_data->t_infomask & HEAP_HASOID);
#endif
        HeapTupleSetOid(newtup, HeapTupleGetOid(&oldtup));
    }
    else
    {
        /* check there is no space for an OID */
        Assert(!(newtup->t_data->t_infomask & HEAP_HASOID));
    }

    newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
    newtup->t_data->t_infomask |= (HEAP_XMAX_INVALID | HEAP_UPDATED);
    HeapTupleHeaderSetXmin(newtup->t_data, xid);
    HeapTupleHeaderSetCmin(newtup->t_data, cid);
    HeapTupleHeaderSetXmax(newtup->t_data, 0);      /* zero out Datum fields */
    HeapTupleHeaderSetCmax(newtup->t_data, 0);      /* for cleanliness */
    /*
     * If the toaster needs to be activated, OR if the new tuple will not
     * fit on the same page as the old, then we need to release the
     * context lock (but not the pin!) on the old tuple's buffer while we
     * are off doing TOAST and/or table-file-extension work.  We must mark
     * the old tuple to show that it's already being updated, else other
     * processes may try to update it themselves.
     *
     * We need to invoke the toaster if there are already any out-of-line
     * toasted values present, or if the new tuple is over-threshold.
     */
    need_toast = (HeapTupleHasExternal(&oldtup) ||
                  HeapTupleHasExternal(newtup) ||
                  (MAXALIGN(newtup->t_len) > TOAST_TUPLE_THRESHOLD));

    newtupsize = MAXALIGN(newtup->t_len);
    pagefree = PageGetFreeSpace((Page) dp);

    if (need_toast || newtupsize > pagefree)
    {
        oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
                                       HEAP_XMAX_INVALID |
                                       HEAP_XMAX_IS_MULTI |
                                       HEAP_IS_LOCKED |
                                       HEAP_MOVED);
        HeapTupleHeaderSetXmax(oldtup.t_data, xid);
        HeapTupleHeaderSetCmax(oldtup.t_data, cid);
        already_marked = true;
        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

        /* Let the toaster do its thing */
        if (need_toast)
            heap_tuple_toast_attrs(relation, newtup, &oldtup);

        newtupsize = MAXALIGN(newtup->t_len);
        /*
         * Now, do we need a new page for the tuple, or not?  This is a
         * bit tricky since someone else could have added tuples to the
         * page while we weren't looking.  We have to recheck the
         * available space after reacquiring the buffer lock.  But don't
         * bother to do that if the former amount of free space is still
         * not enough; it's unlikely there's more free now than before.
         *
         * What's more, if we need to get a new page, we will need to acquire
         * buffer locks on both old and new pages.  To avoid deadlock
         * against some other backend trying to get the same two locks in
         * the other order, we must be consistent about the order we get
         * the locks in.  We use the rule "lock the lower-numbered page of
         * the relation first".  To implement this, we must do
         * RelationGetBufferForTuple while not holding the lock on the old
         * page, and we must rely on it to get the locks on both pages in
         * the correct order.
         */
        if (newtupsize > pagefree)
        {
            /* Assume there's no chance to put newtup on same page. */
            newbuf = RelationGetBufferForTuple(relation, newtup->t_len,
                                               buffer);
        }
        else
        {
            /* Re-acquire the lock on the old tuple's page. */
            LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
            /* Re-check using the up-to-date free space */
            pagefree = PageGetFreeSpace((Page) dp);
            if (newtupsize > pagefree)
            {
                /*
                 * Rats, it doesn't fit anymore.  We must now unlock and
                 * relock to avoid deadlock.  Fortunately, this path
                 * should seldom be taken.
                 */
                LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
                newbuf = RelationGetBufferForTuple(relation, newtup->t_len,
                                                   buffer);
            }
            else
            {
                /* OK, it fits here, so we're done. */
                newbuf = buffer;
            }
        }
    }
    else
    {
        /* No TOAST work needed, and it'll fit on same page */
        already_marked = false;
        newbuf = buffer;
    }
    pgstat_count_heap_update(&relation->pgstat_info);

    /*
     * At this point newbuf and buffer are both pinned and locked, and
     * newbuf has enough space for the new tuple.  If they are the same
     * buffer, only one pin is held.
     */

    /* NO EREPORT(ERROR) from here till changes are logged */
    START_CRIT_SECTION();

    RelationPutHeapTuple(relation, newbuf, newtup);     /* insert new tuple */

    if (!already_marked)
    {
        oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
                                       HEAP_XMAX_INVALID |
                                       HEAP_XMAX_IS_MULTI |
                                       HEAP_IS_LOCKED |
                                       HEAP_MOVED);
        HeapTupleHeaderSetXmax(oldtup.t_data, xid);
        HeapTupleHeaderSetCmax(oldtup.t_data, cid);
    }

    /* record address of new tuple in t_ctid of old one */
    oldtup.t_data->t_ctid = newtup->t_self;

    /* XLOG stuff */
    if (!relation->rd_istemp)
    {
        XLogRecPtr  recptr = log_heap_update(relation, buffer, oldtup.t_self,
                                             newbuf, newtup, false);

        if (newbuf != buffer)
        {
            PageSetLSN(BufferGetPage(newbuf), recptr);
            PageSetTLI(BufferGetPage(newbuf), ThisTimeLineID);
        }
        PageSetLSN(BufferGetPage(buffer), recptr);
        PageSetTLI(BufferGetPage(buffer), ThisTimeLineID);
    }
    else
    {
        /* No XLOG record, but still need to flag that XID exists on disk */
        MyXactMadeTempRelUpdate = true;
    }

    END_CRIT_SECTION();

    if (newbuf != buffer)
        LockBuffer(newbuf, BUFFER_LOCK_UNLOCK);
    LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

    /*
     * Mark old tuple for invalidation from system caches at next command
     * boundary.  We have to do this before WriteBuffer because we need to
     * look at the contents of the tuple, so we need to hold our refcount.
     */
    CacheInvalidateHeapTuple(relation, &oldtup);

    if (newbuf != buffer)
        WriteBuffer(newbuf);
    WriteBuffer(buffer);

    /*
     * If new tuple is cachable, mark it for invalidation from the caches
     * in case we abort.  Note it is OK to do this after WriteBuffer
     * releases the buffer, because the "newtup" data structure is all in
     * local memory, not in the shared buffer.
     */
    CacheInvalidateHeapTuple(relation, newtup);

    /*
     * Release the lmgr tuple lock, if we had it.
     */
    if (have_tuple_lock)
        UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock);

    return HeapTupleMayBeUpdated;
}
/*
 *  simple_heap_update - replace a tuple
 *
 * This routine may be used to update a tuple when concurrent updates of
 * the target tuple are not expected (for example, because we have a lock
 * on the relation associated with the tuple).  Any failure is reported
 * via ereport().
 */
void
simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
{
    ItemPointerData ctid;
    HTSU_Result result;

    result = heap_update(relation, otid, tup,
                         &ctid,
                         GetCurrentCommandId(), InvalidSnapshot,
                         true /* wait for commit */ );
    switch (result)
    {
        case HeapTupleSelfUpdated:
            /* Tuple was already updated in current command? */
            elog(ERROR, "tuple already updated by self");
            break;

        case HeapTupleMayBeUpdated:
            /* done successfully */
            break;

        case HeapTupleUpdated:
            elog(ERROR, "tuple concurrently updated");
            break;

        default:
            elog(ERROR, "unrecognized heap_update status: %u", result);
            break;
    }
}
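/*
 * Illustrative sketch only (not part of the original file, compiled out via
 * NOT_USED): replacing one attribute of an existing tuple by deforming it,
 * substituting a value, and re-forming a new tuple for simple_heap_update.
 * "attnum" and "newval" are hypothetical caller-supplied values.
 */
#ifdef NOT_USED
static void
example_update_attribute(Relation rel, HeapTuple oldtup,
                         int attnum, Datum newval)
{
    TupleDesc   tupdesc = RelationGetDescr(rel);
    int         natts = tupdesc->natts;
    Datum      *values = (Datum *) palloc(natts * sizeof(Datum));
    char       *nulls = (char *) palloc(natts);
    HeapTuple   newtup;

    /* break the old tuple into per-column Datums, replace one, rebuild */
    heap_deformtuple(oldtup, tupdesc, values, nulls);
    values[attnum - 1] = newval;
    nulls[attnum - 1] = ' ';            /* ' ' = not null, 'n' = null */

    newtup = heap_formtuple(tupdesc, values, nulls);
    simple_heap_update(rel, &oldtup->t_self, newtup);
    heap_freetuple(newtup);
}
#endif   /* NOT_USED */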
/*
 *  heap_lock_tuple - lock a tuple in shared or exclusive mode
 *
 * NOTES: because the shared-memory lock table is of finite size, but users
 * could reasonably want to lock large numbers of tuples, we do not rely on
 * the standard lock manager to store tuple-level locks over the long term.
 * Instead, a tuple is marked as locked by setting the current transaction's
 * XID as its XMAX, and setting additional infomask bits to distinguish this
 * usage from the more normal case of having deleted the tuple.  When
 * multiple transactions concurrently share-lock a tuple, the first locker's
 * XID is replaced in XMAX with a MultiTransactionId representing the set of
 * XIDs currently holding share-locks.
 *
 * When it is necessary to wait for a tuple-level lock to be released, the
 * basic delay is provided by XactLockTableWait or MultiXactIdWait on the
 * contents of the tuple's XMAX.  However, that mechanism will release all
 * waiters concurrently, so there would be a race condition as to which
 * waiter gets the tuple, potentially leading to indefinite starvation of
 * some waiters.  The possibility of share-locking makes the problem much
 * worse --- a steady stream of share-lockers can easily block an exclusive
 * locker forever.  To provide more reliable semantics about who gets a
 * tuple-level lock first, we use the standard lock manager.  The protocol
 * for waiting for a tuple-level lock is really
 *
 *      LockTuple()
 *      XactLockTableWait()
 *      mark tuple as locked by me
 *      UnlockTuple()
 *
 * When there are multiple waiters, arbitration of who is to get the lock next
 * is provided by LockTuple().  However, at most one tuple-level lock will
 * be held or awaited per backend at any time, so we don't risk overflow
 * of the lock table.  Note that incoming share-lockers are required to
 * do LockTuple as well, if there is any conflict, to ensure that they don't
 * starve out waiting exclusive-lockers.  However, if there is not any active
 * conflict for a tuple, we don't incur any extra overhead.
 */
HTSU_Result
heap_lock_tuple(Relation relation, HeapTuple tuple, Buffer *buffer,
                CommandId cid, LockTupleMode mode)
{
    HTSU_Result result;
    ItemPointer tid = &(tuple->t_self);
    ItemId      lp;
    PageHeader  dp;
    TransactionId xid;
    uint16      new_infomask;
    LOCKMODE    tuple_lock_type;
    bool        have_tuple_lock = false;

    tuple_lock_type = (mode == LockTupleShared) ? ShareLock : ExclusiveLock;

    *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
    LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);

    dp = (PageHeader) BufferGetPage(*buffer);
    lp = PageGetItemId(dp, ItemPointerGetOffsetNumber(tid));
    tuple->t_datamcxt = NULL;
    tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
    tuple->t_len = ItemIdGetLength(lp);

l3:
    result = HeapTupleSatisfiesUpdate(tuple->t_data, cid, *buffer);

    if (result == HeapTupleInvisible)
    {
        LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
        ReleaseBuffer(*buffer);
        elog(ERROR, "attempted to lock invisible tuple");
    }
    else if (result == HeapTupleBeingUpdated)
    {
        TransactionId xwait;
        uint16      infomask;

        /* must copy state data before unlocking buffer */
        xwait = HeapTupleHeaderGetXmax(tuple->t_data);
        infomask = tuple->t_data->t_infomask;

        LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
        /*
         * Acquire tuple lock to establish our priority for the tuple.
         * LockTuple will release us when we are next-in-line for the
         * tuple.  We must do this even if we are share-locking.
         *
         * If we are forced to "start over" below, we keep the tuple lock;
         * this arranges that we stay at the head of the line while
         * rechecking tuple state.
         */
        if (!have_tuple_lock)
        {
            LockTuple(relation, tid, tuple_lock_type);
            have_tuple_lock = true;
        }

        if (mode == LockTupleShared && (infomask & HEAP_XMAX_SHARED_LOCK))
        {
            /*
             * Acquiring sharelock when there's at least one sharelocker
             * already.  We need not wait for him/them to complete.
             */
            LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);

            /*
             * Make sure it's still a shared lock, else start over.  (It's
             * OK if the ownership of the shared lock has changed, though.)
             */
            if (!(tuple->t_data->t_infomask & HEAP_XMAX_SHARED_LOCK))
                goto l3;
        }
        else if (infomask & HEAP_XMAX_IS_MULTI)
        {
            /* wait for multixact to end */
            MultiXactIdWait((MultiXactId) xwait);
            LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);

            /*
             * If xwait had just locked the tuple then some other xact
             * could update this tuple before we get to this point.
             * Check for xmax change, and start over if so.
             */
            if (!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
                !TransactionIdEquals(HeapTupleHeaderGetXmax(tuple->t_data),
                                     xwait))
                goto l3;

            /*
             * You might think the multixact is necessarily done here, but
             * not so: it could have surviving members, namely our own xact
             * or other subxacts of this backend.  It is legal for us to
             * lock the tuple in either case, however.  We don't bother
             * changing the on-disk hint bits since we are about to
             * overwrite the xmax altogether.
             */
        }
        else
        {
            /* wait for regular transaction to end */
            XactLockTableWait(xwait);
            LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);

            /*
             * xwait is done, but if xwait had just locked the tuple then
             * some other xact could update this tuple before we get to
             * this point.  Check for xmax change, and start over if so.
             */
            if ((tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
                !TransactionIdEquals(HeapTupleHeaderGetXmax(tuple->t_data),
                                     xwait))
                goto l3;

            /* Otherwise we can mark it committed or aborted */
            if (!(tuple->t_data->t_infomask & (HEAP_XMAX_COMMITTED |
                                               HEAP_XMAX_INVALID)))
            {
                if (TransactionIdDidCommit(xwait))
                    tuple->t_data->t_infomask |= HEAP_XMAX_COMMITTED;
                else
                    tuple->t_data->t_infomask |= HEAP_XMAX_INVALID;
                SetBufferCommitInfoNeedsSave(*buffer);
            }
        }

        /*
         * We may lock if previous xmax aborted, or if it committed
         * but only locked the tuple without updating it.  The case where
         * we didn't wait because we are joining an existing shared lock
         * is correctly handled, too.
         */
        if (tuple->t_data->t_infomask & (HEAP_XMAX_INVALID |
                                         HEAP_IS_LOCKED))
            result = HeapTupleMayBeUpdated;
        else
            result = HeapTupleUpdated;
    }

    if (result != HeapTupleMayBeUpdated)
    {
        ItemPointerData newctid = tuple->t_data->t_ctid;

        Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated);
        LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
        if (have_tuple_lock)
            UnlockTuple(relation, tid, tuple_lock_type);
        /* can't overwrite t_self (== *tid) until after above Unlock */
        tuple->t_self = newctid;
        return result;
    }
    /*
     * Compute the new xmax and infomask to store into the tuple.  Note we
     * do not modify the tuple just yet, because that would leave it in the
     * wrong state if multixact.c elogs.
     */
    xid = GetCurrentTransactionId();

    new_infomask = tuple->t_data->t_infomask;

    new_infomask &= ~(HEAP_XMAX_COMMITTED |
                      HEAP_XMAX_INVALID |
                      HEAP_XMAX_IS_MULTI |
                      HEAP_IS_LOCKED |
                      HEAP_MOVED);

    if (mode == LockTupleShared)
    {
        TransactionId xmax = HeapTupleHeaderGetXmax(tuple->t_data);
        uint16      old_infomask = tuple->t_data->t_infomask;

        /*
         * If this is the first acquisition of a shared lock in the current
         * transaction, set my per-backend OldestMemberMXactId setting.
         * We can be certain that the transaction will never become a
         * member of any older MultiXactIds than that.  (We have to do this
         * even if we end up just using our own TransactionId below, since
         * some other backend could incorporate our XID into a MultiXact
         * immediately afterwards.)
         */
        MultiXactIdSetOldestMember();

        new_infomask |= HEAP_XMAX_SHARED_LOCK;
        /*
         * Check to see if we need a MultiXactId because there are multiple
         * lockers.
         *
         * HeapTupleSatisfiesUpdate will have set the HEAP_XMAX_INVALID
         * bit if the xmax was a MultiXactId but it was not running anymore.
         * There is a race condition, which is that the MultiXactId may have
         * finished since then, but that uncommon case is handled within
         * MultiXactIdExpand.
         *
         * There is a similar race condition possible when the old xmax was
         * a regular TransactionId.  We test TransactionIdIsInProgress again
         * just to narrow the window, but it's still possible to end up
         * creating an unnecessary MultiXactId.  Fortunately this is harmless.
         */
        if (!(old_infomask & (HEAP_XMAX_INVALID | HEAP_XMAX_COMMITTED)))
        {
            if (old_infomask & HEAP_XMAX_IS_MULTI)
            {
                /*
                 * If the XMAX is already a MultiXactId, then we need to
                 * expand it to include our own TransactionId.
                 */
                xid = MultiXactIdExpand(xmax, true, xid);
                new_infomask |= HEAP_XMAX_IS_MULTI;
            }
            else if (TransactionIdIsInProgress(xmax))
            {
                if (TransactionIdEquals(xmax, xid))
                {
                    /*
                     * If the old locker is ourselves, we'll just mark the
                     * tuple again with our own TransactionId.  However we
                     * have to consider the possibility that we had
                     * exclusive rather than shared lock before --- if so,
                     * be careful to preserve the exclusivity of the lock.
                     */
                    if (!(old_infomask & HEAP_XMAX_SHARED_LOCK))
                    {
                        new_infomask &= ~HEAP_XMAX_SHARED_LOCK;
                        new_infomask |= HEAP_XMAX_EXCL_LOCK;
                        mode = LockTupleExclusive;
                    }
                }
                else
                {
                    /*
                     * If the Xmax is a valid TransactionId, then we need to
                     * create a new MultiXactId that includes both the old
                     * locker and our own TransactionId.
                     */
                    xid = MultiXactIdExpand(xmax, false, xid);
                    new_infomask |= HEAP_XMAX_IS_MULTI;
                }
            }
            else
            {
                /*
                 * Can get here iff HeapTupleSatisfiesUpdate saw the old
                 * xmax as running, but it finished before
                 * TransactionIdIsInProgress() got to run.  Treat it like
                 * there's no locker in the tuple.
                 */
            }
        }
        else
        {
            /*
             * There was no previous locker, so just insert our own
             * TransactionId.
             */
        }
    }
    else
    {
        /* We want an exclusive lock on the tuple */
        new_infomask |= HEAP_XMAX_EXCL_LOCK;
    }
    START_CRIT_SECTION();

    /*
     * Store transaction information of xact locking the tuple.
     *
     * Note: our CID is meaningless if storing a MultiXactId, but no harm
     * in storing it anyway.
     */
    tuple->t_data->t_infomask = new_infomask;
    HeapTupleHeaderSetXmax(tuple->t_data, xid);
    HeapTupleHeaderSetCmax(tuple->t_data, cid);
    /* Make sure there is no forward chain link in t_ctid */
    tuple->t_data->t_ctid = *tid;
    /*
     * XLOG stuff.  You might think that we don't need an XLOG record because
     * there is no state change worth restoring after a crash.  You would be
     * wrong however: we have just written either a TransactionId or a
     * MultiXactId that may never have been seen on disk before, and we need
     * to make sure that there are XLOG entries covering those ID numbers.
     * Else the same IDs might be re-used after a crash, which would be
     * disastrous if this page made it to disk before the crash.  Essentially
     * we have to enforce the WAL log-before-data rule even in this case.
     */
    if (!relation->rd_istemp)
    {
        xl_heap_lock xlrec;
        XLogRecPtr  recptr;
        XLogRecData rdata[2];

        xlrec.target.node = relation->rd_node;
        xlrec.target.tid = tuple->t_self;
        xlrec.shared_lock = (mode == LockTupleShared);
        rdata[0].buffer = InvalidBuffer;
        rdata[0].data = (char *) &xlrec;
        rdata[0].len = SizeOfHeapLock;
        rdata[0].next = &(rdata[1]);

        rdata[1].buffer = *buffer;
        rdata[1].data = NULL;
        rdata[1].len = 0;
        rdata[1].next = NULL;

        recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK, rdata);

        PageSetLSN(dp, recptr);
        PageSetTLI(dp, ThisTimeLineID);
    }
    else
    {
        /* No XLOG record, but still need to flag that XID exists on disk */
        MyXactMadeTempRelUpdate = true;
    }

    END_CRIT_SECTION();
2252 LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
2254 WriteNoReleaseBuffer(*buffer);
2257 * Now that we have successfully marked the tuple as locked, we can
2258 * release the lmgr tuple lock, if we had it.
2260 if (have_tuple_lock)
2261 UnlockTuple(relation, tid, tuple_lock_type);
2263 return HeapTupleMayBeUpdated;
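/*
 * Illustrative sketch, not part of this file: a row-locking caller (for
 * example the SELECT ... FOR UPDATE code path) would drive the routine
 * above roughly like this.  The variable names are hypothetical; only
 * heap_lock_tuple and the result codes come from this module.
 *
 *		result = heap_lock_tuple(relation, &tuple, &buffer, cid,
 *								 LockTupleExclusive);
 *		if (result == HeapTupleMayBeUpdated)
 *			... tuple is locked; use it, then ReleaseBuffer(buffer) ...
 *		else
 *			... HeapTupleSelfUpdated, HeapTupleUpdated, etc. mean some
 *			... other transaction got there first; the caller decides
 *			... whether to chase the update chain or raise an error.
 */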
/* ----------------
 *		heap_markpos	- mark scan position
 * ----------------
 */
void
heap_markpos(HeapScanDesc scan)
{
	/* Note: no locking manipulations needed */
	if (scan->rs_ctup.t_data != NULL)
		scan->rs_mctid = scan->rs_ctup.t_self;
	else
		ItemPointerSetInvalid(&scan->rs_mctid);
}
/* ----------------
 *		heap_restrpos	- restore position to marked location
 * ----------------
 */
void
heap_restrpos(HeapScanDesc scan)
{
	/* XXX no amrestrpos checking that ammarkpos called */

	/* Note: no locking manipulations needed */

	/* unpin scan buffers */
	if (BufferIsValid(scan->rs_cbuf))
		ReleaseBuffer(scan->rs_cbuf);
	scan->rs_cbuf = InvalidBuffer;

	if (!ItemPointerIsValid(&scan->rs_mctid))
	{
		scan->rs_ctup.t_datamcxt = NULL;
		scan->rs_ctup.t_data = NULL;
	}
	else
	{
		scan->rs_ctup.t_self = scan->rs_mctid;
		scan->rs_ctup.t_datamcxt = NULL;
		scan->rs_ctup.t_data = (HeapTupleHeader) 0x1;	/* for heapgettup */
		heapgettup(scan->rs_rd, 0, &(scan->rs_ctup), &(scan->rs_cbuf),
				   scan->rs_snapshot, 0, (ScanKey) NULL, scan->rs_nblocks);
	}
}
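/*
 * Illustrative sketch, not part of this file: an executor-level consumer
 * pairs these calls to return to a remembered tuple (variable names are
 * hypothetical):
 *
 *		heap_markpos(scan);
 *		while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
 *			... scan ahead ...
 *		heap_restrpos(scan);		(back on the marked tuple)
 *
 * Restoring re-fetches the marked tuple itself: the heapgettup() call
 * above uses direction 0, which means "fetch the tuple at t_self".
 */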
XLogRecPtr
log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *unused, int uncnt)
{
	xl_heap_clean xlrec;
	XLogRecPtr	recptr;
	XLogRecData rdata[2];

	/* Caller should not call me on a temp relation */
	Assert(!reln->rd_istemp);

	xlrec.node = reln->rd_node;
	xlrec.block = BufferGetBlockNumber(buffer);

	rdata[0].buffer = InvalidBuffer;
	rdata[0].data = (char *) &xlrec;
	rdata[0].len = SizeOfHeapClean;
	rdata[0].next = &(rdata[1]);

	/*
	 * The unused-offsets array is not actually in the buffer, but pretend
	 * that it is.  When XLogInsert stores the whole buffer, the offsets
	 * array need not be stored too.
	 */
	rdata[1].buffer = buffer;
	if (uncnt > 0)
	{
		rdata[1].data = (char *) unused;
		rdata[1].len = uncnt * sizeof(OffsetNumber);
	}
	else
	{
		rdata[1].data = NULL;
		rdata[1].len = 0;
	}
	rdata[1].next = NULL;

	recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CLEAN, rdata);

	return (recptr);
}
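/*
 * Illustrative sketch, not part of this file: the expected caller is
 * VACUUM, which must observe the log-before-data rule.  A vacuum routine
 * holding an exclusive lock on the buffer would proceed roughly as
 * follows (onerel, buf, page, unused and uncnt belong to the caller's
 * scope):
 *
 *		START_CRIT_SECTION();
 *		uncnt = PageRepairFragmentation(page, unused);
 *		if (!onerel->rd_istemp)
 *		{
 *			recptr = log_heap_clean(onerel, buf, unused, uncnt);
 *			PageSetLSN(page, recptr);
 *			PageSetTLI(page, ThisTimeLineID);
 *		}
 *		END_CRIT_SECTION();
 */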
static XLogRecPtr
log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from,
				Buffer newbuf, HeapTuple newtup, bool move)
{
	/*
	 * Note: xlhdr is declared to have adequate size and correct alignment
	 * for an xl_heap_header.  However the two tids, if present at all,
	 * will be packed in with no wasted space after the xl_heap_header;
	 * they aren't necessarily aligned as implied by this struct
	 * declaration.
	 */
	struct
	{
		xl_heap_header hdr;
		TransactionId tid1;
		TransactionId tid2;
	}			xlhdr;
	int			hsize = SizeOfHeapHeader;
	xl_heap_update xlrec;
	XLogRecPtr	recptr;
	XLogRecData rdata[4];
	Page		page = BufferGetPage(newbuf);
	uint8		info = (move) ? XLOG_HEAP_MOVE : XLOG_HEAP_UPDATE;

	/* Caller should not call me on a temp relation */
	Assert(!reln->rd_istemp);

	xlrec.target.node = reln->rd_node;
	xlrec.target.tid = from;
	xlrec.newtid = newtup->t_self;
	rdata[0].buffer = InvalidBuffer;
	rdata[0].data = (char *) &xlrec;
	rdata[0].len = SizeOfHeapUpdate;
	rdata[0].next = &(rdata[1]);

	rdata[1].buffer = oldbuf;
	rdata[1].data = NULL;
	rdata[1].len = 0;
	rdata[1].next = &(rdata[2]);

	xlhdr.hdr.t_natts = newtup->t_data->t_natts;
	xlhdr.hdr.t_infomask = newtup->t_data->t_infomask;
	xlhdr.hdr.t_hoff = newtup->t_data->t_hoff;
	if (move)					/* remember xmax & xmin */
	{
		TransactionId xid[2];	/* xmax, xmin */

		if (newtup->t_data->t_infomask & (HEAP_XMAX_INVALID | HEAP_IS_LOCKED))
			xid[0] = InvalidTransactionId;
		else
			xid[0] = HeapTupleHeaderGetXmax(newtup->t_data);
		xid[1] = HeapTupleHeaderGetXmin(newtup->t_data);
		memcpy((char *) &xlhdr + hsize,
			   (char *) xid,
			   2 * sizeof(TransactionId));
		hsize += 2 * sizeof(TransactionId);
	}

	/*
	 * As with insert records, we need not store the rdata[2] segment if
	 * we decide to store the whole buffer instead.
	 */
	rdata[2].buffer = newbuf;
	rdata[2].data = (char *) &xlhdr;
	rdata[2].len = hsize;
	rdata[2].next = &(rdata[3]);

	rdata[3].buffer = newbuf;
	/* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
	rdata[3].data = (char *) newtup->t_data + offsetof(HeapTupleHeaderData, t_bits);
	rdata[3].len = newtup->t_len - offsetof(HeapTupleHeaderData, t_bits);
	rdata[3].next = NULL;

	/* If new tuple is the single and first tuple on page... */
	if (ItemPointerGetOffsetNumber(&(newtup->t_self)) == FirstOffsetNumber &&
		PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
	{
		info |= XLOG_HEAP_INIT_PAGE;
		rdata[2].buffer = rdata[3].buffer = InvalidBuffer;
	}

	recptr = XLogInsert(RM_HEAP_ID, info, rdata);

	return (recptr);
}
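/*
 * For reference (a summary of the code above, not additional logic): an
 * update/move record is assembled from up to four XLogRecData segments:
 *
 *		rdata[0]	xl_heap_update header (always stored)
 *		rdata[1]	old tuple's buffer (no data; candidate backup block)
 *		rdata[2]	xl_heap_header, plus xmax/xmin for a MOVE record
 *		rdata[3]	new tuple body from t_bits onward
 *
 * Segments tied to a buffer are dropped from the record whenever
 * XLogInsert chooses to store a full-page image of that buffer instead,
 * which is why the redo routines must check XLR_BKP_BLOCK_1/2.
 */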
XLogRecPtr
log_heap_move(Relation reln, Buffer oldbuf, ItemPointerData from,
			  Buffer newbuf, HeapTuple newtup)
{
	return (log_heap_update(reln, oldbuf, from, newbuf, newtup, true));
}
static void
heap_xlog_clean(bool redo, XLogRecPtr lsn, XLogRecord *record)
{
	xl_heap_clean *xlrec = (xl_heap_clean *) XLogRecGetData(record);
	Relation	reln;
	Buffer		buffer;
	Page		page;

	if (!redo || (record->xl_info & XLR_BKP_BLOCK_1))
		return;

	reln = XLogOpenRelation(redo, RM_HEAP_ID, xlrec->node);
	if (!RelationIsValid(reln))
		return;

	buffer = XLogReadBuffer(false, reln, xlrec->block);
	if (!BufferIsValid(buffer))
		elog(PANIC, "heap_clean_redo: no block");

	page = (Page) BufferGetPage(buffer);
	if (PageIsNew((PageHeader) page))
		elog(PANIC, "heap_clean_redo: uninitialized page");

	if (XLByteLE(lsn, PageGetLSN(page)))
	{
		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
		ReleaseBuffer(buffer);
		return;
	}

	if (record->xl_len > SizeOfHeapClean)
	{
		OffsetNumber *unused;
		OffsetNumber *unend;
		ItemId		lp;

		unused = (OffsetNumber *) ((char *) xlrec + SizeOfHeapClean);
		unend = (OffsetNumber *) ((char *) xlrec + record->xl_len);

		while (unused < unend)
		{
			/* unused[] entries are zero-based offsets, hence the "+ 1" */
			lp = PageGetItemId(page, *unused + 1);
			lp->lp_flags &= ~LP_USED;
			unused++;
		}
	}
	PageRepairFragmentation(page, NULL);

	PageSetLSN(page, lsn);
	PageSetTLI(page, ThisTimeLineID);
	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
	WriteBuffer(buffer);
}
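/*
 * Note the idempotence pattern used by this and the other redo routines
 * here: if the page LSN already equals or exceeds the record's LSN, the
 * change is known to be on the page and the record is skipped.  A sketch
 * of the invariant:
 *
 *		if (XLByteLE(lsn, PageGetLSN(page)))
 *			... already applied: release buffer and return ...
 *		... apply the change ...
 *		PageSetLSN(page, lsn);
 *
 * This is what makes replay safe to repeat after a crash during recovery.
 */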
static void
heap_xlog_newpage(bool redo, XLogRecPtr lsn, XLogRecord *record)
{
	xl_heap_newpage *xlrec = (xl_heap_newpage *) XLogRecGetData(record);
	Relation	reln;
	Buffer		buffer;
	Page		page;

	/*
	 * Note: the NEWPAGE log record is used for both heaps and indexes, so
	 * do not do anything that assumes we are touching a heap.
	 */

	if (!redo || (record->xl_info & XLR_BKP_BLOCK_1))
		return;

	reln = XLogOpenRelation(redo, RM_HEAP_ID, xlrec->node);
	if (!RelationIsValid(reln))
		return;

	buffer = XLogReadBuffer(true, reln, xlrec->blkno);
	if (!BufferIsValid(buffer))
		elog(PANIC, "heap_newpage_redo: no block");
	page = (Page) BufferGetPage(buffer);

	Assert(record->xl_len == SizeOfHeapNewpage + BLCKSZ);
	memcpy(page, (char *) xlrec + SizeOfHeapNewpage, BLCKSZ);

	PageSetLSN(page, lsn);
	PageSetTLI(page, ThisTimeLineID);
	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
	WriteBuffer(buffer);
}
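/*
 * A NEWPAGE record is simply an xl_heap_newpage header followed by a
 * complete BLCKSZ page image (see the Assert and memcpy above), so replay
 * can reconstruct the page without knowing whether it belongs to a heap
 * or an index.
 */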
static void
heap_xlog_delete(bool redo, XLogRecPtr lsn, XLogRecord *record)
{
	xl_heap_delete *xlrec = (xl_heap_delete *) XLogRecGetData(record);
	Relation	reln;
	Buffer		buffer;
	Page		page;
	OffsetNumber offnum;
	ItemId		lp = NULL;
	HeapTupleHeader htup;

	if (redo && (record->xl_info & XLR_BKP_BLOCK_1))
		return;

	reln = XLogOpenRelation(redo, RM_HEAP_ID, xlrec->target.node);
	if (!RelationIsValid(reln))
		return;

	buffer = XLogReadBuffer(false, reln,
							ItemPointerGetBlockNumber(&(xlrec->target.tid)));
	if (!BufferIsValid(buffer))
		elog(PANIC, "heap_delete_%sdo: no block", (redo) ? "re" : "un");

	page = (Page) BufferGetPage(buffer);
	if (PageIsNew((PageHeader) page))
		elog(PANIC, "heap_delete_%sdo: uninitialized page", (redo) ? "re" : "un");

	if (redo)
	{
		if (XLByteLE(lsn, PageGetLSN(page)))	/* changes are applied */
		{
			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
			ReleaseBuffer(buffer);
			return;
		}
	}
	else if (XLByteLT(PageGetLSN(page), lsn))	/* changes are not applied ?! */
		elog(PANIC, "heap_delete_undo: bad page LSN");

	offnum = ItemPointerGetOffsetNumber(&(xlrec->target.tid));
	if (PageGetMaxOffsetNumber(page) >= offnum)
		lp = PageGetItemId(page, offnum);

	if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsUsed(lp))
		elog(PANIC, "heap_delete_%sdo: invalid lp", (redo) ? "re" : "un");

	htup = (HeapTupleHeader) PageGetItem(page, lp);

	if (redo)
	{
		htup->t_infomask &= ~(HEAP_XMAX_COMMITTED |
							  HEAP_XMAX_INVALID |
							  HEAP_XMAX_IS_MULTI |
							  HEAP_IS_LOCKED |
							  HEAP_MOVED);
		HeapTupleHeaderSetXmax(htup, record->xl_xid);
		HeapTupleHeaderSetCmax(htup, FirstCommandId);
		/* Make sure there is no forward chain link in t_ctid */
		htup->t_ctid = xlrec->target.tid;
		PageSetLSN(page, lsn);
		PageSetTLI(page, ThisTimeLineID);
		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
		WriteBuffer(buffer);
		return;
	}

	elog(PANIC, "heap_delete_undo: unimplemented");
}
static void
heap_xlog_insert(bool redo, XLogRecPtr lsn, XLogRecord *record)
{
	xl_heap_insert *xlrec = (xl_heap_insert *) XLogRecGetData(record);
	Relation	reln;
	Buffer		buffer;
	Page		page;
	OffsetNumber offnum;

	if (redo && (record->xl_info & XLR_BKP_BLOCK_1))
		return;

	reln = XLogOpenRelation(redo, RM_HEAP_ID, xlrec->target.node);
	if (!RelationIsValid(reln))
		return;

	buffer = XLogReadBuffer((redo) ? true : false, reln,
							ItemPointerGetBlockNumber(&(xlrec->target.tid)));
	if (!BufferIsValid(buffer))
		return;

	page = (Page) BufferGetPage(buffer);
	if (PageIsNew((PageHeader) page) &&
		(!redo || !(record->xl_info & XLOG_HEAP_INIT_PAGE)))
		elog(PANIC, "heap_insert_%sdo: uninitialized page", (redo) ? "re" : "un");

	if (redo)
	{
		struct
		{
			HeapTupleHeaderData hdr;
			char		data[MaxTupleSize];
		}			tbuf;
		HeapTupleHeader htup;
		xl_heap_header xlhdr;
		uint32		newlen;

		if (record->xl_info & XLOG_HEAP_INIT_PAGE)
			PageInit(page, BufferGetPageSize(buffer), 0);

		if (XLByteLE(lsn, PageGetLSN(page)))	/* changes are applied */
		{
			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
			ReleaseBuffer(buffer);
			return;
		}

		offnum = ItemPointerGetOffsetNumber(&(xlrec->target.tid));
		if (PageGetMaxOffsetNumber(page) + 1 < offnum)
			elog(PANIC, "heap_insert_redo: invalid max offset number");

		newlen = record->xl_len - SizeOfHeapInsert - SizeOfHeapHeader;
		Assert(newlen <= MaxTupleSize);
		memcpy((char *) &xlhdr,
			   (char *) xlrec + SizeOfHeapInsert,
			   SizeOfHeapHeader);
		htup = &tbuf.hdr;
		MemSet((char *) htup, 0, sizeof(HeapTupleHeaderData));
		/* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
		memcpy((char *) htup + offsetof(HeapTupleHeaderData, t_bits),
			   (char *) xlrec + SizeOfHeapInsert + SizeOfHeapHeader,
			   newlen);
		newlen += offsetof(HeapTupleHeaderData, t_bits);
		htup->t_natts = xlhdr.t_natts;
		htup->t_infomask = xlhdr.t_infomask;
		htup->t_hoff = xlhdr.t_hoff;
		HeapTupleHeaderSetXmin(htup, record->xl_xid);
		HeapTupleHeaderSetCmin(htup, FirstCommandId);
		htup->t_ctid = xlrec->target.tid;

		offnum = PageAddItem(page, (Item) htup, newlen, offnum,
							 LP_USED | OverwritePageMode);
		if (offnum == InvalidOffsetNumber)
			elog(PANIC, "heap_insert_redo: failed to add tuple");
		PageSetLSN(page, lsn);
		PageSetTLI(page, ThisTimeLineID);
		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
		WriteBuffer(buffer);
		return;
	}

	/* undo insert */
	if (XLByteLT(PageGetLSN(page), lsn))		/* changes are not applied ?! */
		elog(PANIC, "heap_insert_undo: bad page LSN");

	elog(PANIC, "heap_insert_undo: unimplemented");
}
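/*
 * For reference, the byte layout of an XLOG_HEAP_INSERT record, as
 * implied by the offsets used above:
 *
 *		xl_heap_insert	(SizeOfHeapInsert bytes: target node and tid)
 *		xl_heap_header	(SizeOfHeapHeader bytes: t_natts, t_infomask, t_hoff)
 *		tuple data		(everything from t_bits onward; newlen bytes)
 *
 * The fixed-size prefix of HeapTupleHeaderData (xmin, cmin, etc.) is not
 * logged; redo reconstructs it from the record's xl_xid and
 * FirstCommandId, as done above.
 */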
/*
 * Handles UPDATE & MOVE
 */
static void
heap_xlog_update(bool redo, XLogRecPtr lsn, XLogRecord *record, bool move)
{
	xl_heap_update *xlrec = (xl_heap_update *) XLogRecGetData(record);
	Relation	reln = XLogOpenRelation(redo, RM_HEAP_ID, xlrec->target.node);
	Buffer		buffer;
	bool		samepage =
	(ItemPointerGetBlockNumber(&(xlrec->newtid)) ==
	 ItemPointerGetBlockNumber(&(xlrec->target.tid)));
	Page		page;
	OffsetNumber offnum;
	ItemId		lp = NULL;
	HeapTupleHeader htup;

	if (!RelationIsValid(reln))
		return;

	if (redo && (record->xl_info & XLR_BKP_BLOCK_1))
		goto newt;

	/* Deal with old tuple version */

	buffer = XLogReadBuffer(false, reln,
							ItemPointerGetBlockNumber(&(xlrec->target.tid)));
	if (!BufferIsValid(buffer))
		elog(PANIC, "heap_update_%sdo: no block", (redo) ? "re" : "un");

	page = (Page) BufferGetPage(buffer);
	if (PageIsNew((PageHeader) page))
		elog(PANIC, "heap_update_%sdo: uninitialized old page", (redo) ? "re" : "un");

	if (redo)
	{
		if (XLByteLE(lsn, PageGetLSN(page)))	/* changes are applied */
		{
			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
			ReleaseBuffer(buffer);
			if (samepage)
				return;
			goto newt;
		}
	}
	else if (XLByteLT(PageGetLSN(page), lsn))	/* changes are not applied ?! */
		elog(PANIC, "heap_update_undo: bad old tuple page LSN");

	offnum = ItemPointerGetOffsetNumber(&(xlrec->target.tid));
	if (PageGetMaxOffsetNumber(page) >= offnum)
		lp = PageGetItemId(page, offnum);

	if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsUsed(lp))
		elog(PANIC, "heap_update_%sdo: invalid lp", (redo) ? "re" : "un");

	htup = (HeapTupleHeader) PageGetItem(page, lp);

	if (redo)
	{
		if (move)
		{
			htup->t_infomask &= ~(HEAP_XMIN_COMMITTED |
								  HEAP_XMIN_INVALID |
								  HEAP_MOVED_IN);
			htup->t_infomask |= HEAP_MOVED_OFF;
			HeapTupleHeaderSetXvac(htup, record->xl_xid);
			/* Make sure there is no forward chain link in t_ctid */
			htup->t_ctid = xlrec->target.tid;
		}
		else
		{
			htup->t_infomask &= ~(HEAP_XMAX_COMMITTED |
								  HEAP_XMAX_INVALID |
								  HEAP_XMAX_IS_MULTI |
								  HEAP_IS_LOCKED |
								  HEAP_MOVED);
			HeapTupleHeaderSetXmax(htup, record->xl_xid);
			HeapTupleHeaderSetCmax(htup, FirstCommandId);
			/* Set forward chain link in t_ctid */
			htup->t_ctid = xlrec->newtid;
		}
		if (samepage)
			goto newsame;
		PageSetLSN(page, lsn);
		PageSetTLI(page, ThisTimeLineID);
		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
		WriteBuffer(buffer);
		goto newt;
	}

	elog(PANIC, "heap_update_undo: unimplemented");

	/* Deal with new tuple */

newt:;

	if (redo &&
		((record->xl_info & XLR_BKP_BLOCK_2) ||
		 ((record->xl_info & XLR_BKP_BLOCK_1) && samepage)))
		return;

	buffer = XLogReadBuffer((redo) ? true : false, reln,
							ItemPointerGetBlockNumber(&(xlrec->newtid)));
	if (!BufferIsValid(buffer))
		return;

	page = (Page) BufferGetPage(buffer);

newsame:;
	if (PageIsNew((PageHeader) page) &&
		(!redo || !(record->xl_info & XLOG_HEAP_INIT_PAGE)))
		elog(PANIC, "heap_update_%sdo: uninitialized page", (redo) ? "re" : "un");

	if (redo)
	{
		struct
		{
			HeapTupleHeaderData hdr;
			char		data[MaxTupleSize];
		}			tbuf;
		xl_heap_header xlhdr;
		int			hsize;
		uint32		newlen;

		if (record->xl_info & XLOG_HEAP_INIT_PAGE)
			PageInit(page, BufferGetPageSize(buffer), 0);

		if (XLByteLE(lsn, PageGetLSN(page)))	/* changes are applied */
		{
			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
			ReleaseBuffer(buffer);
			return;
		}

		offnum = ItemPointerGetOffsetNumber(&(xlrec->newtid));
		if (PageGetMaxOffsetNumber(page) + 1 < offnum)
			elog(PANIC, "heap_update_redo: invalid max offset number");

		hsize = SizeOfHeapUpdate + SizeOfHeapHeader;
		if (move)
			hsize += (2 * sizeof(TransactionId));

		newlen = record->xl_len - hsize;
		Assert(newlen <= MaxTupleSize);
		memcpy((char *) &xlhdr,
			   (char *) xlrec + SizeOfHeapUpdate,
			   SizeOfHeapHeader);
		htup = &tbuf.hdr;
		MemSet((char *) htup, 0, sizeof(HeapTupleHeaderData));
		/* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
		memcpy((char *) htup + offsetof(HeapTupleHeaderData, t_bits),
			   (char *) xlrec + hsize,
			   newlen);
		newlen += offsetof(HeapTupleHeaderData, t_bits);
		htup->t_natts = xlhdr.t_natts;
		htup->t_infomask = xlhdr.t_infomask;
		htup->t_hoff = xlhdr.t_hoff;

		if (move)
		{
			TransactionId xid[2];	/* xmax, xmin */

			memcpy((char *) xid,
				   (char *) xlrec + SizeOfHeapUpdate + SizeOfHeapHeader,
				   2 * sizeof(TransactionId));
			HeapTupleHeaderSetXmin(htup, xid[1]);
			HeapTupleHeaderSetXmax(htup, xid[0]);
			HeapTupleHeaderSetXvac(htup, record->xl_xid);
		}
		else
		{
			HeapTupleHeaderSetXmin(htup, record->xl_xid);
			HeapTupleHeaderSetCmin(htup, FirstCommandId);
		}
		/* Make sure there is no forward chain link in t_ctid */
		htup->t_ctid = xlrec->newtid;

		offnum = PageAddItem(page, (Item) htup, newlen, offnum,
							 LP_USED | OverwritePageMode);
		if (offnum == InvalidOffsetNumber)
			elog(PANIC, "heap_update_redo: failed to add tuple");
		PageSetLSN(page, lsn);
		PageSetTLI(page, ThisTimeLineID);
		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
		WriteBuffer(buffer);
		return;
	}

	/* undo */
	if (XLByteLT(PageGetLSN(page), lsn))		/* changes not applied?! */
		elog(PANIC, "heap_update_undo: bad new tuple page LSN");

	elog(PANIC, "heap_update_undo: unimplemented");
}
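/*
 * Reading aid for heap_xlog_update (a summary, not new logic): the
 * old-tuple page is processed first; "goto newt" then moves on to the
 * new-tuple page, while "goto newsame" skips the buffer re-read when
 * both tuple versions live on the same page.  The backup-block flags
 * can short-circuit either half independently.
 */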
static void
heap_xlog_lock(bool redo, XLogRecPtr lsn, XLogRecord *record)
{
	xl_heap_lock *xlrec = (xl_heap_lock *) XLogRecGetData(record);
	Relation	reln;
	Buffer		buffer;
	Page		page;
	OffsetNumber offnum;
	ItemId		lp = NULL;
	HeapTupleHeader htup;

	if (redo && (record->xl_info & XLR_BKP_BLOCK_1))
		return;

	reln = XLogOpenRelation(redo, RM_HEAP_ID, xlrec->target.node);
	if (!RelationIsValid(reln))
		return;

	buffer = XLogReadBuffer(false, reln,
							ItemPointerGetBlockNumber(&(xlrec->target.tid)));
	if (!BufferIsValid(buffer))
		elog(PANIC, "heap_lock_%sdo: no block", (redo) ? "re" : "un");

	page = (Page) BufferGetPage(buffer);
	if (PageIsNew((PageHeader) page))
		elog(PANIC, "heap_lock_%sdo: uninitialized page", (redo) ? "re" : "un");

	if (redo)
	{
		if (XLByteLE(lsn, PageGetLSN(page)))	/* changes are applied */
		{
			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
			ReleaseBuffer(buffer);
			return;
		}
	}
	else if (XLByteLT(PageGetLSN(page), lsn))	/* changes are not applied ?! */
		elog(PANIC, "heap_lock_undo: bad page LSN");

	offnum = ItemPointerGetOffsetNumber(&(xlrec->target.tid));
	if (PageGetMaxOffsetNumber(page) >= offnum)
		lp = PageGetItemId(page, offnum);

	if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsUsed(lp))
		elog(PANIC, "heap_lock_%sdo: invalid lp", (redo) ? "re" : "un");

	htup = (HeapTupleHeader) PageGetItem(page, lp);

	if (redo)
	{
		/*
		 * Presently, we don't bother to restore the locked state, but
		 * just set the XMAX_INVALID bit.
		 */
		htup->t_infomask &= ~(HEAP_XMAX_COMMITTED |
							  HEAP_XMAX_INVALID |
							  HEAP_XMAX_IS_MULTI |
							  HEAP_IS_LOCKED |
							  HEAP_MOVED);
		htup->t_infomask |= HEAP_XMAX_INVALID;
		HeapTupleHeaderSetXmax(htup, record->xl_xid);
		HeapTupleHeaderSetCmax(htup, FirstCommandId);
		/* Make sure there is no forward chain link in t_ctid */
		htup->t_ctid = xlrec->target.tid;
		PageSetLSN(page, lsn);
		PageSetTLI(page, ThisTimeLineID);
		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
		WriteBuffer(buffer);
		return;
	}

	elog(PANIC, "heap_lock_undo: unimplemented");
}
void
heap_redo(XLogRecPtr lsn, XLogRecord *record)
{
	uint8		info = record->xl_info & ~XLR_INFO_MASK;

	info &= XLOG_HEAP_OPMASK;
	if (info == XLOG_HEAP_INSERT)
		heap_xlog_insert(true, lsn, record);
	else if (info == XLOG_HEAP_DELETE)
		heap_xlog_delete(true, lsn, record);
	else if (info == XLOG_HEAP_UPDATE)
		heap_xlog_update(true, lsn, record, false);
	else if (info == XLOG_HEAP_MOVE)
		heap_xlog_update(true, lsn, record, true);
	else if (info == XLOG_HEAP_CLEAN)
		heap_xlog_clean(true, lsn, record);
	else if (info == XLOG_HEAP_NEWPAGE)
		heap_xlog_newpage(true, lsn, record);
	else if (info == XLOG_HEAP_LOCK)
		heap_xlog_lock(true, lsn, record);
	else
		elog(PANIC, "heap_redo: unknown op code %u", info);
}
void
heap_undo(XLogRecPtr lsn, XLogRecord *record)
{
	uint8		info = record->xl_info & ~XLR_INFO_MASK;

	info &= XLOG_HEAP_OPMASK;
	if (info == XLOG_HEAP_INSERT)
		heap_xlog_insert(false, lsn, record);
	else if (info == XLOG_HEAP_DELETE)
		heap_xlog_delete(false, lsn, record);
	else if (info == XLOG_HEAP_UPDATE)
		heap_xlog_update(false, lsn, record, false);
	else if (info == XLOG_HEAP_MOVE)
		heap_xlog_update(false, lsn, record, true);
	else if (info == XLOG_HEAP_CLEAN)
		heap_xlog_clean(false, lsn, record);
	else if (info == XLOG_HEAP_NEWPAGE)
		heap_xlog_newpage(false, lsn, record);
	else if (info == XLOG_HEAP_LOCK)
		heap_xlog_lock(false, lsn, record);
	else
		elog(PANIC, "heap_undo: unknown op code %u", info);
}
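/*
 * These entry points are reached through the resource-manager dispatch
 * table rather than called directly.  Sketch of the wiring (paraphrased
 * from the rmgr table; possibly not verbatim for this source tree):
 *
 *		{"Heap", heap_redo, heap_undo, heap_desc, NULL, NULL}
 *
 * heap_undo is never exercised in practice: as the PANICs above show,
 * UNDO processing is unimplemented.
 */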
static void
out_target(char *buf, xl_heaptid *target)
{
	sprintf(buf + strlen(buf), "rel %u/%u/%u; tid %u/%u",
			target->node.spcNode, target->node.dbNode, target->node.relNode,
			ItemPointerGetBlockNumber(&(target->tid)),
			ItemPointerGetOffsetNumber(&(target->tid)));
}
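/*
 * Example with hypothetical values: for the tuple at block 0, offset 1
 * of a relation with tablespace 1663, database 16384 and relfilenode
 * 16385, the routine above appends "rel 1663/16384/16385; tid 0/1".
 */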
void
heap_desc(char *buf, uint8 xl_info, char *rec)
{
	uint8		info = xl_info & ~XLR_INFO_MASK;

	info &= XLOG_HEAP_OPMASK;
	if (info == XLOG_HEAP_INSERT)
	{
		xl_heap_insert *xlrec = (xl_heap_insert *) rec;

		strcat(buf, "insert: ");
		out_target(buf, &(xlrec->target));
	}
	else if (info == XLOG_HEAP_DELETE)
	{
		xl_heap_delete *xlrec = (xl_heap_delete *) rec;

		strcat(buf, "delete: ");
		out_target(buf, &(xlrec->target));
	}
	else if (info == XLOG_HEAP_UPDATE || info == XLOG_HEAP_MOVE)
	{
		xl_heap_update *xlrec = (xl_heap_update *) rec;

		if (info == XLOG_HEAP_UPDATE)
			strcat(buf, "update: ");
		else
			strcat(buf, "move: ");
		out_target(buf, &(xlrec->target));
		sprintf(buf + strlen(buf), "; new %u/%u",
				ItemPointerGetBlockNumber(&(xlrec->newtid)),
				ItemPointerGetOffsetNumber(&(xlrec->newtid)));
	}
	else if (info == XLOG_HEAP_CLEAN)
	{
		xl_heap_clean *xlrec = (xl_heap_clean *) rec;

		sprintf(buf + strlen(buf), "clean: rel %u/%u/%u; blk %u",
				xlrec->node.spcNode, xlrec->node.dbNode,
				xlrec->node.relNode, xlrec->block);
	}
	else if (info == XLOG_HEAP_NEWPAGE)
	{
		xl_heap_newpage *xlrec = (xl_heap_newpage *) rec;

		sprintf(buf + strlen(buf), "newpage: rel %u/%u/%u; blk %u",
				xlrec->node.spcNode, xlrec->node.dbNode,
				xlrec->node.relNode, xlrec->blkno);
	}
	else if (info == XLOG_HEAP_LOCK)
	{
		xl_heap_lock *xlrec = (xl_heap_lock *) rec;

		if (xlrec->shared_lock)
			strcat(buf, "shared_lock: ");
		else
			strcat(buf, "exclusive_lock: ");
		out_target(buf, &(xlrec->target));
	}
	else
		strcat(buf, "UNKNOWN");
}