/*-------------------------------------------------------------------------
 *
 * pruneheap.c
 *	  heap page pruning and HOT-chain management code
 *
 * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/heap/pruneheap.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/heapam.h"
#include "access/heapam_xlog.h"
#include "access/transam.h"
#include "access/htup_details.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"
#include "utils/tqual.h"


/* Working data for heap_page_prune and subroutines */
typedef struct
{
	TransactionId new_prune_xid;	/* new prune hint value for page */
	TransactionId latestRemovedXid; /* latest xid to be removed by this
									 * prune */
	int			nredirected;	/* numbers of entries in arrays below */
	int			ndead;
	int			nunused;
	/* arrays that accumulate indexes of items to be changed */
	OffsetNumber redirected[MaxHeapTuplesPerPage * 2];
	OffsetNumber nowdead[MaxHeapTuplesPerPage];
	OffsetNumber nowunused[MaxHeapTuplesPerPage];
	/* marked[i] is TRUE if item i is entered in one of the above arrays */
	bool		marked[MaxHeapTuplesPerPage + 1];
} PruneState;
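
/*
 * Editorial note (illustrative, not part of the original source): the
 * redirected[] array stores pairs of offsets, so one redirection occupies
 * two slots.  For example, redirecting item 1 to item 4 and item 6 to
 * item 7 would leave
 *
 *		redirected[] = {1, 4, 6, 7},  nredirected = 2
 *
 * which is why redirected[] is sized at 2 * MaxHeapTuplesPerPage while
 * nowdead[] and nowunused[] hold one offset per entry.
 */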

/* Local functions */
static int heap_prune_chain(Relation relation, Buffer buffer,
				 OffsetNumber rootoffnum,
				 TransactionId OldestXmin,
				 PruneState *prstate);
static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid);
static void heap_prune_record_redirect(PruneState *prstate,
						   OffsetNumber offnum, OffsetNumber rdoffnum);
static void heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum);
static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum);


/*
 * Optionally prune and repair fragmentation in the specified page.
 *
 * This is an opportunistic function.  It will perform housekeeping
 * only if the page heuristically looks like a candidate for pruning and we
 * can acquire buffer cleanup lock without blocking.
 *
 * Note: this is called quite often.  It's important that it fall out quickly
 * if there's not any use in pruning.
 *
 * Caller must have pin on the buffer, and must *not* have a lock on it.
 *
 * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD
 * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum).
 */
void
heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin)
{
	Page		page = BufferGetPage(buffer);
	Size		minfree;

	/*
	 * Let's see if we really need pruning.
	 *
	 * Forget it if page is not hinted to contain something prunable that's
	 * older than OldestXmin.
	 */
	if (!PageIsPrunable(page, OldestXmin))
		return;

	/*
	 * We can't write WAL in recovery mode, so there's no point trying to
	 * clean the page. The master will likely issue a cleaning WAL record soon
	 * anyway, so this is no particular loss.
	 */
	if (RecoveryInProgress())
		return;

	/*
	 * We prune when a previous UPDATE failed to find enough space on the page
	 * for a new tuple version, or when free space falls below the relation's
	 * fill-factor target (but not less than 10%).
	 *
	 * Checking free space here is questionable since we aren't holding any
	 * lock on the buffer; in the worst case we could get a bogus answer. It's
	 * unlikely to be *seriously* wrong, though, since reading either pd_lower
	 * or pd_upper is probably atomic.  Avoiding taking a lock seems more
	 * important than sometimes getting a wrong answer in what is after all
	 * just a heuristic estimate.
	 */
	minfree = RelationGetTargetPageFreeSpace(relation,
											 HEAP_DEFAULT_FILLFACTOR);
	minfree = Max(minfree, BLCKSZ / 10);

	if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
	{
		/* OK, try to get exclusive buffer lock */
		if (!ConditionalLockBufferForCleanup(buffer))
			return;

		/*
		 * Now that we have buffer lock, get accurate information about the
		 * page's free space, and recheck the heuristic about whether to
		 * prune. (We needn't recheck PageIsPrunable, since no one else could
		 * have pruned while we hold pin.)
		 */
		if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
		{
			TransactionId ignore = InvalidTransactionId;	/* return value not
															 * needed */

			/* OK to prune */
			(void) heap_page_prune(relation, buffer, OldestXmin, true, &ignore);
		}

		/* And release buffer lock */
		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
	}
}
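
/*
 * Editorial sketch (assumption, not part of the original source): a typical
 * caller is a scan path that holds only a pin on the buffer, e.g.
 *
 *		buffer = ReadBuffer(relation, blkno);
 *		heap_page_prune_opt(relation, buffer, RecentGlobalXmin);
 *
 * using RecentGlobalXmin as the OldestXmin cutoff.  If the page isn't hinted
 * prunable, or the cleanup lock can't be taken immediately, the call falls
 * out almost for free, which is what makes it safe to invoke so often.
 */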


/*
 * Prune and repair fragmentation in the specified page.
 *
 * Caller must have pin and buffer cleanup lock on the page.
 *
 * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD
 * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum).
 *
 * If report_stats is true then we send the number of reclaimed heap-only
 * tuples to pgstats.  (This must be FALSE during vacuum, since vacuum will
 * send its own new total to pgstats, and we don't want this delta applied
 * on top of that.)
 *
 * Returns the number of tuples deleted from the page and sets
 * latestRemovedXid.
 */
int
heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
				bool report_stats, TransactionId *latestRemovedXid)
{
	int			ndeleted = 0;
	Page		page = BufferGetPage(buffer);
	OffsetNumber offnum,
				maxoff;
	PruneState	prstate;

	/*
	 * Our strategy is to scan the page and make lists of items to change,
	 * then apply the changes within a critical section.  This keeps as much
	 * logic as possible out of the critical section, and also ensures that
	 * WAL replay will work the same as the normal case.
	 *
	 * First, initialize the new pd_prune_xid value to zero (indicating no
	 * prunable tuples).  If we find any tuples which may soon become
	 * prunable, we will save the lowest relevant XID in new_prune_xid. Also
	 * initialize the rest of our working state.
	 */
	prstate.new_prune_xid = InvalidTransactionId;
	prstate.latestRemovedXid = InvalidTransactionId;
	prstate.nredirected = prstate.ndead = prstate.nunused = 0;
	memset(prstate.marked, 0, sizeof(prstate.marked));

	/* Scan the page */
	maxoff = PageGetMaxOffsetNumber(page);
	for (offnum = FirstOffsetNumber;
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemid;

		/* Ignore items already processed as part of an earlier chain */
		if (prstate.marked[offnum])
			continue;

		/* Nothing to do if slot is empty or already dead */
		itemid = PageGetItemId(page, offnum);
		if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid))
			continue;

		/* Process this item or chain of items */
		ndeleted += heap_prune_chain(relation, buffer, offnum,
									 OldestXmin,
									 &prstate);
	}

	/* Any error while applying the changes is critical */
	START_CRIT_SECTION();

	/* Have we found any prunable items? */
	if (prstate.nredirected > 0 || prstate.ndead > 0 || prstate.nunused > 0)
	{
		/*
		 * Apply the planned item changes, then repair page fragmentation, and
		 * update the page's hint bit about whether it has free line pointers.
		 */
		heap_page_prune_execute(buffer,
								prstate.redirected, prstate.nredirected,
								prstate.nowdead, prstate.ndead,
								prstate.nowunused, prstate.nunused);

		/*
		 * Update the page's pd_prune_xid field to either zero, or the lowest
		 * XID of any soon-prunable tuple.
		 */
		((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;

		/*
		 * Also clear the "page is full" flag, since there's no point in
		 * repeating the prune/defrag process until something else happens to
		 * the page.
		 */
		PageClearFull(page);

		MarkBufferDirty(buffer);

		/*
		 * Emit a WAL HEAP_CLEAN record showing what we did
		 */
		if (RelationNeedsWAL(relation))
		{
			XLogRecPtr	recptr;

			recptr = log_heap_clean(relation, buffer,
									prstate.redirected, prstate.nredirected,
									prstate.nowdead, prstate.ndead,
									prstate.nowunused, prstate.nunused,
									prstate.latestRemovedXid);

			PageSetLSN(BufferGetPage(buffer), recptr);
			PageSetTLI(BufferGetPage(buffer), ThisTimeLineID);
		}
	}
	else
	{
		/*
		 * If we didn't prune anything, but have found a new value for the
		 * pd_prune_xid field, update it and mark the buffer dirty. This is
		 * treated as a non-WAL-logged hint.
		 *
		 * Also clear the "page is full" flag if it is set, since there's no
		 * point in repeating the prune/defrag process until something else
		 * happens to the page.
		 */
		if (((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid ||
			PageIsFull(page))
		{
			((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
			PageClearFull(page);
			SetBufferCommitInfoNeedsSave(buffer);
		}
	}

	END_CRIT_SECTION();

	/*
	 * If requested, report the number of tuples reclaimed to pgstats. This is
	 * ndeleted minus ndead, because we don't want to count a now-DEAD root
	 * item as a deletion for this purpose.
	 */
	if (report_stats && ndeleted > prstate.ndead)
		pgstat_update_heap_dead_tuples(relation, ndeleted - prstate.ndead);

	*latestRemovedXid = prstate.latestRemovedXid;

	/*
	 * XXX Should we update the FSM information of this page ?
	 *
	 * There are two schools of thought here. We may not want to update FSM
	 * information so that the page is not used for unrelated UPDATEs/INSERTs
	 * and any free space in this page will remain available for further
	 * UPDATEs in *this* page, thus improving chances for doing HOT updates.
	 *
	 * But for a large table and where a page does not receive further UPDATEs
	 * for a long time, we might waste this space by not updating the FSM
	 * information. The relation may get extended and fragmented further.
	 *
	 * One possibility is to leave "fillfactor" worth of space in this page
	 * and update FSM with the remaining space.
	 */

	return ndeleted;
}
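
/*
 * Editorial worked example (illustrative, not part of the original source):
 * suppose a page holds one fully-dead HOT chain lp1 -> lp2 -> lp3, with lp1
 * a normal (index-referenced) root.  heap_prune_chain() marks lp2 and lp3
 * unused (ndeleted += 2), counts the root itself (ndeleted = 3), and records
 * lp1 as now-dead (prstate.ndead = 1).  With report_stats true we therefore
 * report ndeleted - ndead = 2 reclaimed heap-only tuples; the LP_DEAD root
 * still needs VACUUM to reclaim its index entries, so it isn't counted.
 */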


/*
 * Prune specified item pointer or a HOT chain originating at that item.
 *
 * If the item is an index-referenced tuple (i.e. not a heap-only tuple),
 * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT
 * chain.  We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple.
 * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really
 * DEAD, the OldestXmin test is just too coarse to detect it.
 *
 * The root line pointer is redirected to the tuple immediately after the
 * latest DEAD tuple.  If all tuples in the chain are DEAD, the root line
 * pointer is marked LP_DEAD.  (This includes the case of a DEAD simple
 * tuple, which we treat as a chain of length 1.)
 *
 * OldestXmin is the cutoff XID used to identify dead tuples.
 *
 * We don't actually change the page here, except perhaps for hint-bit updates
 * caused by HeapTupleSatisfiesVacuum.  We just add entries to the arrays in
 * prstate showing the changes to be made.  Items to be redirected are added
 * to the redirected[] array (two entries per redirection); items to be set to
 * LP_DEAD state are added to nowdead[]; and items to be set to LP_UNUSED
 * state are added to nowunused[].
 *
 * Returns the number of tuples (to be) deleted from the page.
 */
static int
heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
				 TransactionId OldestXmin,
				 PruneState *prstate)
{
	int			ndeleted = 0;
	Page		dp = (Page) BufferGetPage(buffer);
	TransactionId priorXmax = InvalidTransactionId;
	ItemId		rootlp;
	HeapTupleHeader htup;
	OffsetNumber latestdead = InvalidOffsetNumber,
				maxoff = PageGetMaxOffsetNumber(dp),
				offnum;
	OffsetNumber chainitems[MaxHeapTuplesPerPage];
	int			nchain = 0,
				i;

	rootlp = PageGetItemId(dp, rootoffnum);

	/*
	 * If it's a heap-only tuple, then it is not the start of a HOT chain.
	 */
	if (ItemIdIsNormal(rootlp))
	{
		htup = (HeapTupleHeader) PageGetItem(dp, rootlp);
		if (HeapTupleHeaderIsHeapOnly(htup))
		{
			/*
			 * If the tuple is DEAD and doesn't chain to anything else, mark
			 * it unused immediately.  (If it does chain, we can only remove
			 * it as part of pruning its chain.)
			 *
			 * We need this primarily to handle aborted HOT updates, that is,
			 * XMIN_INVALID heap-only tuples.  Those might not be linked to by
			 * any chain, since the parent tuple might be re-updated before
			 * any pruning occurs.  So we have to be able to reap them
			 * separately from chain-pruning.  (Note that
			 * HeapTupleHeaderIsHotUpdated will never return true for an
			 * XMIN_INVALID tuple, so this code will work even when there were
			 * sequential updates within the aborted transaction.)
			 *
			 * Note that we might first arrive at a dead heap-only tuple
			 * either here or while following a chain below.  Whichever path
			 * gets there first will mark the tuple unused.
			 */
			if (HeapTupleSatisfiesVacuum(htup, OldestXmin, buffer)
				== HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup))
			{
				heap_prune_record_unused(prstate, rootoffnum);
				HeapTupleHeaderAdvanceLatestRemovedXid(htup,
												 &prstate->latestRemovedXid);
				ndeleted++;
			}

			/* Nothing more to do */
			return ndeleted;
		}
	}

	/* Start from the root tuple */
	offnum = rootoffnum;

	/* while not end of the chain */
	for (;;)
	{
		ItemId		lp;
		bool		tupdead,
					recent_dead;

		/* Some sanity checks */
		if (offnum < FirstOffsetNumber || offnum > maxoff)
			break;

		/* If item is already processed, stop --- it must not be the same chain */
		if (prstate->marked[offnum])
			break;

		lp = PageGetItemId(dp, offnum);

		/* Unused item obviously isn't part of the chain */
		if (!ItemIdIsUsed(lp))
			break;

		/*
		 * If we are looking at the redirected root line pointer, jump to the
		 * first normal tuple in the chain.  If we find a redirect somewhere
		 * else, stop --- it must not be the same chain.
		 */
		if (ItemIdIsRedirected(lp))
		{
			if (nchain > 0)
				break;			/* not at start of chain */
			chainitems[nchain++] = offnum;
			offnum = ItemIdGetRedirect(rootlp);
			continue;
		}

		/*
		 * Likewise, a dead item pointer can't be part of the chain. (We
		 * already eliminated the case of dead root tuple outside this
		 * function.)
		 */
		if (ItemIdIsDead(lp))
			break;

		Assert(ItemIdIsNormal(lp));
		htup = (HeapTupleHeader) PageGetItem(dp, lp);

		/*
		 * Check the tuple XMIN against prior XMAX, if any
		 */
		if (TransactionIdIsValid(priorXmax) &&
			!TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax))
			break;

		/*
		 * OK, this tuple is indeed a member of the chain.
		 */
		chainitems[nchain++] = offnum;

		/*
		 * Check tuple's visibility status.
		 */
		tupdead = recent_dead = false;

		switch (HeapTupleSatisfiesVacuum(htup, OldestXmin, buffer))
		{
			case HEAPTUPLE_DEAD:
				tupdead = true;
				break;

			case HEAPTUPLE_RECENTLY_DEAD:
				recent_dead = true;

				/*
				 * This tuple may soon become DEAD.  Update the hint field so
				 * that the page is reconsidered for pruning in future.
				 */
				heap_prune_record_prunable(prstate,
										   HeapTupleHeaderGetXmax(htup));
				break;

			case HEAPTUPLE_DELETE_IN_PROGRESS:

				/*
				 * This tuple may soon become DEAD.  Update the hint field so
				 * that the page is reconsidered for pruning in future.
				 */
				heap_prune_record_prunable(prstate,
										   HeapTupleHeaderGetXmax(htup));
				break;

			case HEAPTUPLE_LIVE:
			case HEAPTUPLE_INSERT_IN_PROGRESS:

				/*
				 * If we wanted to optimize for aborts, we might consider
				 * marking the page prunable when we see INSERT_IN_PROGRESS.
				 * But we don't.  See related decisions about when to mark the
				 * page prunable in heapam.c.
				 */
				break;

			default:
				elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
				break;
		}

		/*
		 * Remember the last DEAD tuple seen.  We will advance past
		 * RECENTLY_DEAD tuples just in case there's a DEAD one after them;
		 * but we can't advance past anything else.  (XXX is it really worth
		 * continuing to scan beyond RECENTLY_DEAD?  The case where we will
		 * find another DEAD tuple is a fairly unusual corner case.)
		 */
		if (tupdead)
		{
			latestdead = offnum;
			HeapTupleHeaderAdvanceLatestRemovedXid(htup,
												 &prstate->latestRemovedXid);
		}
		else if (!recent_dead)
			break;

		/*
		 * If the tuple is not HOT-updated, then we are at the end of this
		 * HOT-update chain.
		 */
		if (!HeapTupleHeaderIsHotUpdated(htup))
			break;

		/*
		 * Advance to next chain member.
		 */
		Assert(ItemPointerGetBlockNumber(&htup->t_ctid) ==
			   BufferGetBlockNumber(buffer));
		offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
		priorXmax = HeapTupleHeaderGetXmax(htup);
	}

	/*
	 * If we found a DEAD tuple in the chain, adjust the HOT chain so that all
	 * the DEAD tuples at the start of the chain are removed and the root line
	 * pointer is appropriately redirected.
	 */
	if (OffsetNumberIsValid(latestdead))
	{
		/*
		 * Mark as unused each intermediate item that we are able to remove
		 * from the chain.
		 *
		 * When the previous item is the last dead tuple seen, we are at the
		 * right candidate for redirection.
		 */
		for (i = 1; (i < nchain) && (chainitems[i - 1] != latestdead); i++)
		{
			heap_prune_record_unused(prstate, chainitems[i]);
			ndeleted++;
		}

		/*
		 * If the root entry had been a normal tuple, we are deleting it, so
		 * count it in the result.  But changing a redirect (even to DEAD
		 * state) doesn't count.
		 */
		if (ItemIdIsNormal(rootlp))
			ndeleted++;

		/*
		 * If the DEAD tuple is at the end of the chain, the entire chain is
		 * dead and the root line pointer can be marked dead.  Otherwise just
		 * redirect the root to the correct chain member.
		 */
		if (i >= nchain)
			heap_prune_record_dead(prstate, rootoffnum);
		else
			heap_prune_record_redirect(prstate, rootoffnum, chainitems[i]);
	}
	else if (nchain < 2 && ItemIdIsRedirected(rootlp))
	{
		/*
		 * We found a redirect item that doesn't point to a valid follow-on
		 * item.  This can happen if the loop in heap_page_prune caused us to
		 * visit the dead successor of a redirect item before visiting the
		 * redirect item.  We can clean up by setting the redirect item to
		 * DEAD state.
		 */
		heap_prune_record_dead(prstate, rootoffnum);
	}

	return ndeleted;
}
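
/*
 * Editorial before/after example (illustrative, not part of the original
 * source): consider a chain whose root lp1 is a normal tuple, where lp1 and
 * lp2 are DEAD and lp3 is LIVE:
 *
 *		before:  lp1 (normal, DEAD) -> lp2 (normal, DEAD) -> lp3 (normal, LIVE)
 *
 * The scan records lp2 as now-unused, counts the root tuple as deleted, and
 * redirects lp1 to the first surviving member, so after
 * heap_page_prune_execute() the page has
 *
 *		after:   lp1 (LP_REDIRECT -> lp3), lp2 (LP_UNUSED), lp3 (normal, LIVE)
 *
 * Index entries keep pointing at lp1, which is why the root is redirected
 * rather than freed.
 */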

/* Record lowest soon-prunable XID */
static void
heap_prune_record_prunable(PruneState *prstate, TransactionId xid)
{
	/*
	 * This should exactly match the PageSetPrunable macro.  We can't store
	 * directly into the page header yet, so we update working state.
	 */
	Assert(TransactionIdIsNormal(xid));
	if (!TransactionIdIsValid(prstate->new_prune_xid) ||
		TransactionIdPrecedes(xid, prstate->new_prune_xid))
		prstate->new_prune_xid = xid;
}

/* Record item pointer to be redirected */
static void
heap_prune_record_redirect(PruneState *prstate,
						   OffsetNumber offnum, OffsetNumber rdoffnum)
{
	Assert(prstate->nredirected < MaxHeapTuplesPerPage);
	prstate->redirected[prstate->nredirected * 2] = offnum;
	prstate->redirected[prstate->nredirected * 2 + 1] = rdoffnum;
	prstate->nredirected++;
	Assert(!prstate->marked[offnum]);
	prstate->marked[offnum] = true;
	Assert(!prstate->marked[rdoffnum]);
	prstate->marked[rdoffnum] = true;
}

/* Record item pointer to be marked dead */
static void
heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum)
{
	Assert(prstate->ndead < MaxHeapTuplesPerPage);
	prstate->nowdead[prstate->ndead] = offnum;
	prstate->ndead++;
	Assert(!prstate->marked[offnum]);
	prstate->marked[offnum] = true;
}

/* Record item pointer to be marked unused */
static void
heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum)
{
	Assert(prstate->nunused < MaxHeapTuplesPerPage);
	prstate->nowunused[prstate->nunused] = offnum;
	prstate->nunused++;
	Assert(!prstate->marked[offnum]);
	prstate->marked[offnum] = true;
}


/*
 * Perform the actual page changes needed by heap_page_prune.
 * It is expected that the caller has suitable pin and lock on the
 * buffer, and is inside a critical section.
 *
 * This is split out because it is also used by heap_xlog_clean()
 * to replay the WAL record when needed after a crash.  Note that the
 * arguments are identical to those of log_heap_clean().
 */
void
heap_page_prune_execute(Buffer buffer,
						OffsetNumber *redirected, int nredirected,
						OffsetNumber *nowdead, int ndead,
						OffsetNumber *nowunused, int nunused)
{
	Page		page = (Page) BufferGetPage(buffer);
	OffsetNumber *offnum;
	int			i;

	/* Update all redirected line pointers */
	offnum = redirected;
	for (i = 0; i < nredirected; i++)
	{
		OffsetNumber fromoff = *offnum++;
		OffsetNumber tooff = *offnum++;
		ItemId		fromlp = PageGetItemId(page, fromoff);

		ItemIdSetRedirect(fromlp, tooff);
	}

	/* Update all now-dead line pointers */
	offnum = nowdead;
	for (i = 0; i < ndead; i++)
	{
		OffsetNumber off = *offnum++;
		ItemId		lp = PageGetItemId(page, off);

		ItemIdSetDead(lp);
	}

	/* Update all now-unused line pointers */
	offnum = nowunused;
	for (i = 0; i < nunused; i++)
	{
		OffsetNumber off = *offnum++;
		ItemId		lp = PageGetItemId(page, off);

		ItemIdSetUnused(lp);
	}

	/*
	 * Finally, repair any fragmentation, and update the page's hint bit about
	 * whether it has free pointers.
	 */
	PageRepairFragmentation(page);
}


/*
 * For all items in this page, find their respective root line pointers.
 * If item k is part of a HOT-chain with root at item j, then we set
 * root_offsets[k - 1] = j.
 *
 * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries.
 * We zero out all unused entries.
 *
 * The function must be called with at least share lock on the buffer, to
 * prevent concurrent prune operations.
 *
 * Note: The information collected here is valid only as long as the caller
 * holds a pin on the buffer. Once pin is released, a tuple might be pruned
 * and reused by a completely unrelated tuple.
 */
void
heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
{
	OffsetNumber offnum,
				maxoff;

	MemSet(root_offsets, 0, MaxHeapTuplesPerPage * sizeof(OffsetNumber));

	maxoff = PageGetMaxOffsetNumber(page);
	for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
	{
		ItemId		lp = PageGetItemId(page, offnum);
		HeapTupleHeader htup;
		OffsetNumber nextoffnum;
		TransactionId priorXmax;

		/* skip unused and dead items */
		if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp))
			continue;

		if (ItemIdIsNormal(lp))
		{
			htup = (HeapTupleHeader) PageGetItem(page, lp);

			/*
			 * Check if this tuple is part of a HOT-chain rooted at some other
			 * tuple. If so, skip it for now; we'll process it when we find
			 * its root.
			 */
			if (HeapTupleHeaderIsHeapOnly(htup))
				continue;

			/*
			 * This is either a plain tuple or the root of a HOT-chain.
			 * Remember it in the mapping.
			 */
			root_offsets[offnum - 1] = offnum;

			/* If it's not the start of a HOT-chain, we're done with it */
			if (!HeapTupleHeaderIsHotUpdated(htup))
				continue;

			/* Set up to scan the HOT-chain */
			nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
			priorXmax = HeapTupleHeaderGetXmax(htup);
		}
		else
		{
			/* Must be a redirect item. We do not set its root_offsets entry */
			Assert(ItemIdIsRedirected(lp));
			/* Set up to scan the HOT-chain */
			nextoffnum = ItemIdGetRedirect(lp);
			priorXmax = InvalidTransactionId;
		}

		/*
		 * Now follow the HOT-chain and collect other tuples in the chain.
		 *
		 * Note: Even though this is a nested loop, the complexity of the
		 * function is O(N) because each tuple in the page is visited at most
		 * twice: once in the outer loop and once in a HOT-chain chase.
		 */
		for (;;)
		{
			lp = PageGetItemId(page, nextoffnum);

			/* Check for broken chains */
			if (!ItemIdIsNormal(lp))
				break;

			htup = (HeapTupleHeader) PageGetItem(page, lp);

			if (TransactionIdIsValid(priorXmax) &&
				!TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup)))
				break;

			/* Remember the root line pointer for this item */
			root_offsets[nextoffnum - 1] = offnum;

			/* Advance to next chain member, if any */
			if (!HeapTupleHeaderIsHotUpdated(htup))
				break;

			nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
			priorXmax = HeapTupleHeaderGetXmax(htup);
		}
	}
}
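
/*
 * Editorial example (illustrative, not part of the original source): on a
 * page where item 1 is a plain tuple and item 2 is the root of the HOT chain
 * 2 -> 4 -> 5, this fills root_offsets as
 *
 *		root_offsets[0] = 1		(item 1 is its own root)
 *		root_offsets[1] = 2		(item 2 is its own root)
 *		root_offsets[3] = 2		(item 4's root is item 2)
 *		root_offsets[4] = 2		(item 5's root is item 2)
 *
 * and every other entry, including root_offsets[2] for unused item 3, stays
 * InvalidOffsetNumber (zero).
 */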