1 /*-------------------------------------------------------------------------
2  *
3  * vacuumlazy.c
4  *        Concurrent ("lazy") vacuuming.
5  *
6  *
7  * The major space usage for LAZY VACUUM is storage for the array of dead
8  * tuple TIDs, with the next biggest need being storage for per-disk-page
9  * free space info.  We want to ensure we can vacuum even the very largest
10  * relations with finite memory space usage.  To do that, we set upper bounds
11  * on the number of tuples and pages we will keep track of at once.
12  *
13  * We are willing to use at most maintenance_work_mem (or perhaps
14  * autovacuum_work_mem) memory space to keep track of dead tuples.  We
15  * initially allocate an array of TIDs of that size, with an upper limit that
16  * depends on table size (this limit ensures we don't allocate a huge area
17  * uselessly for vacuuming small tables).  If the array threatens to overflow,
18  * we suspend the heap scan phase and perform a pass of index cleanup and page
19  * compaction, then resume the heap scan with an empty TID array.
20  *
21  * If we're processing a table with no indexes, we can just vacuum each page
22  * as we go; there's no need to save up multiple tuples to minimize the number
23  * of index scans performed.  So we don't use maintenance_work_mem memory for
24  * the TID array, just enough to hold as many heap tuples as fit on one page.
25  *
26  *
27  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
28  * Portions Copyright (c) 1994, Regents of the University of California
29  *
30  *
31  * IDENTIFICATION
32  *        src/backend/commands/vacuumlazy.c
33  *
34  *-------------------------------------------------------------------------
35  */
36 #include "postgres.h"
37
38 #include <math.h>
39
40 #include "access/genam.h"
41 #include "access/heapam.h"
42 #include "access/heapam_xlog.h"
43 #include "access/htup_details.h"
44 #include "access/multixact.h"
45 #include "access/transam.h"
46 #include "access/visibilitymap.h"
47 #include "access/xlog.h"
48 #include "catalog/catalog.h"
49 #include "catalog/storage.h"
50 #include "commands/dbcommands.h"
51 #include "commands/vacuum.h"
52 #include "miscadmin.h"
53 #include "pgstat.h"
54 #include "portability/instr_time.h"
55 #include "postmaster/autovacuum.h"
56 #include "storage/bufmgr.h"
57 #include "storage/freespace.h"
58 #include "storage/lmgr.h"
59 #include "utils/lsyscache.h"
60 #include "utils/memutils.h"
61 #include "utils/pg_rusage.h"
62 #include "utils/timestamp.h"
63 #include "utils/tqual.h"
64
65
66 /*
67  * Space/time tradeoff parameters: do these need to be user-tunable?
68  *
69  * To consider truncating the relation, we want there to be at least
70  * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
71  * is less) potentially-freeable pages.
72  */
73 #define REL_TRUNCATE_MINIMUM    1000
74 #define REL_TRUNCATE_FRACTION   16
75
76 /*
77  * Timing parameters for truncate locking heuristics.
78  *
79  * These were not exposed as user tunable GUC values because it didn't seem
80  * that the potential for improvement was great enough to merit the cost of
81  * supporting them.
82  */
83 #define VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL             20              /* ms */
84 #define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL              50              /* ms */
85 #define VACUUM_TRUNCATE_LOCK_TIMEOUT                    5000    /* ms */
86
87 /*
88  * Guesstimation of number of dead tuples per page.  This is used to
89  * provide an upper limit to memory allocated when vacuuming small
90  * tables.
91  */
92 #define LAZY_ALLOC_TUPLES               MaxHeapTuplesPerPage
93
94 /*
95  * Before we consider skipping a page that's marked as clean in
96  * visibility map, we must've seen at least this many clean pages.
97  */
98 #define SKIP_PAGES_THRESHOLD    ((BlockNumber) 32)
99
100 typedef struct LVRelStats
101 {
102         /* hasindex = true means two-pass strategy; false means one-pass */
103         bool            hasindex;
104         /* Overall statistics about rel */
105         BlockNumber old_rel_pages;      /* previous value of pg_class.relpages */
106         BlockNumber rel_pages;          /* total number of pages */
107         BlockNumber scanned_pages;      /* number of pages we examined */
108         BlockNumber pinskipped_pages;           /* # of pages we skipped due to a pin */
109         double          scanned_tuples; /* counts only tuples on scanned pages */
110         double          old_rel_tuples; /* previous value of pg_class.reltuples */
111         double          new_rel_tuples; /* new estimated total # of tuples */
112         double          new_dead_tuples;        /* new estimated total # of dead tuples */
113         BlockNumber pages_removed;
114         double          tuples_deleted;
115         BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
116         /* List of TIDs of tuples we intend to delete */
117         /* NB: this list is ordered by TID address */
118         int                     num_dead_tuples;        /* current # of entries */
119         int                     max_dead_tuples;        /* # slots allocated in array */
120         ItemPointer dead_tuples;        /* array of ItemPointerData */
121         int                     num_index_scans;
122         TransactionId latestRemovedXid;
123         bool            lock_waiter_detected;
124 } LVRelStats;
125
126
127 /* A few variables that don't seem worth passing around as parameters */
128 static int      elevel = -1;
129
130 static TransactionId OldestXmin;
131 static TransactionId FreezeLimit;
132 static MultiXactId MultiXactCutoff;
133
134 static BufferAccessStrategy vac_strategy;
135
136
137 /* non-export function prototypes */
138 static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
139                            Relation *Irel, int nindexes, bool scan_all);
140 static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats);
141 static bool lazy_check_needs_freeze(Buffer buf, bool *hastup);
142 static void lazy_vacuum_index(Relation indrel,
143                                   IndexBulkDeleteResult **stats,
144                                   LVRelStats *vacrelstats);
145 static void lazy_cleanup_index(Relation indrel,
146                                    IndexBulkDeleteResult *stats,
147                                    LVRelStats *vacrelstats);
148 static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
149                                  int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer);
150 static bool should_attempt_truncation(LVRelStats *vacrelstats);
151 static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats);
152 static BlockNumber count_nondeletable_pages(Relation onerel,
153                                                  LVRelStats *vacrelstats);
154 static void lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks);
155 static void lazy_record_dead_tuple(LVRelStats *vacrelstats,
156                                            ItemPointer itemptr);
157 static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
158 static int      vac_cmp_itemptr(const void *left, const void *right);
159 static bool heap_page_is_all_visible(Relation rel, Buffer buf,
160                                                  TransactionId *visibility_cutoff_xid);
161
162
163 /*
164  *      lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation
165  *
166  *              This routine vacuums a single heap, cleans out its indexes, and
167  *              updates its relpages and reltuples statistics.
168  *
169  *              At entry, we have already established a transaction and opened
170  *              and locked the relation.
171  */
172 void
173 lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params,
174                                 BufferAccessStrategy bstrategy)
175 {
176         LVRelStats *vacrelstats;
177         Relation   *Irel;
178         int                     nindexes;
179         PGRUsage        ru0;
180         TimestampTz starttime = 0;
181         long            secs;
182         int                     usecs;
183         double          read_rate,
184                                 write_rate;
185         bool            scan_all;               /* should we scan all pages? */
186         bool            scanned_all;    /* did we actually scan all pages? */
187         TransactionId xidFullScanLimit;
188         MultiXactId mxactFullScanLimit;
189         BlockNumber new_rel_pages;
190         double          new_rel_tuples;
191         BlockNumber new_rel_allvisible;
192         double          new_live_tuples;
193         TransactionId new_frozen_xid;
194         MultiXactId new_min_multi;
195
196         Assert(params != NULL);
197
198         /* measure elapsed time iff autovacuum logging requires it */
199         if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
200         {
201                 pg_rusage_init(&ru0);
202                 starttime = GetCurrentTimestamp();
203         }
204
205         if (options & VACOPT_VERBOSE)
206                 elevel = INFO;
207         else
208                 elevel = DEBUG2;
209
210         vac_strategy = bstrategy;
211
212         vacuum_set_xid_limits(onerel,
213                                                   params->freeze_min_age,
214                                                   params->freeze_table_age,
215                                                   params->multixact_freeze_min_age,
216                                                   params->multixact_freeze_table_age,
217                                                   &OldestXmin, &FreezeLimit, &xidFullScanLimit,
218                                                   &MultiXactCutoff, &mxactFullScanLimit);
219
220         /*
221          * We request a full scan if either the table's frozen Xid is now older
222          * than or equal to the requested Xid full-table scan limit; or if the
223          * table's minimum MultiXactId is older than or equal to the requested
224          * mxid full-table scan limit.
225          */
226         scan_all = TransactionIdPrecedesOrEquals(onerel->rd_rel->relfrozenxid,
227                                                                                          xidFullScanLimit);
228         scan_all |= MultiXactIdPrecedesOrEquals(onerel->rd_rel->relminmxid,
229                                                                                         mxactFullScanLimit);
230
231         vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));
232
233         vacrelstats->old_rel_pages = onerel->rd_rel->relpages;
234         vacrelstats->old_rel_tuples = onerel->rd_rel->reltuples;
235         vacrelstats->num_index_scans = 0;
236         vacrelstats->pages_removed = 0;
237         vacrelstats->lock_waiter_detected = false;
238
239         /* Open all indexes of the relation */
240         vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel);
241         vacrelstats->hasindex = (nindexes > 0);
242
243         /* Do the vacuuming */
244         lazy_scan_heap(onerel, vacrelstats, Irel, nindexes, scan_all);
245
246         /* Done with indexes */
247         vac_close_indexes(nindexes, Irel, NoLock);
248
249         /*
250          * Compute whether we actually scanned the whole relation. If we did, we
251          * can adjust relfrozenxid and relminmxid.
252          *
253          * NB: We need to check this before truncating the relation, because that
254          * will change ->rel_pages.
255          */
256         if (vacrelstats->scanned_pages < vacrelstats->rel_pages)
257         {
258                 Assert(!scan_all);
259                 scanned_all = false;
260         }
261         else
262                 scanned_all = true;
263
264         /*
265          * Optionally truncate the relation.
266          */
267         if (should_attempt_truncation(vacrelstats))
268                 lazy_truncate_heap(onerel, vacrelstats);
269
270         /* Vacuum the Free Space Map */
271         FreeSpaceMapVacuum(onerel);
272
273         /*
274          * Update statistics in pg_class.
275          *
276          * A corner case here is that if we scanned no pages at all because every
277          * page is all-visible, we should not update relpages/reltuples, because
278          * we have no new information to contribute.  In particular this keeps us
279          * from replacing relpages=reltuples=0 (which means "unknown tuple
280          * density") with nonzero relpages and reltuples=0 (which means "zero
281          * tuple density") unless there's some actual evidence for the latter.
282          *
283          * We do update relallvisible even in the corner case, since if the table
284          * is all-visible we'd definitely like to know that.  But clamp the value
285          * to be not more than what we're setting relpages to.
286          *
287          * Also, don't change relfrozenxid/relminmxid if we skipped any pages,
288          * since then we don't know for certain that all tuples have a newer xmin.
289          */
290         new_rel_pages = vacrelstats->rel_pages;
291         new_rel_tuples = vacrelstats->new_rel_tuples;
292         if (vacrelstats->scanned_pages == 0 && new_rel_pages > 0)
293         {
294                 new_rel_pages = vacrelstats->old_rel_pages;
295                 new_rel_tuples = vacrelstats->old_rel_tuples;
296         }
297
298         new_rel_allvisible = visibilitymap_count(onerel);
299         if (new_rel_allvisible > new_rel_pages)
300                 new_rel_allvisible = new_rel_pages;
301
302         new_frozen_xid = scanned_all ? FreezeLimit : InvalidTransactionId;
303         new_min_multi = scanned_all ? MultiXactCutoff : InvalidMultiXactId;
304
305         vac_update_relstats(onerel,
306                                                 new_rel_pages,
307                                                 new_rel_tuples,
308                                                 new_rel_allvisible,
309                                                 vacrelstats->hasindex,
310                                                 new_frozen_xid,
311                                                 new_min_multi,
312                                                 false);
313
314         /* report results to the stats collector, too */
315         new_live_tuples = new_rel_tuples - vacrelstats->new_dead_tuples;
316         if (new_live_tuples < 0)
317                 new_live_tuples = 0;    /* just in case */
318
319         pgstat_report_vacuum(RelationGetRelid(onerel),
320                                                  onerel->rd_rel->relisshared,
321                                                  new_live_tuples,
322                                                  vacrelstats->new_dead_tuples);
323
324         /* and log the action if appropriate */
325         if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
326         {
327                 TimestampTz endtime = GetCurrentTimestamp();
328
329                 if (params->log_min_duration == 0 ||
330                         TimestampDifferenceExceeds(starttime, endtime,
331                                                                            params->log_min_duration))
332                 {
333                         StringInfoData buf;
334
335                         TimestampDifference(starttime, endtime, &secs, &usecs);
336
337                         read_rate = 0;
338                         write_rate = 0;
339                         if ((secs > 0) || (usecs > 0))
340                         {
341                                 read_rate = (double) BLCKSZ *VacuumPageMiss / (1024 * 1024) /
342                                                         (secs + usecs / 1000000.0);
343                                 write_rate = (double) BLCKSZ *VacuumPageDirty / (1024 * 1024) /
344                                                         (secs + usecs / 1000000.0);
345                         }
346
347                         /*
348                          * This is pretty messy, but we split it up so that we can skip
349                          * emitting individual parts of the message when not applicable.
350                          */
351                         initStringInfo(&buf);
352                         appendStringInfo(&buf, _("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n"),
353                                                          get_database_name(MyDatabaseId),
354                                                          get_namespace_name(RelationGetNamespace(onerel)),
355                                                          RelationGetRelationName(onerel),
356                                                          vacrelstats->num_index_scans);
357                         appendStringInfo(&buf, _("pages: %u removed, %u remain, %u skipped due to pins\n"),
358                                                          vacrelstats->pages_removed,
359                                                          vacrelstats->rel_pages,
360                                                          vacrelstats->pinskipped_pages);
361                         appendStringInfo(&buf,
362                                                          _("tuples: %.0f removed, %.0f remain, %.0f are dead but not yet removable\n"),
363                                                          vacrelstats->tuples_deleted,
364                                                          vacrelstats->new_rel_tuples,
365                                                          vacrelstats->new_dead_tuples);
366                         appendStringInfo(&buf,
367                                                  _("buffer usage: %d hits, %d misses, %d dirtied\n"),
368                                                          VacuumPageHit,
369                                                          VacuumPageMiss,
370                                                          VacuumPageDirty);
371                         appendStringInfo(&buf, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"),
372                                                          read_rate, write_rate);
373                         appendStringInfo(&buf, _("system usage: %s"), pg_rusage_show(&ru0));
374
375                         ereport(LOG,
376                                         (errmsg_internal("%s", buf.data)));
377                         pfree(buf.data);
378                 }
379         }
380 }
381
382 /*
383  * For Hot Standby we need to know the highest transaction id that will
384  * be removed by any change. VACUUM proceeds in a number of passes so
385  * we need to consider how each pass operates. The first phase runs
386  * heap_page_prune(), which can issue XLOG_HEAP2_CLEAN records as it
387  * progresses - these will have a latestRemovedXid on each record.
388  * In some cases this removes all of the tuples to be removed, though
389  * often we have dead tuples with index pointers so we must remember them
390  * for removal in phase 3. Index records for those rows are removed
391  * in phase 2 and index blocks do not have MVCC information attached.
392  * So before we can allow removal of any index tuples we need to issue
393  * a WAL record containing the latestRemovedXid of rows that will be
394  * removed in phase three. This allows recovery queries to block at the
395  * correct place, i.e. before phase two, rather than during phase three
396  * which would be after the rows have become inaccessible.
397  */
398 static void
399 vacuum_log_cleanup_info(Relation rel, LVRelStats *vacrelstats)
400 {
401         /*
402          * Skip this for relations for which no WAL is to be written, or if we're
403          * not trying to support archive recovery.
404          */
405         if (!RelationNeedsWAL(rel) || !XLogIsNeeded())
406                 return;
407
408         /*
409          * No need to write the record at all unless it contains a valid value
410          */
411         if (TransactionIdIsValid(vacrelstats->latestRemovedXid))
412                 (void) log_heap_cleanup_info(rel->rd_node, vacrelstats->latestRemovedXid);
413 }
414
415 /*
416  *      lazy_scan_heap() -- scan an open heap relation
417  *
418  *              This routine prunes each page in the heap, which will among other
419  *              things truncate dead tuples to dead line pointers, defragment the
420  *              page, and set commit status bits (see heap_page_prune).  It also builds
421  *              lists of dead tuples and pages with free space, calculates statistics
422  *              on the number of live tuples in the heap, and marks pages as
423  *              all-visible if appropriate.  When done, or when we run low on space for
424  *              dead-tuple TIDs, invoke vacuuming of indexes and call lazy_vacuum_heap
425  *              to reclaim dead line pointers.
426  *
427  *              If there are no indexes then we can reclaim line pointers on the fly;
428  *              dead line pointers need only be retained until all index pointers that
429  *              reference them have been killed.
430  */
431 static void
432 lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
433                            Relation *Irel, int nindexes, bool scan_all)
434 {
435         BlockNumber nblocks,
436                                 blkno;
437         HeapTupleData tuple;
438         char       *relname;
439         BlockNumber empty_pages,
440                                 vacuumed_pages;
441         double          num_tuples,
442                                 tups_vacuumed,
443                                 nkeep,
444                                 nunused;
445         IndexBulkDeleteResult **indstats;
446         int                     i;
447         PGRUsage        ru0;
448         Buffer          vmbuffer = InvalidBuffer;
449         BlockNumber next_not_all_visible_block;
450         bool            skipping_all_visible_blocks;
451         xl_heap_freeze_tuple *frozen;
452         StringInfoData buf;
453
454         pg_rusage_init(&ru0);
455
456         relname = RelationGetRelationName(onerel);
457         ereport(elevel,
458                         (errmsg("vacuuming \"%s.%s\"",
459                                         get_namespace_name(RelationGetNamespace(onerel)),
460                                         relname)));
461
462         empty_pages = vacuumed_pages = 0;
463         num_tuples = tups_vacuumed = nkeep = nunused = 0;
464
465         indstats = (IndexBulkDeleteResult **)
466                 palloc0(nindexes * sizeof(IndexBulkDeleteResult *));
467
468         nblocks = RelationGetNumberOfBlocks(onerel);
469         vacrelstats->rel_pages = nblocks;
470         vacrelstats->scanned_pages = 0;
471         vacrelstats->nonempty_pages = 0;
472         vacrelstats->latestRemovedXid = InvalidTransactionId;
473
474         lazy_space_alloc(vacrelstats, nblocks);
475         frozen = palloc(sizeof(xl_heap_freeze_tuple) * MaxHeapTuplesPerPage);
476
477         /*
478          * We want to skip pages that don't require vacuuming according to the
479          * visibility map, but only when we can skip at least SKIP_PAGES_THRESHOLD
480          * consecutive pages.  Since we're reading sequentially, the OS should be
481          * doing readahead for us, so there's no gain in skipping a page now and
482          * then; that's likely to disable readahead and so be counterproductive.
483          * Also, skipping even a single page means that we can't update
484          * relfrozenxid, so we only want to do it if we can skip a goodly number
485          * of pages.
486          *
487          * Before entering the main loop, establish the invariant that
488          * next_not_all_visible_block is the next block number >= blkno that's not
489          * all-visible according to the visibility map, or nblocks if there's no
490          * such block.  Also, we set up the skipping_all_visible_blocks flag,
491          * which is needed because we need hysteresis in the decision: once we've
492          * started skipping blocks, we may as well skip everything up to the next
493          * not-all-visible block.
494          *
495          * Note: if scan_all is true, we won't actually skip any pages; but we
496          * maintain next_not_all_visible_block anyway, so as to set up the
497          * all_visible_according_to_vm flag correctly for each page.
498          *
499          * Note: The value returned by visibilitymap_test could be slightly
500          * out-of-date, since we make this test before reading the corresponding
501          * heap page or locking the buffer.  This is OK.  If we mistakenly think
502          * that the page is all-visible when in fact the flag's just been cleared,
503          * we might fail to vacuum the page.  But it's OK to skip pages when
504          * scan_all is not set, so no great harm done; the next vacuum will find
505          * them.  If we make the reverse mistake and vacuum a page unnecessarily,
506          * it'll just be a no-op.
507          *
508          * We will scan the table's last page, at least to the extent of
509          * determining whether it has tuples or not, even if it should be skipped
510          * according to the above rules; except when we've already determined that
511          * it's not worth trying to truncate the table.  This avoids having
512          * lazy_truncate_heap() take access-exclusive lock on the table to attempt
513          * a truncation that just fails immediately because there are tuples in
514          * the last page.  This is worth avoiding mainly because such a lock must
515          * be replayed on any hot standby, where it can be disruptive.
516          */
517         for (next_not_all_visible_block = 0;
518                  next_not_all_visible_block < nblocks;
519                  next_not_all_visible_block++)
520         {
521                 if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer))
522                         break;
523                 vacuum_delay_point();
524         }
525         if (next_not_all_visible_block >= SKIP_PAGES_THRESHOLD)
526                 skipping_all_visible_blocks = true;
527         else
528                 skipping_all_visible_blocks = false;
529
530         for (blkno = 0; blkno < nblocks; blkno++)
531         {
532                 Buffer          buf;
533                 Page            page;
534                 OffsetNumber offnum,
535                                         maxoff;
536                 bool            tupgone,
537                                         hastup;
538                 int                     prev_dead_count;
539                 int                     nfrozen;
540                 Size            freespace;
541                 bool            all_visible_according_to_vm;
542                 bool            all_visible;
543                 bool            has_dead_tuples;
544                 TransactionId visibility_cutoff_xid = InvalidTransactionId;
545
546                 /* see note above about forcing scanning of last page */
547 #define FORCE_CHECK_PAGE() \
548                 (blkno == nblocks - 1 && should_attempt_truncation(vacrelstats))
549
550                 if (blkno == next_not_all_visible_block)
551                 {
552                         /* Time to advance next_not_all_visible_block */
553                         for (next_not_all_visible_block++;
554                                  next_not_all_visible_block < nblocks;
555                                  next_not_all_visible_block++)
556                         {
557                                 if (!visibilitymap_test(onerel, next_not_all_visible_block,
558                                                                                 &vmbuffer))
559                                         break;
560                                 vacuum_delay_point();
561                         }
562
563                         /*
564                          * We know we can't skip the current block.  But set up
565                          * skipping_all_visible_blocks to do the right thing at the
566                          * following blocks.
567                          */
568                         if (next_not_all_visible_block - blkno > SKIP_PAGES_THRESHOLD)
569                                 skipping_all_visible_blocks = true;
570                         else
571                                 skipping_all_visible_blocks = false;
572                         all_visible_according_to_vm = false;
573                 }
574                 else
575                 {
576                         /* Current block is all-visible */
577                         if (skipping_all_visible_blocks && !scan_all && !FORCE_CHECK_PAGE())
578                                 continue;
579                         all_visible_according_to_vm = true;
580                 }
581
582                 vacuum_delay_point();
583
584                 /*
585                  * If we are close to overrunning the available space for dead-tuple
586                  * TIDs, pause and do a cycle of vacuuming before we tackle this page.
587                  */
588                 if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
589                         vacrelstats->num_dead_tuples > 0)
590                 {
591                         /*
592                          * Before beginning index vacuuming, we release any pin we may
593                          * hold on the visibility map page.  This isn't necessary for
594                          * correctness, but we do it anyway to avoid holding the pin
595                          * across a lengthy, unrelated operation.
596                          */
597                         if (BufferIsValid(vmbuffer))
598                         {
599                                 ReleaseBuffer(vmbuffer);
600                                 vmbuffer = InvalidBuffer;
601                         }
602
603                         /* Log cleanup info before we touch indexes */
604                         vacuum_log_cleanup_info(onerel, vacrelstats);
605
606                         /* Remove index entries */
607                         for (i = 0; i < nindexes; i++)
608                                 lazy_vacuum_index(Irel[i],
609                                                                   &indstats[i],
610                                                                   vacrelstats);
611                         /* Remove tuples from heap */
612                         lazy_vacuum_heap(onerel, vacrelstats);
613
614                         /*
615                          * Forget the now-vacuumed tuples, and press on, but be careful
616                          * not to reset latestRemovedXid since we want that value to be
617                          * valid.
618                          */
619                         vacrelstats->num_dead_tuples = 0;
620                         vacrelstats->num_index_scans++;
621                 }
622
623                 /*
624                  * Pin the visibility map page in case we need to mark the page
625                  * all-visible.  In most cases this will be very cheap, because we'll
626                  * already have the correct page pinned anyway.  However, it's
627                  * possible that (a) next_not_all_visible_block is covered by a
628                  * different VM page than the current block or (b) we released our pin
629                  * and did a cycle of index vacuuming.
630                  */
631                 visibilitymap_pin(onerel, blkno, &vmbuffer);
632
633                 buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno,
634                                                                  RBM_NORMAL, vac_strategy);
635
636                 /* We need buffer cleanup lock so that we can prune HOT chains. */
637                 if (!ConditionalLockBufferForCleanup(buf))
638                 {
639                         /*
640                          * If we're not scanning the whole relation to guard against XID
641                          * wraparound, and we don't want to forcibly check the page, then
642                          * it's OK to skip vacuuming pages we get a lock conflict on. They
643                          * will be dealt with in some future vacuum.
644                          */
645                         if (!scan_all && !FORCE_CHECK_PAGE())
646                         {
647                                 ReleaseBuffer(buf);
648                                 vacrelstats->pinskipped_pages++;
649                                 continue;
650                         }
651
652                         /*
653                          * Read the page with share lock to see if any xids on it need to
654                          * be frozen.  If not we just skip the page, after updating our
655                          * scan statistics.  If there are some, we wait for cleanup lock.
656                          *
657                          * We could defer the lock request further by remembering the page
658                          * and coming back to it later, or we could even register
659                          * ourselves for multiple buffers and then service whichever one
660                          * is received first.  For now, this seems good enough.
661                          *
662                          * If we get here with scan_all false, then we're just forcibly
663                          * checking the page, and so we don't want to insist on getting
664                          * the lock; we only need to know if the page contains tuples, so
665                          * that we can update nonempty_pages correctly.  It's convenient
666                          * to use lazy_check_needs_freeze() for both situations, though.
667                          */
668                         LockBuffer(buf, BUFFER_LOCK_SHARE);
669                         if (!lazy_check_needs_freeze(buf, &hastup))
670                         {
671                                 UnlockReleaseBuffer(buf);
672                                 vacrelstats->scanned_pages++;
673                                 vacrelstats->pinskipped_pages++;
674                                 if (hastup)
675                                         vacrelstats->nonempty_pages = blkno + 1;
676                                 continue;
677                         }
678                         if (!scan_all)
679                         {
680                                 /*
681                                  * Here, we must not advance scanned_pages; that would amount
682                                  * to claiming that the page contains no freezable tuples.
683                                  */
684                                 UnlockReleaseBuffer(buf);
685                                 vacrelstats->pinskipped_pages++;
686                                 if (hastup)
687                                         vacrelstats->nonempty_pages = blkno + 1;
688                                 continue;
689                         }
690                         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
691                         LockBufferForCleanup(buf);
692                         /* drop through to normal processing */
693                 }
694
695                 vacrelstats->scanned_pages++;
696
697                 page = BufferGetPage(buf);
698
699                 if (PageIsNew(page))
700                 {
701                         /*
702                          * An all-zeroes page could be left over if a backend extends the
703                          * relation but crashes before initializing the page. Reclaim such
704                          * pages for use.
705                          *
706                          * We have to be careful here because we could be looking at a
707                          * page that someone has just added to the relation and not yet
708                          * been able to initialize (see RelationGetBufferForTuple). To
709                          * protect against that, release the buffer lock, grab the
710                          * relation extension lock momentarily, and re-lock the buffer. If
711                          * the page is still uninitialized by then, it must be left over
712                          * from a crashed backend, and we can initialize it.
713                          *
714                          * We don't really need the relation lock when this is a new or
715                          * temp relation, but it's probably not worth the code space to
716                          * check that, since this surely isn't a critical path.
717                          *
718                          * Note: the comparable code in vacuum.c need not worry because
719                          * it's got exclusive lock on the whole relation.
720                          */
721                         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
722                         LockRelationForExtension(onerel, ExclusiveLock);
723                         UnlockRelationForExtension(onerel, ExclusiveLock);
724                         LockBufferForCleanup(buf);
725                         if (PageIsNew(page))
726                         {
727                                 ereport(WARNING,
728                                 (errmsg("relation \"%s\" page %u is uninitialized --- fixing",
729                                                 relname, blkno)));
730                                 PageInit(page, BufferGetPageSize(buf), 0);
731                                 empty_pages++;
732                         }
733                         freespace = PageGetHeapFreeSpace(page);
734                         MarkBufferDirty(buf);
735                         UnlockReleaseBuffer(buf);
736
737                         RecordPageWithFreeSpace(onerel, blkno, freespace);
738                         continue;
739                 }
740
741                 if (PageIsEmpty(page))
742                 {
743                         empty_pages++;
744                         freespace = PageGetHeapFreeSpace(page);
745
746                         /* empty pages are always all-visible */
747                         if (!PageIsAllVisible(page))
748                         {
749                                 START_CRIT_SECTION();
750
751                                 /* mark buffer dirty before writing a WAL record */
752                                 MarkBufferDirty(buf);
753
754                                 /*
755                                  * It's possible that another backend has extended the heap,
756                                  * initialized the page, and then failed to WAL-log the page
757                                  * due to an ERROR.  Since heap extension is not WAL-logged,
758                                  * recovery might try to replay our record setting the page
759                                  * all-visible and find that the page isn't initialized, which
760                                  * will cause a PANIC.  To prevent that, check whether the
761                                  * page has been previously WAL-logged, and if not, do that
762                                  * now.
763                                  */
764                                 if (RelationNeedsWAL(onerel) &&
765                                         PageGetLSN(page) == InvalidXLogRecPtr)
766                                         log_newpage_buffer(buf, true);
767
768                                 PageSetAllVisible(page);
769                                 visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
770                                                                   vmbuffer, InvalidTransactionId);
771                                 END_CRIT_SECTION();
772                         }
773
774                         UnlockReleaseBuffer(buf);
775                         RecordPageWithFreeSpace(onerel, blkno, freespace);
776                         continue;
777                 }
778
779                 /*
780                  * Prune all HOT-update chains in this page.
781                  *
782                  * We count tuples removed by the pruning step as removed by VACUUM.
783                  */
784                 tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false,
785                                                                                  &vacrelstats->latestRemovedXid);
786
787                 /*
788                  * Now scan the page to collect vacuumable items and check for tuples
789                  * requiring freezing.
790                  */
791                 all_visible = true;
792                 has_dead_tuples = false;
793                 nfrozen = 0;
794                 hastup = false;
795                 prev_dead_count = vacrelstats->num_dead_tuples;
796                 maxoff = PageGetMaxOffsetNumber(page);
797
798                 /*
799                  * Note: If you change anything in the loop below, also look at
800                  * heap_page_is_all_visible to see if that needs to be changed.
801                  */
802                 for (offnum = FirstOffsetNumber;
803                          offnum <= maxoff;
804                          offnum = OffsetNumberNext(offnum))
805                 {
806                         ItemId          itemid;
807
808                         itemid = PageGetItemId(page, offnum);
809
810                         /* Unused items require no processing, but we count 'em */
811                         if (!ItemIdIsUsed(itemid))
812                         {
813                                 nunused += 1;
814                                 continue;
815                         }
816
817                         /* Redirect items mustn't be touched */
818                         if (ItemIdIsRedirected(itemid))
819                         {
820                                 hastup = true;  /* this page won't be truncatable */
821                                 continue;
822                         }
823
824                         ItemPointerSet(&(tuple.t_self), blkno, offnum);
825
826                         /*
827                          * DEAD item pointers are to be vacuumed normally; but we don't
828                          * count them in tups_vacuumed, else we'd be double-counting (at
829                          * least in the common case where heap_page_prune() just freed up
830                          * a non-HOT tuple).
831                          */
832                         if (ItemIdIsDead(itemid))
833                         {
834                                 lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
835                                 all_visible = false;
836                                 continue;
837                         }
838
839                         Assert(ItemIdIsNormal(itemid));
840
841                         tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
842                         tuple.t_len = ItemIdGetLength(itemid);
843                         tuple.t_tableOid = RelationGetRelid(onerel);
844
845                         tupgone = false;
846
847                         switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf))
848                         {
849                                 case HEAPTUPLE_DEAD:
850
851                                         /*
852                                          * Ordinarily, DEAD tuples would have been removed by
853                                          * heap_page_prune(), but it's possible that the tuple
854                                          * state changed since heap_page_prune() looked.  In
855                                          * particular an INSERT_IN_PROGRESS tuple could have
856                                          * changed to DEAD if the inserter aborted.  So this
857                                          * cannot be considered an error condition.
858                                          *
859                                          * If the tuple is HOT-updated then it must only be
860                                          * removed by a prune operation; so we keep it just as if
861                                          * it were RECENTLY_DEAD.  Also, if it's a heap-only
862                                          * tuple, we choose to keep it, because it'll be a lot
863                                          * cheaper to get rid of it in the next pruning pass than
864                                          * to treat it like an indexed tuple.
865                                          */
866                                         if (HeapTupleIsHotUpdated(&tuple) ||
867                                                 HeapTupleIsHeapOnly(&tuple))
868                                                 nkeep += 1;
869                                         else
870                                                 tupgone = true; /* we can delete the tuple */
871                                         all_visible = false;
872                                         break;
873                                 case HEAPTUPLE_LIVE:
874                                         /* Tuple is good --- but let's do some validity checks */
875                                         if (onerel->rd_rel->relhasoids &&
876                                                 !OidIsValid(HeapTupleGetOid(&tuple)))
877                                                 elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid",
878                                                          relname, blkno, offnum);
879
880                                         /*
881                                          * Is the tuple definitely visible to all transactions?
882                                          *
883                                          * NB: Like with per-tuple hint bits, we can't set the
884                                          * PD_ALL_VISIBLE flag if the inserter committed
885                                          * asynchronously. See SetHintBits for more info. Check
886                                          * that the tuple is hinted xmin-committed because of
887                                          * that.
888                                          */
889                                         if (all_visible)
890                                         {
891                                                 TransactionId xmin;
892
893                                                 if (!HeapTupleHeaderXminCommitted(tuple.t_data))
894                                                 {
895                                                         all_visible = false;
896                                                         break;
897                                                 }
898
899                                                 /*
900                                                  * The inserter definitely committed. But is it old
901                                                  * enough that everyone sees it as committed?
902                                                  */
903                                                 xmin = HeapTupleHeaderGetXmin(tuple.t_data);
904                                                 if (!TransactionIdPrecedes(xmin, OldestXmin))
905                                                 {
906                                                         all_visible = false;
907                                                         break;
908                                                 }
909
910                                                 /* Track newest xmin on page. */
911                                                 if (TransactionIdFollows(xmin, visibility_cutoff_xid))
912                                                         visibility_cutoff_xid = xmin;
913                                         }
914                                         break;
915                                 case HEAPTUPLE_RECENTLY_DEAD:
916
917                                         /*
918                                          * If tuple is recently deleted then we must not remove it
919                                          * from relation.
920                                          */
921                                         nkeep += 1;
922                                         all_visible = false;
923                                         break;
924                                 case HEAPTUPLE_INSERT_IN_PROGRESS:
925                                         /* This is an expected case during concurrent vacuum */
926                                         all_visible = false;
927                                         break;
928                                 case HEAPTUPLE_DELETE_IN_PROGRESS:
929                                         /* This is an expected case during concurrent vacuum */
930                                         all_visible = false;
931                                         break;
932                                 default:
933                                         elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
934                                         break;
935                         }
936
937                         if (tupgone)
938                         {
939                                 lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
940                                 HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data,
941                                                                                          &vacrelstats->latestRemovedXid);
942                                 tups_vacuumed += 1;
943                                 has_dead_tuples = true;
944                         }
945                         else
946                         {
947                                 num_tuples += 1;
948                                 hastup = true;
949
950                                 /*
951                                  * Each non-removable tuple must be checked to see if it needs
952                                  * freezing.  Note we already have exclusive buffer lock.
953                                  */
954                                 if (heap_prepare_freeze_tuple(tuple.t_data, FreezeLimit,
955                                                                                   MultiXactCutoff, &frozen[nfrozen]))
956                                         frozen[nfrozen++].offset = offnum;
957                         }
958                 }                                               /* scan along page */
959
960                 /*
961                  * If we froze any tuples, mark the buffer dirty, and write a WAL
962                  * record recording the changes.  We must log the changes to be
963                  * crash-safe against future truncation of CLOG.
964                  */
965                 if (nfrozen > 0)
966                 {
967                         START_CRIT_SECTION();
968
969                         MarkBufferDirty(buf);
970
971                         /* execute collected freezes */
972                         for (i = 0; i < nfrozen; i++)
973                         {
974                                 ItemId          itemid;
975                                 HeapTupleHeader htup;
976
977                                 itemid = PageGetItemId(page, frozen[i].offset);
978                                 htup = (HeapTupleHeader) PageGetItem(page, itemid);
979
980                                 heap_execute_freeze_tuple(htup, &frozen[i]);
981                         }
982
983                         /* Now WAL-log freezing if necessary */
984                         if (RelationNeedsWAL(onerel))
985                         {
986                                 XLogRecPtr      recptr;
987
988                                 recptr = log_heap_freeze(onerel, buf, FreezeLimit,
989                                                                                  frozen, nfrozen);
990                                 PageSetLSN(page, recptr);
991                         }
992
993                         END_CRIT_SECTION();
994                 }
995
996                 /*
997                  * If there are no indexes then we can vacuum the page right now
998                  * instead of doing a second scan.
999                  */
1000                 if (nindexes == 0 &&
1001                         vacrelstats->num_dead_tuples > 0)
1002                 {
1003                         /* Remove tuples from heap */
1004                         lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats, &vmbuffer);
1005                         has_dead_tuples = false;
1006
1007                         /*
1008                          * Forget the now-vacuumed tuples, and press on, but be careful
1009                          * not to reset latestRemovedXid since we want that value to be
1010                          * valid.
1011                          */
1012                         vacrelstats->num_dead_tuples = 0;
1013                         vacuumed_pages++;
1014                 }
1015
1016                 freespace = PageGetHeapFreeSpace(page);
1017
1018                 /* mark page all-visible, if appropriate */
1019                 if (all_visible && !all_visible_according_to_vm)
1020                 {
1021                         /*
1022                          * It should never be the case that the visibility map page is set
1023                          * while the page-level bit is clear, but the reverse is allowed
1024                          * (if checksums are not enabled).  Regardless, set the both bits
1025                          * so that we get back in sync.
1026                          *
1027                          * NB: If the heap page is all-visible but the VM bit is not set,
1028                          * we don't need to dirty the heap page.  However, if checksums
1029                          * are enabled, we do need to make sure that the heap page is
1030                          * dirtied before passing it to visibilitymap_set(), because it
1031                          * may be logged.  Given that this situation should only happen in
1032                          * rare cases after a crash, it is not worth optimizing.
1033                          */
1034                         PageSetAllVisible(page);
1035                         MarkBufferDirty(buf);
1036                         visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
1037                                                           vmbuffer, visibility_cutoff_xid);
1038                 }
1039
1040                 /*
1041                  * As of PostgreSQL 9.2, the visibility map bit should never be set if
1042                  * the page-level bit is clear.  However, it's possible that the bit
1043                  * got cleared after we checked it and before we took the buffer
1044                  * content lock, so we must recheck before jumping to the conclusion
1045                  * that something bad has happened.
1046                  */
1047                 else if (all_visible_according_to_vm && !PageIsAllVisible(page)
1048                                  && visibilitymap_test(onerel, blkno, &vmbuffer))
1049                 {
1050                         elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
1051                                  relname, blkno);
1052                         visibilitymap_clear(onerel, blkno, vmbuffer);
1053                 }
1054
1055                 /*
1056                  * It's possible for the value returned by GetOldestXmin() to move
1057                  * backwards, so it's not wrong for us to see tuples that appear to
1058                  * not be visible to everyone yet, while PD_ALL_VISIBLE is already
1059                  * set. The real safe xmin value never moves backwards, but
1060                  * GetOldestXmin() is conservative and sometimes returns a value
1061                  * that's unnecessarily small, so if we see that contradiction it just
1062                  * means that the tuples that we think are not visible to everyone yet
1063                  * actually are, and the PD_ALL_VISIBLE flag is correct.
1064                  *
1065                  * There should never be dead tuples on a page with PD_ALL_VISIBLE
1066                  * set, however.
1067                  */
1068                 else if (PageIsAllVisible(page) && has_dead_tuples)
1069                 {
1070                         elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u",
1071                                  relname, blkno);
1072                         PageClearAllVisible(page);
1073                         MarkBufferDirty(buf);
1074                         visibilitymap_clear(onerel, blkno, vmbuffer);
1075                 }
1076
1077                 UnlockReleaseBuffer(buf);
1078
1079                 /* Remember the location of the last page with nonremovable tuples */
1080                 if (hastup)
1081                         vacrelstats->nonempty_pages = blkno + 1;
1082
1083                 /*
1084                  * If we remembered any tuples for deletion, then the page will be
1085                  * visited again by lazy_vacuum_heap, which will compute and record
1086                  * its post-compaction free space.  If not, then we're done with this
1087                  * page, so remember its free space as-is.  (This path will always be
1088                  * taken if there are no indexes.)
1089                  */
1090                 if (vacrelstats->num_dead_tuples == prev_dead_count)
1091                         RecordPageWithFreeSpace(onerel, blkno, freespace);
1092         }
1093
1094         pfree(frozen);
1095
1096         /* save stats for use later */
1097         vacrelstats->scanned_tuples = num_tuples;
1098         vacrelstats->tuples_deleted = tups_vacuumed;
1099         vacrelstats->new_dead_tuples = nkeep;
1100
1101         /* now we can compute the new value for pg_class.reltuples */
1102         vacrelstats->new_rel_tuples = vac_estimate_reltuples(onerel, false,
1103                                                                                                                  nblocks,
1104                                                                                                   vacrelstats->scanned_pages,
1105                                                                                                                  num_tuples);
1106
1107         /*
1108          * Release any remaining pin on visibility map page.
1109          */
1110         if (BufferIsValid(vmbuffer))
1111         {
1112                 ReleaseBuffer(vmbuffer);
1113                 vmbuffer = InvalidBuffer;
1114         }
1115
1116         /* If any tuples need to be deleted, perform final vacuum cycle */
1117         /* XXX put a threshold on min number of tuples here? */
1118         if (vacrelstats->num_dead_tuples > 0)
1119         {
1120                 /* Log cleanup info before we touch indexes */
1121                 vacuum_log_cleanup_info(onerel, vacrelstats);
1122
1123                 /* Remove index entries */
1124                 for (i = 0; i < nindexes; i++)
1125                         lazy_vacuum_index(Irel[i],
1126                                                           &indstats[i],
1127                                                           vacrelstats);
1128                 /* Remove tuples from heap */
1129                 lazy_vacuum_heap(onerel, vacrelstats);
1130                 vacrelstats->num_index_scans++;
1131         }
1132
1133         /* Do post-vacuum cleanup and statistics update for each index */
1134         for (i = 0; i < nindexes; i++)
1135                 lazy_cleanup_index(Irel[i], indstats[i], vacrelstats);
1136
1137         /* If no indexes, make log report that lazy_vacuum_heap would've made */
1138         if (vacuumed_pages)
1139                 ereport(elevel,
1140                                 (errmsg("\"%s\": removed %.0f row versions in %u pages",
1141                                                 RelationGetRelationName(onerel),
1142                                                 tups_vacuumed, vacuumed_pages)));
1143
1144         /*
1145          * This is pretty messy, but we split it up so that we can skip emitting
1146          * individual parts of the message when not applicable.
1147          */
1148         initStringInfo(&buf);
1149         appendStringInfo(&buf,
1150                                          _("%.0f dead row versions cannot be removed yet.\n"),
1151                                          nkeep);
1152         appendStringInfo(&buf, _("There were %.0f unused item pointers.\n"),
1153                                          nunused);
1154         appendStringInfo(&buf, ngettext("Skipped %u page due to buffer pins.\n",
1155                                                                         "Skipped %u pages due to buffer pins.\n",
1156                                                                         vacrelstats->pinskipped_pages),
1157                                          vacrelstats->pinskipped_pages);
1158         appendStringInfo(&buf, ngettext("%u page is entirely empty.\n",
1159                                                                         "%u pages are entirely empty.\n",
1160                                                                         empty_pages),
1161                                          empty_pages);
1162         appendStringInfo(&buf, _("%s."),
1163                                          pg_rusage_show(&ru0));
1164
1165         ereport(elevel,
1166                         (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages",
1167                                         RelationGetRelationName(onerel),
1168                                         tups_vacuumed, num_tuples,
1169                                         vacrelstats->scanned_pages, nblocks),
1170                          errdetail_internal("%s", buf.data)));
1171         pfree(buf.data);
1172 }
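
     /*
      * Orientation note (a summary of the code above, not a new routine):
      * when the table has indexes, dead tuple TIDs are accumulated and then
      * freed in batches, each batch running in this order:
      *
      *     vacuum_log_cleanup_info(onerel, vacrelstats);
      *     for (i = 0; i < nindexes; i++)
      *         lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats);
      *     lazy_vacuum_heap(onerel, vacrelstats);
      *     vacrelstats->num_index_scans++;
      *
      * The index passes must run before the heap pass, because heap line
      * pointers must not be marked unused (and potentially reused) while
      * index entries still point at them.  When the table has no indexes,
      * each page is instead vacuumed immediately with lazy_vacuum_page(),
      * as at the top of the scan loop above.
      */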
1173
1174
1175 /*
1176  *      lazy_vacuum_heap() -- second pass over the heap
1177  *
1178  *              This routine marks dead tuples as unused and compacts out free
1179  *              space on their pages.  Pages not having dead tuples recorded from
1180  *              lazy_scan_heap are not visited at all.
1181  *
1182  * Note: the reason for doing this as a second pass is we cannot remove
1183  * the tuples until we've removed their index entries, and we want to
1184  * process index entry removal in batches as large as possible.
1185  */
1186 static void
1187 lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
1188 {
1189         int                     tupindex;
1190         int                     npages;
1191         PGRUsage        ru0;
1192         Buffer          vmbuffer = InvalidBuffer;
1193
1194         pg_rusage_init(&ru0);
1195         npages = 0;
1196
1197         tupindex = 0;
1198         while (tupindex < vacrelstats->num_dead_tuples)
1199         {
1200                 BlockNumber tblk;
1201                 Buffer          buf;
1202                 Page            page;
1203                 Size            freespace;
1204
1205                 vacuum_delay_point();
1206
1207                 tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
1208                 buf = ReadBufferExtended(onerel, MAIN_FORKNUM, tblk, RBM_NORMAL,
1209                                                                  vac_strategy);
1210                 if (!ConditionalLockBufferForCleanup(buf))
1211                 {
1212                         ReleaseBuffer(buf);
1213                         ++tupindex;
1214                         continue;
1215                 }
1216                 tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats,
1217                                                                         &vmbuffer);
1218
1219                 /* Now that we've compacted the page, record its available space */
1220                 page = BufferGetPage(buf);
1221                 freespace = PageGetHeapFreeSpace(page);
1222
1223                 UnlockReleaseBuffer(buf);
1224                 RecordPageWithFreeSpace(onerel, tblk, freespace);
1225                 npages++;
1226         }
1227
1228         if (BufferIsValid(vmbuffer))
1229         {
1230                 ReleaseBuffer(vmbuffer);
1231                 vmbuffer = InvalidBuffer;
1232         }
1233
1234         ereport(elevel,
1235                         (errmsg("\"%s\": removed %d row versions in %d pages",
1236                                         RelationGetRelationName(onerel),
1237                                         tupindex, npages),
1238                          errdetail("%s.",
1239                                            pg_rusage_show(&ru0))));
1240 }
1241
1242 /*
1243  *      lazy_vacuum_page() -- free dead tuples on a page
1244  *                                       and repair its fragmentation.
1245  *
1246  * Caller must hold pin and buffer cleanup lock on the buffer.
1247  *
1248  * tupindex is the index in vacrelstats->dead_tuples of the first dead
1249  * tuple for this page.  We assume the rest follow sequentially.
1250  * The return value is the first tupindex after the tuples of this page.
1251  */
1252 static int
1253 lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
1254                                  int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer)
1255 {
1256         Page            page = BufferGetPage(buffer);
1257         OffsetNumber unused[MaxOffsetNumber];
1258         int                     uncnt = 0;
1259         TransactionId visibility_cutoff_xid;
1260
1261         START_CRIT_SECTION();
1262
1263         for (; tupindex < vacrelstats->num_dead_tuples; tupindex++)
1264         {
1265                 BlockNumber tblk;
1266                 OffsetNumber toff;
1267                 ItemId          itemid;
1268
1269                 tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
1270                 if (tblk != blkno)
1271                         break;                          /* past end of tuples for this block */
1272                 toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]);
1273                 itemid = PageGetItemId(page, toff);
1274                 ItemIdSetUnused(itemid);
1275                 unused[uncnt++] = toff;
1276         }
1277
1278         PageRepairFragmentation(page);
1279
1280         /*
1281          * Mark buffer dirty before we write WAL.
1282          */
1283         MarkBufferDirty(buffer);
1284
1285         /* XLOG stuff */
1286         if (RelationNeedsWAL(onerel))
1287         {
1288                 XLogRecPtr      recptr;
1289
1290                 recptr = log_heap_clean(onerel, buffer,
1291                                                                 NULL, 0, NULL, 0,
1292                                                                 unused, uncnt,
1293                                                                 vacrelstats->latestRemovedXid);
1294                 PageSetLSN(page, recptr);
1295         }
1296
1297         /*
1298          * End critical section, so we can safely do visibility tests (which
1299          * may need to perform I/O and allocate memory!).  If we crash now, the
1300          * page (including the corresponding vm bit) might not be marked all
1301          * visible, but that's fine; a later vacuum will fix that.
1302          */
1303         END_CRIT_SECTION();
1304
1305         /*
1306          * Now that we have removed the dead tuples from the page, once again
1307          * check if the page has become all-visible.  The page is already marked
1308          * dirty, exclusively locked, and, if needed, a full page image has been
1309          * emitted in the log_heap_clean() above.
1310          */
1311         if (heap_page_is_all_visible(onerel, buffer, &visibility_cutoff_xid))
1312                 PageSetAllVisible(page);
1313
1314         /*
1315          * All the changes to the heap page have been done. If the all-visible
1316          * flag is now set, also set the VM bit.
1317          */
1318         if (PageIsAllVisible(page) &&
1319                 !visibilitymap_test(onerel, blkno, vmbuffer))
1320         {
1321                 Assert(BufferIsValid(*vmbuffer));
1322                 visibilitymap_set(onerel, blkno, buffer, InvalidXLogRecPtr, *vmbuffer,
1323                                                   visibility_cutoff_xid);
1324         }
1325
1326         return tupindex;
1327 }
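
     /*
      * Worked example of the tupindex contract (hypothetical values, for
      * exposition only): suppose vacrelstats->dead_tuples holds the sorted
      * TIDs
      *
      *     [0] = (block 7, offset 2)
      *     [1] = (block 7, offset 5)
      *     [2] = (block 9, offset 1)
      *
      * Then lazy_vacuum_page(onerel, 7, buf, 0, vacrelstats, &vmbuffer)
      * marks offsets 2 and 5 of block 7 unused, repairs the page, and
      * returns 2, so the caller's next iteration in lazy_vacuum_heap()
      * starts at the entry for block 9.
      */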
1328
1329 /*
1330  *      lazy_check_needs_freeze() -- scan page to see if any tuples
1331  *                                       need to be cleaned to avoid wraparound
1332  *
1333  * Returns true if the page needs to be vacuumed using cleanup lock.
1334  * Also returns a flag indicating whether page contains any tuples at all.
1335  */
1336 static bool
1337 lazy_check_needs_freeze(Buffer buf, bool *hastup)
1338 {
1339         Page            page = BufferGetPage(buf);
1340         OffsetNumber offnum,
1341                                 maxoff;
1342         HeapTupleHeader tupleheader;
1343
1344         *hastup = false;
1345
1346         /* If we hit an uninitialized page, we want to force vacuuming it. */
1347         if (PageIsNew(page))
1348                 return true;
1349
1350         /* Quick out for ordinary empty page. */
1351         if (PageIsEmpty(page))
1352                 return false;
1353
1354         maxoff = PageGetMaxOffsetNumber(page);
1355         for (offnum = FirstOffsetNumber;
1356                  offnum <= maxoff;
1357                  offnum = OffsetNumberNext(offnum))
1358         {
1359                 ItemId          itemid;
1360
1361                 itemid = PageGetItemId(page, offnum);
1362
1363                 /* this should match hastup test in count_nondeletable_pages() */
1364                 if (ItemIdIsUsed(itemid))
1365                         *hastup = true;
1366
1367                 /* dead and redirect items never need freezing */
1368                 if (!ItemIdIsNormal(itemid))
1369                         continue;
1370
1371                 tupleheader = (HeapTupleHeader) PageGetItem(page, itemid);
1372
1373                 if (heap_tuple_needs_freeze(tupleheader, FreezeLimit,
1374                                                                         MultiXactCutoff, buf))
1375                         return true;
1376         }                                                       /* scan along page */
1377
1378         return false;
1379 }
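
     /*
      * Usage sketch (a loose summary of the caller in lazy_scan_heap,
      * earlier in this file): when a cleanup lock cannot be obtained
      * immediately during a whole-table (anti-wraparound) scan, the caller
      * takes only a share lock and asks this routine whether anything on
      * the page actually needs freezing.  If not, the page can be skipped;
      * only when freezing is required does the scan wait in
      * LockBufferForCleanup().
      */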
1380
1381
1382 /*
1383  *      lazy_vacuum_index() -- vacuum one index relation.
1384  *
1385  *              Delete all the index entries pointing to tuples listed in
1386  *              vacrelstats->dead_tuples, and update running statistics.
1387  */
1388 static void
1389 lazy_vacuum_index(Relation indrel,
1390                                   IndexBulkDeleteResult **stats,
1391                                   LVRelStats *vacrelstats)
1392 {
1393         IndexVacuumInfo ivinfo;
1394         PGRUsage        ru0;
1395
1396         pg_rusage_init(&ru0);
1397
1398         ivinfo.index = indrel;
1399         ivinfo.analyze_only = false;
1400         ivinfo.estimated_count = true;
1401         ivinfo.message_level = elevel;
1402         ivinfo.num_heap_tuples = vacrelstats->old_rel_tuples;
1403         ivinfo.strategy = vac_strategy;
1404
1405         /* Do bulk deletion */
1406         *stats = index_bulk_delete(&ivinfo, *stats,
1407                                                            lazy_tid_reaped, (void *) vacrelstats);
1408
1409         ereport(elevel,
1410                         (errmsg("scanned index \"%s\" to remove %d row versions",
1411                                         RelationGetRelationName(indrel),
1412                                         vacrelstats->num_dead_tuples),
1413                          errdetail("%s.", pg_rusage_show(&ru0))));
1414 }
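
     /*
      * For orientation (describing existing behavior, not adding any): for
      * a btree, index_bulk_delete() scans the whole index once and invokes
      * the lazy_tid_reaped() callback below on each index entry's heap TID,
      * deleting the entries whose TIDs appear in vacrelstats->dead_tuples.
      * One such pass per index therefore removes the index entries for an
      * entire batch of dead tuples.
      */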
1415
1416 /*
1417  *      lazy_cleanup_index() -- do post-vacuum cleanup for one index relation.
1418  */
1419 static void
1420 lazy_cleanup_index(Relation indrel,
1421                                    IndexBulkDeleteResult *stats,
1422                                    LVRelStats *vacrelstats)
1423 {
1424         IndexVacuumInfo ivinfo;
1425         PGRUsage        ru0;
1426
1427         pg_rusage_init(&ru0);
1428
1429         ivinfo.index = indrel;
1430         ivinfo.analyze_only = false;
1431         ivinfo.estimated_count = (vacrelstats->scanned_pages < vacrelstats->rel_pages);
1432         ivinfo.message_level = elevel;
1433         ivinfo.num_heap_tuples = vacrelstats->new_rel_tuples;
1434         ivinfo.strategy = vac_strategy;
1435
1436         stats = index_vacuum_cleanup(&ivinfo, stats);
1437
1438         if (!stats)
1439                 return;
1440
1441         /*
1442          * Now update statistics in pg_class, but only if the index says the count
1443          * is accurate.
1444          */
1445         if (!stats->estimated_count)
1446                 vac_update_relstats(indrel,
1447                                                         stats->num_pages,
1448                                                         stats->num_index_tuples,
1449                                                         0,
1450                                                         false,
1451                                                         InvalidTransactionId,
1452                                                         InvalidMultiXactId,
1453                                                         false);
1454
1455         ereport(elevel,
1456                         (errmsg("index \"%s\" now contains %.0f row versions in %u pages",
1457                                         RelationGetRelationName(indrel),
1458                                         stats->num_index_tuples,
1459                                         stats->num_pages),
1460                          errdetail("%.0f index row versions were removed.\n"
1461                          "%u index pages have been deleted, %u are currently reusable.\n"
1462                                            "%s.",
1463                                            stats->tuples_removed,
1464                                            stats->pages_deleted, stats->pages_free,
1465                                            pg_rusage_show(&ru0))));
1466
1467         pfree(stats);
1468 }
1469
1470 /*
1471  * should_attempt_truncation - should we attempt to truncate the heap?
1472  *
1473  * Don't even think about it unless we have a shot at releasing a goodly
1474  * number of pages.  Otherwise, the time taken isn't worth it.
1475  *
1476  * This is split out so that we can test whether truncation is going to be
1477  * called for before we actually do it.  If you change the logic here, be
1478  * careful to depend only on fields that lazy_scan_heap updates on-the-fly.
1479  */
1480 static bool
1481 should_attempt_truncation(LVRelStats *vacrelstats)
1482 {
1483         BlockNumber possibly_freeable;
1484
1485         possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages;
1486         if (possibly_freeable > 0 &&
1487                 (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
1488                  possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION))
1489                 return true;
1490         else
1491                 return false;
1492 }
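
     /*
      * Worked example with the default thresholds (REL_TRUNCATE_MINIMUM of
      * 1000 pages, REL_TRUNCATE_FRACTION of 16); the table sizes below are
      * hypothetical:
      *
      *     rel_pages = 100000, nonempty_pages = 92000
      *     possibly_freeable = 8000
      *     thresholds: 1000, and 100000 / 16 = 6250
      *
      * Since 8000 >= 1000, truncation is attempted.  With only 500 freeable
      * pages out of 100000, neither threshold is met and the tail of the
      * relation is left alone.
      */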
1493
1494 /*
1495  * lazy_truncate_heap - try to truncate off any empty pages at the end
1496  */
1497 static void
1498 lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats)
1499 {
1500         BlockNumber old_rel_pages = vacrelstats->rel_pages;
1501         BlockNumber new_rel_pages;
1502         PGRUsage        ru0;
1503         int                     lock_retry;
1504
1505         pg_rusage_init(&ru0);
1506
1507         /*
1508          * Loop until no more truncating can be done.
1509          */
1510         do
1511         {
1512                 /*
1513                  * We need full exclusive lock on the relation in order to do
1514                  * truncation. If we can't get it, give up rather than waiting --- we
1515                  * don't want to block other backends, and we don't want to deadlock
1516                  * (which is quite possible considering we already hold a lower-grade
1517                  * lock).
1518                  */
1519                 vacrelstats->lock_waiter_detected = false;
1520                 lock_retry = 0;
1521                 while (true)
1522                 {
1523                         if (ConditionalLockRelation(onerel, AccessExclusiveLock))
1524                                 break;
1525
1526                         /*
1527                          * Check for interrupts while trying to (re-)acquire the exclusive
1528                          * lock.
1529                          */
1530                         CHECK_FOR_INTERRUPTS();
1531
1532                         if (++lock_retry > (VACUUM_TRUNCATE_LOCK_TIMEOUT /
1533                                                                 VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL))
1534                         {
1535                                 /*
1536                                  * We failed to establish the lock in the specified number of
1537                                  * retries. This means we give up truncating.
1538                                  */
1539                                 vacrelstats->lock_waiter_detected = true;
1540                                 ereport(elevel,
1541                                                 (errmsg("\"%s\": stopping truncate due to conflicting lock request",
1542                                                                 RelationGetRelationName(onerel))));
1543                                 return;
1544                         }
1545
1546                         pg_usleep(VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL * 1000L);         /* interval is in ms; pg_usleep() takes microseconds */
1547                 }
1548
1549                 /*
1550                  * Now that we have exclusive lock, look to see if the rel has grown
1551                  * whilst we were vacuuming with non-exclusive lock.  If so, give up;
1552                  * the newly added pages presumably contain non-deletable tuples.
1553                  */
1554                 new_rel_pages = RelationGetNumberOfBlocks(onerel);
1555                 if (new_rel_pages != old_rel_pages)
1556                 {
1557                         /*
1558                          * Note: we intentionally don't update vacrelstats->rel_pages with
1559                          * the new rel size here.  If we did, it would amount to assuming
1560                          * that the new pages are empty, which is unlikely. Leaving the
1561                          * numbers alone amounts to assuming that the new pages have the
1562                          * same tuple density as existing ones, which is less unlikely.
1563                          */
1564                         UnlockRelation(onerel, AccessExclusiveLock);
1565                         return;
1566                 }
1567
1568                 /*
1569                  * Scan backwards from the end to verify that the end pages actually
1570                  * contain no tuples.  This is *necessary*, not optional, because
1571                  * other backends could have added tuples to these pages whilst we
1572                  * were vacuuming.
1573                  */
1574                 new_rel_pages = count_nondeletable_pages(onerel, vacrelstats);
1575
1576                 if (new_rel_pages >= old_rel_pages)
1577                 {
1578                         /* can't do anything after all */
1579                         UnlockRelation(onerel, AccessExclusiveLock);
1580                         return;
1581                 }
1582
1583                 /*
1584                  * Okay to truncate.
1585                  */
1586                 RelationTruncate(onerel, new_rel_pages);
1587
1588                 /*
1589                  * We can release the exclusive lock as soon as we have truncated.
1590                  * Other backends can't safely access the relation until they have
1591                  * processed the smgr invalidation that smgrtruncate sent out ... but
1592                  * that should happen as part of standard invalidation processing once
1593                  * they acquire lock on the relation.
1594                  */
1595                 UnlockRelation(onerel, AccessExclusiveLock);
1596
1597                 /*
1598                  * Update statistics.  Here, it *is* correct to adjust rel_pages
1599                  * without also touching reltuples, since the tuple count wasn't
1600                  * changed by the truncation.
1601                  */
1602                 vacrelstats->pages_removed += old_rel_pages - new_rel_pages;
1603                 vacrelstats->rel_pages = new_rel_pages;
1604
1605                 ereport(elevel,
1606                                 (errmsg("\"%s\": truncated %u to %u pages",
1607                                                 RelationGetRelationName(onerel),
1608                                                 old_rel_pages, new_rel_pages),
1609                                  errdetail("%s.",
1610                                                    pg_rusage_show(&ru0))));
1611                 old_rel_pages = new_rel_pages;
1612         } while (new_rel_pages > vacrelstats->nonempty_pages &&
1613                          vacrelstats->lock_waiter_detected);
1614 }
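
     /*
      * Retry-budget arithmetic (for exposition, using the default settings):
      * the lock-acquisition loop above makes at most
      *
      *     VACUUM_TRUNCATE_LOCK_TIMEOUT / VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL
      *         = 5000 ms / 50 ms = 100
      *
      * attempts, sleeping 50 ms between them, so a vacuum gives up on
      * truncation after roughly five seconds of waiting for the
      * AccessExclusiveLock.
      */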
1615
1616 /*
1617  * Rescan end pages to verify that they are (still) empty of tuples.
1618  *
1619  * Returns number of nondeletable pages (last nonempty page + 1).
1620  */
1621 static BlockNumber
1622 count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats)
1623 {
1624         BlockNumber blkno;
1625         instr_time      starttime;
1626
1627         /* Initialize starttime for use when checking for conflicting lock requests */
1628         INSTR_TIME_SET_CURRENT(starttime);
1629
1630         /* Strange coding of loop control is needed because blkno is unsigned */
1631         blkno = vacrelstats->rel_pages;
1632         while (blkno > vacrelstats->nonempty_pages)
1633         {
1634                 Buffer          buf;
1635                 Page            page;
1636                 OffsetNumber offnum,
1637                                         maxoff;
1638                 bool            hastup;
1639
1640                 /*
1641                  * Check if another process requests a lock on our relation. We are
1642                  * holding an AccessExclusiveLock here, so they will be waiting. We
1643                  * only do this once per VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL, and we
1644                  * only check if that interval has elapsed once every 32 blocks to
1645                  * keep the number of system calls and actual shared lock table
1646                  * lookups to a minimum.
1647                  */
1648                 if ((blkno % 32) == 0)
1649                 {
1650                         instr_time      currenttime;
1651                         instr_time      elapsed;
1652
1653                         INSTR_TIME_SET_CURRENT(currenttime);
1654                         elapsed = currenttime;
1655                         INSTR_TIME_SUBTRACT(elapsed, starttime);
1656                         if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000)
1657                                 >= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
1658                         {
1659                                 if (LockHasWaitersRelation(onerel, AccessExclusiveLock))
1660                                 {
1661                                         ereport(elevel,
1662                                                         (errmsg("\"%s\": suspending truncate due to conflicting lock request",
1663                                                                         RelationGetRelationName(onerel))));
1664
1665                                         vacrelstats->lock_waiter_detected = true;
1666                                         return blkno;
1667                                 }
1668                                 starttime = currenttime;
1669                         }
1670                 }
1671
1672                 /*
1673                  * We don't insert a vacuum delay point here, because we have an
1674                  * exclusive lock on the table which we want to hold for as short a
1675                  * time as possible.  We still need to check for interrupts however.
1676                  */
1677                 CHECK_FOR_INTERRUPTS();
1678
1679                 blkno--;
1680
1681                 buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno,
1682                                                                  RBM_NORMAL, vac_strategy);
1683
1684                 /* In this phase we only need shared access to the buffer */
1685                 LockBuffer(buf, BUFFER_LOCK_SHARE);
1686
1687                 page = BufferGetPage(buf);
1688
1689                 if (PageIsNew(page) || PageIsEmpty(page))
1690                 {
1691                         /* PageIsNew probably shouldn't happen... */
1692                         UnlockReleaseBuffer(buf);
1693                         continue;
1694                 }
1695
1696                 hastup = false;
1697                 maxoff = PageGetMaxOffsetNumber(page);
1698                 for (offnum = FirstOffsetNumber;
1699                          offnum <= maxoff;
1700                          offnum = OffsetNumberNext(offnum))
1701                 {
1702                         ItemId          itemid;
1703
1704                         itemid = PageGetItemId(page, offnum);
1705
1706                         /*
1707                          * Note: any non-unused item should be taken as a reason to keep
1708                          * this page.  We formerly thought that DEAD tuples could be
1709                          * thrown away, but that's not so, because we'd not have cleaned
1710                          * out their index entries.
1711                          */
1712                         if (ItemIdIsUsed(itemid))
1713                         {
1714                                 hastup = true;
1715                                 break;                  /* can stop scanning */
1716                         }
1717                 }                                               /* scan along page */
1718
1719                 UnlockReleaseBuffer(buf);
1720
1721                 /* Done scanning if we found a tuple here */
1722                 if (hastup)
1723                         return blkno + 1;
1724         }
1725
1726         /*
1727          * If we fall out of the loop, all the previously-thought-to-be-empty
1728          * pages still are; we need not bother to look at the last known-nonempty
1729          * page.
1730          */
1731         return vacrelstats->nonempty_pages;
1732 }
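
     /*
      * Cost note (for exposition): the (blkno % 32) == 0 test above means
      * the clock is read at most once per 32 pages scanned backwards (once
      * per 256 kB, assuming the default 8 kB block size), and the shared
      * lock table is consulted only after at least
      * VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL (20 ms by default) has elapsed
      * since the last check.  This keeps the cost of being polite about the
      * AccessExclusiveLock negligible.
      */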
1733
1734 /*
1735  * lazy_space_alloc - space allocation decisions for lazy vacuum
1736  *
1737  * See the comments at the head of this file for rationale.
1738  */
1739 static void
1740 lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks)
1741 {
1742         long            maxtuples;
1743         int                     vac_work_mem = IsAutoVacuumWorkerProcess() &&
1744         autovacuum_work_mem != -1 ?
1745         autovacuum_work_mem : maintenance_work_mem;
1746
1747         if (vacrelstats->hasindex)
1748         {
1749                 maxtuples = (vac_work_mem * 1024L) / sizeof(ItemPointerData);
1750                 maxtuples = Min(maxtuples, INT_MAX);
1751                 maxtuples = Min(maxtuples, MaxAllocSize / sizeof(ItemPointerData));
1752
1753                 /* curious coding here to ensure the multiplication can't overflow */
1754                 if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks)
1755                         maxtuples = relblocks * LAZY_ALLOC_TUPLES;
1756
1757                 /* stay sane if small maintenance_work_mem */
1758                 maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);
1759         }
1760         else
1761         {
1762                 maxtuples = MaxHeapTuplesPerPage;
1763         }
1764
1765         vacrelstats->num_dead_tuples = 0;
1766         vacrelstats->max_dead_tuples = (int) maxtuples;
1767         vacrelstats->dead_tuples = (ItemPointer)
1768                 palloc(maxtuples * sizeof(ItemPointerData));
1769 }
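
     /*
      * Sizing example (hypothetical numbers, assuming 6-byte ItemPointerData
      * entries and 8 kB pages): with maintenance_work_mem = 64MB and at
      * least one index, the array can hold about
      *
      *     64 * 1024 * 1024 / 6  ~  11.18 million TIDs
      *
      * but for a 1000-block table it is clamped to
      *
      *     1000 * LAZY_ALLOC_TUPLES  =  1000 * MaxHeapTuplesPerPage (291),
      *
      * i.e. roughly 1.7 MB, so small tables never pay for the full
      * allocation.  Without indexes only MaxHeapTuplesPerPage entries are
      * kept, since each page is vacuumed as soon as it has been scanned.
      */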
1770
1771 /*
1772  * lazy_record_dead_tuple - remember one deletable tuple
1773  */
1774 static void
1775 lazy_record_dead_tuple(LVRelStats *vacrelstats,
1776                                            ItemPointer itemptr)
1777 {
1778         /*
1779          * The array shouldn't overflow under normal behavior, but perhaps it
1780          * could if we are given a really small maintenance_work_mem. In that
1781          * case, just forget the last few tuples (we'll get 'em next time).
1782          */
1783         if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples)
1784         {
1785                 vacrelstats->dead_tuples[vacrelstats->num_dead_tuples] = *itemptr;
1786                 vacrelstats->num_dead_tuples++;
1787         }
1788 }
1789
1790 /*
1791  *      lazy_tid_reaped() -- is a particular tid deletable?
1792  *
1793  *              This has the right signature to be an IndexBulkDeleteCallback.
1794  *
1795  *              Assumes dead_tuples array is in sorted order.
1796  */
1797 static bool
1798 lazy_tid_reaped(ItemPointer itemptr, void *state)
1799 {
1800         LVRelStats *vacrelstats = (LVRelStats *) state;
1801         ItemPointer res;
1802
1803         res = (ItemPointer) bsearch((void *) itemptr,
1804                                                                 (void *) vacrelstats->dead_tuples,
1805                                                                 vacrelstats->num_dead_tuples,
1806                                                                 sizeof(ItemPointerData),
1807                                                                 vac_cmp_itemptr);
1808
1809         return (res != NULL);
1810 }
1811
1812 /*
1813  * Comparator routines for use with qsort() and bsearch().
1814  */
1815 static int
1816 vac_cmp_itemptr(const void *left, const void *right)
1817 {
1818         BlockNumber lblk,
1819                                 rblk;
1820         OffsetNumber loff,
1821                                 roff;
1822
1823         lblk = ItemPointerGetBlockNumber((ItemPointer) left);
1824         rblk = ItemPointerGetBlockNumber((ItemPointer) right);
1825
1826         if (lblk < rblk)
1827                 return -1;
1828         if (lblk > rblk)
1829                 return 1;
1830
1831         loff = ItemPointerGetOffsetNumber((ItemPointer) left);
1832         roff = ItemPointerGetOffsetNumber((ItemPointer) right);
1833
1834         if (loff < roff)
1835                 return -1;
1836         if (loff > roff)
1837                 return 1;
1838
1839         return 0;
1840 }
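
     /*
      * Ordering example (for exposition): vac_cmp_itemptr() orders TIDs
      * first by block number, then by offset, e.g.
      *
      *     (block 5, off 3)  <  (block 5, off 10)  <  (block 6, off 1)
      *
      * lazy_scan_heap() records dead tuples in physical scan order, so the
      * dead_tuples array is already sorted this way and lazy_tid_reaped()
      * can bsearch() it without an explicit qsort().
      */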
1841
1842 /*
1843  * Check if every tuple in the given page is visible to all current and future
1844  * transactions. Also return the visibility_cutoff_xid which is the highest
1845  * xmin amongst the visible tuples.
1846  */
1847 static bool
1848 heap_page_is_all_visible(Relation rel, Buffer buf, TransactionId *visibility_cutoff_xid)
1849 {
1850         Page            page = BufferGetPage(buf);
1851         BlockNumber blockno = BufferGetBlockNumber(buf);
1852         OffsetNumber offnum,
1853                                 maxoff;
1854         bool            all_visible = true;
1855
1856         *visibility_cutoff_xid = InvalidTransactionId;
1857
1858         /*
1859          * This is a stripped down version of the line pointer scan in
1860          * lazy_scan_heap(). So if you change anything here, also check that code.
1861          */
1862         maxoff = PageGetMaxOffsetNumber(page);
1863         for (offnum = FirstOffsetNumber;
1864                  offnum <= maxoff && all_visible;
1865                  offnum = OffsetNumberNext(offnum))
1866         {
1867                 ItemId          itemid;
1868                 HeapTupleData tuple;
1869
1870                 itemid = PageGetItemId(page, offnum);
1871
1872                 /* Unused or redirect line pointers are of no interest */
1873                 if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
1874                         continue;
1875
1876                 ItemPointerSet(&(tuple.t_self), blockno, offnum);
1877
1878                 /*
1879                  * Dead line pointers can have index pointers pointing to them. So
1880                  * they can't be treated as visible.
1881                  */
1882                 if (ItemIdIsDead(itemid))
1883                 {
1884                         all_visible = false;
1885                         break;
1886                 }
1887
1888                 Assert(ItemIdIsNormal(itemid));
1889
1890                 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
1891                 tuple.t_len = ItemIdGetLength(itemid);
1892                 tuple.t_tableOid = RelationGetRelid(rel);
1893
1894                 switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf))
1895                 {
1896                         case HEAPTUPLE_LIVE:
1897                                 {
1898                                         TransactionId xmin;
1899
1900                                         /* Check comments in lazy_scan_heap. */
1901                                         if (!HeapTupleHeaderXminCommitted(tuple.t_data))
1902                                         {
1903                                                 all_visible = false;
1904                                                 break;
1905                                         }
1906
1907                                         /*
1908                                          * The inserter definitely committed. But is it old enough
1909                                          * that everyone sees it as committed?
1910                                          */
1911                                         xmin = HeapTupleHeaderGetXmin(tuple.t_data);
1912                                         if (!TransactionIdPrecedes(xmin, OldestXmin))
1913                                         {
1914                                                 all_visible = false;
1915                                                 break;
1916                                         }
1917
1918                                         /* Track newest xmin on page. */
1919                                         if (TransactionIdFollows(xmin, *visibility_cutoff_xid))
1920                                                 *visibility_cutoff_xid = xmin;
1921                                 }
1922                                 break;
1923
1924                         case HEAPTUPLE_DEAD:
1925                         case HEAPTUPLE_RECENTLY_DEAD:
1926                         case HEAPTUPLE_INSERT_IN_PROGRESS:
1927                         case HEAPTUPLE_DELETE_IN_PROGRESS:
1928                                 all_visible = false;
1929                                 break;
1930
1931                         default:
1932                                 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1933                                 break;
1934                 }
1935         }                                                       /* scan along page */
1936
1937         return all_visible;
1938 }
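
     /*
      * Example of the cutoff computation (hypothetical xids): if the page
      * holds three live tuples whose committed xmins are 100, 250 and 180,
      * and all three precede OldestXmin, the function reports all-visible
      * with *visibility_cutoff_xid = 250, the newest xmin on the page.
      * That is the value lazy_vacuum_page() passes to visibilitymap_set()
      * above.
      */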