src/backend/commands/vacuumlazy.c
(blob as of commit "Fix VACUUM's reporting of dead-tuple counts to the stats collector.")
1 /*-------------------------------------------------------------------------
2  *
3  * vacuumlazy.c
4  *        Concurrent ("lazy") vacuuming.
5  *
6  *
7  * The major space usage for LAZY VACUUM is storage for the array of dead
8  * tuple TIDs, with the next biggest need being storage for per-disk-page
9  * free space info.  We want to ensure we can vacuum even the very largest
10  * relations with finite memory space usage.  To do that, we set upper bounds
11  * on the number of tuples and pages we will keep track of at once.
12  *
13  * We are willing to use at most maintenance_work_mem (or perhaps
14  * autovacuum_work_mem) memory space to keep track of dead tuples.  We
15  * initially allocate an array of TIDs of that size, with an upper limit that
16  * depends on table size (this limit ensures we don't allocate a huge area
17  * uselessly for vacuuming small tables).  If the array threatens to overflow,
18  * we suspend the heap scan phase and perform a pass of index cleanup and page
19  * compaction, then resume the heap scan with an empty TID array.
20  *
21  * If we're processing a table with no indexes, we can just vacuum each page
22  * as we go; there's no need to save up multiple tuples to minimize the number
23  * of index scans performed.  So we don't use maintenance_work_mem memory for
24  * the TID array, just enough to hold as many heap tuples as fit on one page.
25  *
26  *
27  * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
28  * Portions Copyright (c) 1994, Regents of the University of California
29  *
30  *
31  * IDENTIFICATION
32  *        src/backend/commands/vacuumlazy.c
33  *
34  *-------------------------------------------------------------------------
35  */
36 #include "postgres.h"
37
38 #include <math.h>
39
40 #include "access/genam.h"
41 #include "access/heapam.h"
42 #include "access/heapam_xlog.h"
43 #include "access/htup_details.h"
44 #include "access/multixact.h"
45 #include "access/transam.h"
46 #include "access/visibilitymap.h"
47 #include "catalog/storage.h"
48 #include "commands/dbcommands.h"
49 #include "commands/vacuum.h"
50 #include "miscadmin.h"
51 #include "pgstat.h"
52 #include "portability/instr_time.h"
53 #include "postmaster/autovacuum.h"
54 #include "storage/bufmgr.h"
55 #include "storage/freespace.h"
56 #include "storage/lmgr.h"
57 #include "utils/lsyscache.h"
58 #include "utils/memutils.h"
59 #include "utils/pg_rusage.h"
60 #include "utils/timestamp.h"
61 #include "utils/tqual.h"
62
63
64 /*
65  * Space/time tradeoff parameters: do these need to be user-tunable?
66  *
67  * To consider truncating the relation, we want there to be at least
68  * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
69  * is less) potentially-freeable pages.
70  */
71 #define REL_TRUNCATE_MINIMUM    1000
72 #define REL_TRUNCATE_FRACTION   16
73
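As a rough illustration of how these two constants combine, here is a standalone sketch (not part of vacuumlazy.c itself) of the test that lazy_vacuum_rel() makes before attempting truncation; the SKETCH_* names and the function are invented for the example.

#include <stdbool.h>
#include <stdint.h>

/* Values copied from the REL_TRUNCATE_* constants above. */
#define SKETCH_TRUNCATE_MINIMUM  1000
#define SKETCH_TRUNCATE_FRACTION 16

/*
 * Illustrative only: truncation is worth trying when the freeable tail of
 * the relation is at least SKETCH_TRUNCATE_MINIMUM pages, or at least
 * 1/16 of the relation, whichever is less.
 */
bool
sketch_worth_trying_truncate(uint32_t rel_pages, uint32_t nonempty_pages)
{
    uint32_t possibly_freeable = rel_pages - nonempty_pages;

    return possibly_freeable > 0 &&
        (possibly_freeable >= SKETCH_TRUNCATE_MINIMUM ||
         possibly_freeable >= rel_pages / SKETCH_TRUNCATE_FRACTION);
}
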
74 /*
75  * Timing parameters for truncate locking heuristics.
76  *
77  * These were not exposed as user tunable GUC values because it didn't seem
78  * that the potential for improvement was great enough to merit the cost of
79  * supporting them.
80  */
81 #define VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL             20              /* ms */
82 #define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL              50              /* ms */
83 #define VACUUM_TRUNCATE_LOCK_TIMEOUT                    5000    /* ms */
84
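A minimal standalone sketch of how the wait interval and timeout appear to be used: rather than stalling other backends, truncation retries a conditional AccessExclusiveLock and gives up once the timeout is exceeded. The helper names below are hypothetical stand-ins, not real server APIs; the CHECK_INTERVAL constant governs how often the truncation scan (in code beyond this excerpt) checks for lock waiters and is not shown here.

#include <stdbool.h>

/* Values copied from the VACUUM_TRUNCATE_LOCK_* constants above (ms). */
#define SKETCH_LOCK_WAIT_INTERVAL  50
#define SKETCH_LOCK_TIMEOUT        5000

/* Hypothetical stand-ins for the lock-manager and sleep primitives. */
static bool sketch_try_access_exclusive_lock(void) { return false; }
static void sketch_sleep_ms(int ms) { (void) ms; }

/*
 * Illustrative only: retry the exclusive lock every WAIT_INTERVAL ms and
 * give up entirely once TIMEOUT has elapsed.
 */
bool
sketch_acquire_truncate_lock(void)
{
    int waited_ms = 0;

    while (!sketch_try_access_exclusive_lock())
    {
        if (waited_ms >= SKETCH_LOCK_TIMEOUT)
            return false;       /* caller skips truncation this time */
        sketch_sleep_ms(SKETCH_LOCK_WAIT_INTERVAL);
        waited_ms += SKETCH_LOCK_WAIT_INTERVAL;
    }
    return true;
}
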
85 /*
86  * Guesstimation of number of dead tuples per page.  This is used to
87  * provide an upper limit to memory allocated when vacuuming small
88  * tables.
89  */
90 #define LAZY_ALLOC_TUPLES               MaxHeapTuplesPerPage
91
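A standalone sketch, consistent with the sizing rules described in the file header, of how many dead-tuple TIDs might be remembered at once. Here work_mem_kb stands in for maintenance_work_mem or autovacuum_work_mem (in kB); the per-page and TID-size constants are assumed defaults (MaxHeapTuplesPerPage is 291 with 8 kB blocks, an ItemPointerData is 6 bytes), and the real lazy_space_alloc() additionally clamps the array to the allocation limit.

#include <stdint.h>

#define SKETCH_TUPLES_PER_PAGE  291     /* assumed MaxHeapTuplesPerPage */
#define SKETCH_TID_SIZE         6       /* assumed sizeof(ItemPointerData) */

/*
 * Illustrative only: indexed tables get work_mem_kb worth of TID slots,
 * but never more than the table could possibly contain; index-less tables
 * are vacuumed page-at-a-time, so one page's worth of slots is enough.
 */
long
sketch_max_dead_tuples(long work_mem_kb, uint32_t rel_blocks, int has_indexes)
{
    long maxtuples;

    if (!has_indexes)
        return SKETCH_TUPLES_PER_PAGE;

    maxtuples = (work_mem_kb * 1024L) / SKETCH_TID_SIZE;

    /* don't allocate more slots than the table could possibly need */
    if (maxtuples / SKETCH_TUPLES_PER_PAGE > (long) rel_blocks)
        maxtuples = (long) rel_blocks * SKETCH_TUPLES_PER_PAGE;

    /* but always leave room for at least one page's worth of tuples */
    if (maxtuples < SKETCH_TUPLES_PER_PAGE)
        maxtuples = SKETCH_TUPLES_PER_PAGE;

    return maxtuples;
}
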
92 /*
 93  * Before we consider skipping a page that's marked as clean in the
 94  * visibility map, we must've seen at least this many clean pages.
95  */
96 #define SKIP_PAGES_THRESHOLD    ((BlockNumber) 32)
97
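A standalone sketch of the skip decision this threshold feeds, mirroring the hysteresis described before the main loop in lazy_scan_heap(); the visibility map is modelled here as a simple flag array, and the function name is invented for illustration.

#include <stdbool.h>
#include <stdint.h>

/* Value copied from SKIP_PAGES_THRESHOLD above. */
#define SKETCH_SKIP_THRESHOLD 32

/*
 * Illustrative only: find the next not-all-visible block at or after blkno
 * and report whether the all-visible run in between is long enough to be
 * worth skipping.  Short runs are read anyway, both to keep OS readahead
 * working and because skipping even one page forfeits the relfrozenxid
 * update.
 */
uint32_t
sketch_next_not_all_visible(const bool *all_visible, uint32_t nblocks,
                            uint32_t blkno, bool *worth_skipping)
{
    uint32_t next = blkno;

    while (next < nblocks && all_visible[next])
        next++;

    *worth_skipping = (next - blkno) >= SKETCH_SKIP_THRESHOLD;
    return next;
}
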
98 typedef struct LVRelStats
99 {
100         /* hasindex = true means two-pass strategy; false means one-pass */
101         bool            hasindex;
102         /* Overall statistics about rel */
103         BlockNumber old_rel_pages;      /* previous value of pg_class.relpages */
104         BlockNumber rel_pages;          /* total number of pages */
105         BlockNumber scanned_pages;      /* number of pages we examined */
106         double          scanned_tuples; /* counts only tuples on scanned pages */
107         double          old_rel_tuples; /* previous value of pg_class.reltuples */
108         double          new_rel_tuples; /* new estimated total # of tuples */
109         double          new_dead_tuples;        /* new estimated total # of dead tuples */
110         BlockNumber pages_removed;
111         double          tuples_deleted;
112         BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
113         /* List of TIDs of tuples we intend to delete */
114         /* NB: this list is ordered by TID address */
115         int                     num_dead_tuples;        /* current # of entries */
116         int                     max_dead_tuples;        /* # slots allocated in array */
117         ItemPointer dead_tuples;        /* array of ItemPointerData */
118         int                     num_index_scans;
119         TransactionId latestRemovedXid;
120         bool            lock_waiter_detected;
121 } LVRelStats;
122
123
124 /* A few variables that don't seem worth passing around as parameters */
125 static int      elevel = -1;
126
127 static TransactionId OldestXmin;
128 static TransactionId FreezeLimit;
129 static MultiXactId MultiXactCutoff;
130
131 static BufferAccessStrategy vac_strategy;
132
133
134 /* non-export function prototypes */
135 static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
136                            Relation *Irel, int nindexes, bool scan_all);
137 static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats);
138 static bool lazy_check_needs_freeze(Buffer buf);
139 static void lazy_vacuum_index(Relation indrel,
140                                   IndexBulkDeleteResult **stats,
141                                   LVRelStats *vacrelstats);
142 static void lazy_cleanup_index(Relation indrel,
143                                    IndexBulkDeleteResult *stats,
144                                    LVRelStats *vacrelstats);
145 static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
146                                  int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer);
147 static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats);
148 static BlockNumber count_nondeletable_pages(Relation onerel,
149                                                  LVRelStats *vacrelstats);
150 static void lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks);
151 static void lazy_record_dead_tuple(LVRelStats *vacrelstats,
152                                            ItemPointer itemptr);
153 static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
154 static int      vac_cmp_itemptr(const void *left, const void *right);
155 static bool heap_page_is_all_visible(Relation rel, Buffer buf,
156                                                  TransactionId *visibility_cutoff_xid);
157
158
159 /*
160  *      lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation
161  *
162  *              This routine vacuums a single heap, cleans out its indexes, and
163  *              updates its relpages and reltuples statistics.
164  *
165  *              At entry, we have already established a transaction and opened
166  *              and locked the relation.
167  */
168 void
169 lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
170                                 BufferAccessStrategy bstrategy)
171 {
172         LVRelStats *vacrelstats;
173         Relation   *Irel;
174         int                     nindexes;
175         BlockNumber possibly_freeable;
176         PGRUsage        ru0;
177         TimestampTz starttime = 0;
178         long            secs;
179         int                     usecs;
180         double          read_rate,
181                                 write_rate;
182         bool            scan_all;               /* should we scan all pages? */
183         bool            scanned_all;    /* did we actually scan all pages? */
184         TransactionId xidFullScanLimit;
185         MultiXactId mxactFullScanLimit;
186         BlockNumber new_rel_pages;
187         double          new_rel_tuples;
188         BlockNumber new_rel_allvisible;
189         double          new_live_tuples;
190         TransactionId new_frozen_xid;
191         MultiXactId new_min_multi;
192
193         /* measure elapsed time iff autovacuum logging requires it */
194         if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0)
195         {
196                 pg_rusage_init(&ru0);
197                 starttime = GetCurrentTimestamp();
198         }
199
200         if (vacstmt->options & VACOPT_VERBOSE)
201                 elevel = INFO;
202         else
203                 elevel = DEBUG2;
204
205         vac_strategy = bstrategy;
206
207         vacuum_set_xid_limits(vacstmt->freeze_min_age, vacstmt->freeze_table_age,
208                                                   onerel->rd_rel->relisshared,
209                                                   &OldestXmin, &FreezeLimit, &xidFullScanLimit,
210                                                   &MultiXactCutoff, &mxactFullScanLimit);
211
212         /*
 213          * We request a full scan if the table's frozen Xid is now older than
 214          * or equal to the requested Xid full-table scan limit, or if the table's
 215          * minimum MultiXactId is older than or equal to the requested mxid
 216          * full-table scan limit.
217          */
218         scan_all = TransactionIdPrecedesOrEquals(onerel->rd_rel->relfrozenxid,
219                                                                                          xidFullScanLimit);
220         scan_all |= MultiXactIdPrecedesOrEquals(onerel->rd_rel->relminmxid,
221                                                                                         mxactFullScanLimit);
222
223         vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));
224
225         vacrelstats->old_rel_pages = onerel->rd_rel->relpages;
226         vacrelstats->old_rel_tuples = onerel->rd_rel->reltuples;
227         vacrelstats->num_index_scans = 0;
228         vacrelstats->pages_removed = 0;
229         vacrelstats->lock_waiter_detected = false;
230
231         /* Open all indexes of the relation */
232         vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel);
233         vacrelstats->hasindex = (nindexes > 0);
234
235         /* Do the vacuuming */
236         lazy_scan_heap(onerel, vacrelstats, Irel, nindexes, scan_all);
237
238         /* Done with indexes */
239         vac_close_indexes(nindexes, Irel, NoLock);
240
241         /*
242          * Compute whether we actually scanned the whole relation. If we did, we
243          * can adjust relfrozenxid and relminmxid.
244          *
245          * NB: We need to check this before truncating the relation, because that
246          * will change ->rel_pages.
247          */
248         if (vacrelstats->scanned_pages < vacrelstats->rel_pages)
249         {
250                 Assert(!scan_all);
251                 scanned_all = false;
252         }
253         else
254                 scanned_all = true;
255
256         /*
257          * Optionally truncate the relation.
258          *
259          * Don't even think about it unless we have a shot at releasing a goodly
260          * number of pages.  Otherwise, the time taken isn't worth it.
261          */
262         possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages;
263         if (possibly_freeable > 0 &&
264                 (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
265                  possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION))
266                 lazy_truncate_heap(onerel, vacrelstats);
267
268         /* Vacuum the Free Space Map */
269         FreeSpaceMapVacuum(onerel);
270
271         /*
272          * Update statistics in pg_class.
273          *
274          * A corner case here is that if we scanned no pages at all because every
275          * page is all-visible, we should not update relpages/reltuples, because
276          * we have no new information to contribute.  In particular this keeps us
277          * from replacing relpages=reltuples=0 (which means "unknown tuple
278          * density") with nonzero relpages and reltuples=0 (which means "zero
279          * tuple density") unless there's some actual evidence for the latter.
280          *
281          * We do update relallvisible even in the corner case, since if the table
282          * is all-visible we'd definitely like to know that.  But clamp the value
283          * to be not more than what we're setting relpages to.
284          *
285          * Also, don't change relfrozenxid/relminmxid if we skipped any pages,
286          * since then we don't know for certain that all tuples have a newer xmin.
287          */
288         new_rel_pages = vacrelstats->rel_pages;
289         new_rel_tuples = vacrelstats->new_rel_tuples;
290         if (vacrelstats->scanned_pages == 0 && new_rel_pages > 0)
291         {
292                 new_rel_pages = vacrelstats->old_rel_pages;
293                 new_rel_tuples = vacrelstats->old_rel_tuples;
294         }
295
296         new_rel_allvisible = visibilitymap_count(onerel);
297         if (new_rel_allvisible > new_rel_pages)
298                 new_rel_allvisible = new_rel_pages;
299
300         new_frozen_xid = scanned_all ? FreezeLimit : InvalidTransactionId;
301         new_min_multi = scanned_all ? MultiXactCutoff : InvalidMultiXactId;
302
303         vac_update_relstats(onerel,
304                                                 new_rel_pages,
305                                                 new_rel_tuples,
306                                                 new_rel_allvisible,
307                                                 vacrelstats->hasindex,
308                                                 new_frozen_xid,
309                                                 new_min_multi);
310
311         /* report results to the stats collector, too */
312         new_live_tuples = new_rel_tuples - vacrelstats->new_dead_tuples;
313         if (new_live_tuples < 0)
314                 new_live_tuples = 0;    /* just in case */
315
316         pgstat_report_vacuum(RelationGetRelid(onerel),
317                                                  onerel->rd_rel->relisshared,
318                                                  new_live_tuples,
319                                                  vacrelstats->new_dead_tuples);
320
321         /* and log the action if appropriate */
322         if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0)
323         {
324                 TimestampTz endtime = GetCurrentTimestamp();
325
326                 if (Log_autovacuum_min_duration == 0 ||
327                         TimestampDifferenceExceeds(starttime, endtime,
328                                                                            Log_autovacuum_min_duration))
329                 {
330                         TimestampDifference(starttime, endtime, &secs, &usecs);
331
332                         read_rate = 0;
333                         write_rate = 0;
334                         if ((secs > 0) || (usecs > 0))
335                         {
 336                                 read_rate = (double) BLCKSZ * VacuumPageMiss / (1024 * 1024) /
337                                                         (secs + usecs / 1000000.0);
 338                                 write_rate = (double) BLCKSZ * VacuumPageDirty / (1024 * 1024) /
339                                                         (secs + usecs / 1000000.0);
340                         }
341                         ereport(LOG,
342                                         (errmsg("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n"
343                                                         "pages: %d removed, %d remain\n"
344                                                         "tuples: %.0f removed, %.0f remain, %.0f are dead but not yet removable\n"
345                                                         "buffer usage: %d hits, %d misses, %d dirtied\n"
346                                           "avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"
347                                                         "system usage: %s",
348                                                         get_database_name(MyDatabaseId),
349                                                         get_namespace_name(RelationGetNamespace(onerel)),
350                                                         RelationGetRelationName(onerel),
351                                                         vacrelstats->num_index_scans,
352                                                         vacrelstats->pages_removed,
353                                                         vacrelstats->rel_pages,
354                                                         vacrelstats->tuples_deleted,
355                                                         vacrelstats->new_rel_tuples,
356                                                         vacrelstats->new_dead_tuples,
357                                                         VacuumPageHit,
358                                                         VacuumPageMiss,
359                                                         VacuumPageDirty,
360                                                         read_rate, write_rate,
361                                                         pg_rusage_show(&ru0))));
362                 }
363         }
364 }
365
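Tying this back to the commit subject, a standalone sketch of the arithmetic behind the figures passed to pgstat_report_vacuum() above: the dead count covers only dead-but-not-yet-removable tuples, and the live estimate is clamped at zero in case the estimates disagree. The function name is invented for illustration.

/*
 * Illustrative only: the live-tuple figure reported to the stats collector
 * is the estimated total minus the not-yet-removable dead tuples.
 */
double
sketch_live_tuples_to_report(double new_rel_tuples, double new_dead_tuples)
{
    double new_live_tuples = new_rel_tuples - new_dead_tuples;

    return (new_live_tuples < 0) ? 0 : new_live_tuples;
}
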
366 /*
367  * For Hot Standby we need to know the highest transaction id that will
368  * be removed by any change. VACUUM proceeds in a number of passes so
369  * we need to consider how each pass operates. The first phase runs
370  * heap_page_prune(), which can issue XLOG_HEAP2_CLEAN records as it
371  * progresses - these will have a latestRemovedXid on each record.
372  * In some cases this removes all of the tuples to be removed, though
373  * often we have dead tuples with index pointers so we must remember them
374  * for removal in phase 3. Index records for those rows are removed
375  * in phase 2 and index blocks do not have MVCC information attached.
376  * So before we can allow removal of any index tuples we need to issue
377  * a WAL record containing the latestRemovedXid of rows that will be
 378  * removed in phase 3. This allows recovery queries to block at the
 379  * correct place, i.e. before phase 2, rather than during phase 3,
 380  * which would be after the rows have become inaccessible.
381  */
382 static void
383 vacuum_log_cleanup_info(Relation rel, LVRelStats *vacrelstats)
384 {
385         /*
386          * Skip this for relations for which no WAL is to be written, or if we're
387          * not trying to support archive recovery.
388          */
389         if (!RelationNeedsWAL(rel) || !XLogIsNeeded())
390                 return;
391
392         /*
393          * No need to write the record at all unless it contains a valid value
394          */
395         if (TransactionIdIsValid(vacrelstats->latestRemovedXid))
396                 (void) log_heap_cleanup_info(rel->rd_node, vacrelstats->latestRemovedXid);
397 }
398
399 /*
400  *      lazy_scan_heap() -- scan an open heap relation
401  *
402  *              This routine prunes each page in the heap, which will among other
403  *              things truncate dead tuples to dead line pointers, defragment the
404  *              page, and set commit status bits (see heap_page_prune).  It also builds
405  *              lists of dead tuples and pages with free space, calculates statistics
406  *              on the number of live tuples in the heap, and marks pages as
407  *              all-visible if appropriate.  When done, or when we run low on space for
408  *              dead-tuple TIDs, invoke vacuuming of indexes and call lazy_vacuum_heap
409  *              to reclaim dead line pointers.
410  *
411  *              If there are no indexes then we can reclaim line pointers on the fly;
412  *              dead line pointers need only be retained until all index pointers that
413  *              reference them have been killed.
414  */
415 static void
416 lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
417                            Relation *Irel, int nindexes, bool scan_all)
418 {
419         BlockNumber nblocks,
420                                 blkno;
421         HeapTupleData tuple;
422         char       *relname;
423         BlockNumber empty_pages,
424                                 vacuumed_pages;
425         double          num_tuples,
426                                 tups_vacuumed,
427                                 nkeep,
428                                 nunused;
429         IndexBulkDeleteResult **indstats;
430         int                     i;
431         PGRUsage        ru0;
432         Buffer          vmbuffer = InvalidBuffer;
433         BlockNumber next_not_all_visible_block;
434         bool            skipping_all_visible_blocks;
435         xl_heap_freeze_tuple *frozen;
436
437         pg_rusage_init(&ru0);
438
439         relname = RelationGetRelationName(onerel);
440         ereport(elevel,
441                         (errmsg("vacuuming \"%s.%s\"",
442                                         get_namespace_name(RelationGetNamespace(onerel)),
443                                         relname)));
444
445         empty_pages = vacuumed_pages = 0;
446         num_tuples = tups_vacuumed = nkeep = nunused = 0;
447
448         indstats = (IndexBulkDeleteResult **)
449                 palloc0(nindexes * sizeof(IndexBulkDeleteResult *));
450
451         nblocks = RelationGetNumberOfBlocks(onerel);
452         vacrelstats->rel_pages = nblocks;
453         vacrelstats->scanned_pages = 0;
454         vacrelstats->nonempty_pages = 0;
455         vacrelstats->latestRemovedXid = InvalidTransactionId;
456
457         lazy_space_alloc(vacrelstats, nblocks);
458         frozen = palloc(sizeof(xl_heap_freeze_tuple) * MaxHeapTuplesPerPage);
459
460         /*
461          * We want to skip pages that don't require vacuuming according to the
462          * visibility map, but only when we can skip at least SKIP_PAGES_THRESHOLD
463          * consecutive pages.  Since we're reading sequentially, the OS should be
464          * doing readahead for us, so there's no gain in skipping a page now and
465          * then; that's likely to disable readahead and so be counterproductive.
466          * Also, skipping even a single page means that we can't update
467          * relfrozenxid, so we only want to do it if we can skip a goodly number
468          * of pages.
469          *
470          * Before entering the main loop, establish the invariant that
471          * next_not_all_visible_block is the next block number >= blkno that's not
472          * all-visible according to the visibility map, or nblocks if there's no
473          * such block.  Also, we set up the skipping_all_visible_blocks flag,
474          * which is needed because we need hysteresis in the decision: once we've
475          * started skipping blocks, we may as well skip everything up to the next
476          * not-all-visible block.
477          *
478          * Note: if scan_all is true, we won't actually skip any pages; but we
479          * maintain next_not_all_visible_block anyway, so as to set up the
480          * all_visible_according_to_vm flag correctly for each page.
481          *
482          * Note: The value returned by visibilitymap_test could be slightly
483          * out-of-date, since we make this test before reading the corresponding
484          * heap page or locking the buffer.  This is OK.  If we mistakenly think
485          * that the page is all-visible when in fact the flag's just been cleared,
486          * we might fail to vacuum the page.  But it's OK to skip pages when
487          * scan_all is not set, so no great harm done; the next vacuum will find
488          * them.  If we make the reverse mistake and vacuum a page unnecessarily,
489          * it'll just be a no-op.
490          */
491         for (next_not_all_visible_block = 0;
492                  next_not_all_visible_block < nblocks;
493                  next_not_all_visible_block++)
494         {
495                 if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer))
496                         break;
497                 vacuum_delay_point();
498         }
499         if (next_not_all_visible_block >= SKIP_PAGES_THRESHOLD)
500                 skipping_all_visible_blocks = true;
501         else
502                 skipping_all_visible_blocks = false;
503
504         for (blkno = 0; blkno < nblocks; blkno++)
505         {
506                 Buffer          buf;
507                 Page            page;
508                 OffsetNumber offnum,
509                                         maxoff;
510                 bool            tupgone,
511                                         hastup;
512                 int                     prev_dead_count;
513                 int                     nfrozen;
514                 Size            freespace;
515                 bool            all_visible_according_to_vm;
516                 bool            all_visible;
517                 bool            has_dead_tuples;
518                 TransactionId visibility_cutoff_xid = InvalidTransactionId;
519
520                 if (blkno == next_not_all_visible_block)
521                 {
522                         /* Time to advance next_not_all_visible_block */
523                         for (next_not_all_visible_block++;
524                                  next_not_all_visible_block < nblocks;
525                                  next_not_all_visible_block++)
526                         {
527                                 if (!visibilitymap_test(onerel, next_not_all_visible_block,
528                                                                                 &vmbuffer))
529                                         break;
530                                 vacuum_delay_point();
531                         }
532
533                         /*
534                          * We know we can't skip the current block.  But set up
535                          * skipping_all_visible_blocks to do the right thing at the
536                          * following blocks.
537                          */
538                         if (next_not_all_visible_block - blkno > SKIP_PAGES_THRESHOLD)
539                                 skipping_all_visible_blocks = true;
540                         else
541                                 skipping_all_visible_blocks = false;
542                         all_visible_according_to_vm = false;
543                 }
544                 else
545                 {
546                         /* Current block is all-visible */
547                         if (skipping_all_visible_blocks && !scan_all)
548                                 continue;
549                         all_visible_according_to_vm = true;
550                 }
551
552                 vacuum_delay_point();
553
554                 /*
555                  * If we are close to overrunning the available space for dead-tuple
556                  * TIDs, pause and do a cycle of vacuuming before we tackle this page.
557                  */
558                 if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
559                         vacrelstats->num_dead_tuples > 0)
560                 {
561                         /*
562                          * Before beginning index vacuuming, we release any pin we may
563                          * hold on the visibility map page.  This isn't necessary for
564                          * correctness, but we do it anyway to avoid holding the pin
565                          * across a lengthy, unrelated operation.
566                          */
567                         if (BufferIsValid(vmbuffer))
568                         {
569                                 ReleaseBuffer(vmbuffer);
570                                 vmbuffer = InvalidBuffer;
571                         }
572
573                         /* Log cleanup info before we touch indexes */
574                         vacuum_log_cleanup_info(onerel, vacrelstats);
575
576                         /* Remove index entries */
577                         for (i = 0; i < nindexes; i++)
578                                 lazy_vacuum_index(Irel[i],
579                                                                   &indstats[i],
580                                                                   vacrelstats);
581                         /* Remove tuples from heap */
582                         lazy_vacuum_heap(onerel, vacrelstats);
583
584                         /*
585                          * Forget the now-vacuumed tuples, and press on, but be careful
586                          * not to reset latestRemovedXid since we want that value to be
587                          * valid.
588                          */
589                         vacrelstats->num_dead_tuples = 0;
590                         vacrelstats->num_index_scans++;
591                 }
592
593                 /*
594                  * Pin the visibility map page in case we need to mark the page
595                  * all-visible.  In most cases this will be very cheap, because we'll
596                  * already have the correct page pinned anyway.  However, it's
597                  * possible that (a) next_not_all_visible_block is covered by a
598                  * different VM page than the current block or (b) we released our pin
599                  * and did a cycle of index vacuuming.
600                  */
601                 visibilitymap_pin(onerel, blkno, &vmbuffer);
602
603                 buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno,
604                                                                  RBM_NORMAL, vac_strategy);
605
606                 /* We need buffer cleanup lock so that we can prune HOT chains. */
607                 if (!ConditionalLockBufferForCleanup(buf))
608                 {
609                         /*
610                          * If we're not scanning the whole relation to guard against XID
611                          * wraparound, it's OK to skip vacuuming a page.  The next vacuum
612                          * will clean it up.
613                          */
614                         if (!scan_all)
615                         {
616                                 ReleaseBuffer(buf);
617                                 continue;
618                         }
619
620                         /*
621                          * If this is a wraparound checking vacuum, then we read the page
622                          * with share lock to see if any xids need to be frozen. If the
623                          * page doesn't need attention we just skip and continue. If it
624                          * does, we wait for cleanup lock.
625                          *
626                          * We could defer the lock request further by remembering the page
627                          * and coming back to it later, or we could even register
628                          * ourselves for multiple buffers and then service whichever one
629                          * is received first.  For now, this seems good enough.
630                          */
631                         LockBuffer(buf, BUFFER_LOCK_SHARE);
632                         if (!lazy_check_needs_freeze(buf))
633                         {
634                                 UnlockReleaseBuffer(buf);
635                                 vacrelstats->scanned_pages++;
636                                 continue;
637                         }
638                         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
639                         LockBufferForCleanup(buf);
640                         /* drop through to normal processing */
641                 }
642
643                 vacrelstats->scanned_pages++;
644
645                 page = BufferGetPage(buf);
646
647                 if (PageIsNew(page))
648                 {
649                         /*
650                          * An all-zeroes page could be left over if a backend extends the
651                          * relation but crashes before initializing the page. Reclaim such
652                          * pages for use.
653                          *
654                          * We have to be careful here because we could be looking at a
655                          * page that someone has just added to the relation and not yet
656                          * been able to initialize (see RelationGetBufferForTuple). To
657                          * protect against that, release the buffer lock, grab the
658                          * relation extension lock momentarily, and re-lock the buffer. If
659                          * the page is still uninitialized by then, it must be left over
660                          * from a crashed backend, and we can initialize it.
661                          *
662                          * We don't really need the relation lock when this is a new or
663                          * temp relation, but it's probably not worth the code space to
664                          * check that, since this surely isn't a critical path.
665                          *
666                          * Note: the comparable code in vacuum.c need not worry because
667                          * it's got exclusive lock on the whole relation.
668                          */
669                         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
670                         LockRelationForExtension(onerel, ExclusiveLock);
671                         UnlockRelationForExtension(onerel, ExclusiveLock);
672                         LockBufferForCleanup(buf);
673                         if (PageIsNew(page))
674                         {
675                                 ereport(WARNING,
676                                 (errmsg("relation \"%s\" page %u is uninitialized --- fixing",
677                                                 relname, blkno)));
678                                 PageInit(page, BufferGetPageSize(buf), 0);
679                                 empty_pages++;
680                         }
681                         freespace = PageGetHeapFreeSpace(page);
682                         MarkBufferDirty(buf);
683                         UnlockReleaseBuffer(buf);
684
685                         RecordPageWithFreeSpace(onerel, blkno, freespace);
686                         continue;
687                 }
688
689                 if (PageIsEmpty(page))
690                 {
691                         empty_pages++;
692                         freespace = PageGetHeapFreeSpace(page);
693
694                         /* empty pages are always all-visible */
695                         if (!PageIsAllVisible(page))
696                         {
697                                 START_CRIT_SECTION();
698
699                                 /* mark buffer dirty before writing a WAL record */
700                                 MarkBufferDirty(buf);
701
702                                 /*
703                                  * It's possible that another backend has extended the heap,
704                                  * initialized the page, and then failed to WAL-log the page
705                                  * due to an ERROR.  Since heap extension is not WAL-logged,
706                                  * recovery might try to replay our record setting the
707                                  * page all-visible and find that the page isn't initialized,
708                                  * which will cause a PANIC.  To prevent that, check whether
709                                  * the page has been previously WAL-logged, and if not, do that
710                                  * now.
711                                  */
712                                 if (RelationNeedsWAL(onerel) &&
713                                         PageGetLSN(page) == InvalidXLogRecPtr)
714                                         log_newpage_buffer(buf, true);
715
716                                 PageSetAllVisible(page);
717                                 visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
718                                                                   vmbuffer, InvalidTransactionId);
719                                 END_CRIT_SECTION();
720                         }
721
722                         UnlockReleaseBuffer(buf);
723                         RecordPageWithFreeSpace(onerel, blkno, freespace);
724                         continue;
725                 }
726
727                 /*
728                  * Prune all HOT-update chains in this page.
729                  *
730                  * We count tuples removed by the pruning step as removed by VACUUM.
731                  */
732                 tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false,
733                                                                                  &vacrelstats->latestRemovedXid);
734
735                 /*
736                  * Now scan the page to collect vacuumable items and check for tuples
737                  * requiring freezing.
738                  */
739                 all_visible = true;
740                 has_dead_tuples = false;
741                 nfrozen = 0;
742                 hastup = false;
743                 prev_dead_count = vacrelstats->num_dead_tuples;
744                 maxoff = PageGetMaxOffsetNumber(page);
745
746                 /*
747                  * Note: If you change anything in the loop below, also look at
748                  * heap_page_is_all_visible to see if that needs to be changed.
749                  */
750                 for (offnum = FirstOffsetNumber;
751                          offnum <= maxoff;
752                          offnum = OffsetNumberNext(offnum))
753                 {
754                         ItemId          itemid;
755
756                         itemid = PageGetItemId(page, offnum);
757
758                         /* Unused items require no processing, but we count 'em */
759                         if (!ItemIdIsUsed(itemid))
760                         {
761                                 nunused += 1;
762                                 continue;
763                         }
764
765                         /* Redirect items mustn't be touched */
766                         if (ItemIdIsRedirected(itemid))
767                         {
768                                 hastup = true;  /* this page won't be truncatable */
769                                 continue;
770                         }
771
772                         ItemPointerSet(&(tuple.t_self), blkno, offnum);
773
774                         /*
775                          * DEAD item pointers are to be vacuumed normally; but we don't
776                          * count them in tups_vacuumed, else we'd be double-counting (at
777                          * least in the common case where heap_page_prune() just freed up
778                          * a non-HOT tuple).
779                          */
780                         if (ItemIdIsDead(itemid))
781                         {
782                                 lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
783                                 all_visible = false;
784                                 continue;
785                         }
786
787                         Assert(ItemIdIsNormal(itemid));
788
789                         tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
790                         tuple.t_len = ItemIdGetLength(itemid);
791                         tuple.t_tableOid = RelationGetRelid(onerel);
792
793                         tupgone = false;
794
795                         switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf))
796                         {
797                                 case HEAPTUPLE_DEAD:
798
799                                         /*
800                                          * Ordinarily, DEAD tuples would have been removed by
801                                          * heap_page_prune(), but it's possible that the tuple
802                                          * state changed since heap_page_prune() looked.  In
803                                          * particular an INSERT_IN_PROGRESS tuple could have
804                                          * changed to DEAD if the inserter aborted.  So this
805                                          * cannot be considered an error condition.
806                                          *
807                                          * If the tuple is HOT-updated then it must only be
808                                          * removed by a prune operation; so we keep it just as if
809                                          * it were RECENTLY_DEAD.  Also, if it's a heap-only
810                                          * tuple, we choose to keep it, because it'll be a lot
811                                          * cheaper to get rid of it in the next pruning pass than
812                                          * to treat it like an indexed tuple.
813                                          */
814                                         if (HeapTupleIsHotUpdated(&tuple) ||
815                                                 HeapTupleIsHeapOnly(&tuple))
816                                                 nkeep += 1;
817                                         else
818                                                 tupgone = true; /* we can delete the tuple */
819                                         all_visible = false;
820                                         break;
821                                 case HEAPTUPLE_LIVE:
822                                         /* Tuple is good --- but let's do some validity checks */
823                                         if (onerel->rd_rel->relhasoids &&
824                                                 !OidIsValid(HeapTupleGetOid(&tuple)))
825                                                 elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid",
826                                                          relname, blkno, offnum);
827
828                                         /*
829                                          * Is the tuple definitely visible to all transactions?
830                                          *
831                                          * NB: Like with per-tuple hint bits, we can't set the
832                                          * PD_ALL_VISIBLE flag if the inserter committed
833                                          * asynchronously. See SetHintBits for more info. Check
834                                          * that the tuple is hinted xmin-committed because
835                                          * of that.
836                                          */
837                                         if (all_visible)
838                                         {
839                                                 TransactionId xmin;
840
841                                                 if (!HeapTupleHeaderXminCommitted(tuple.t_data))
842                                                 {
843                                                         all_visible = false;
844                                                         break;
845                                                 }
846
847                                                 /*
848                                                  * The inserter definitely committed. But is it old
849                                                  * enough that everyone sees it as committed?
850                                                  */
851                                                 xmin = HeapTupleHeaderGetXmin(tuple.t_data);
852                                                 if (!TransactionIdPrecedes(xmin, OldestXmin))
853                                                 {
854                                                         all_visible = false;
855                                                         break;
856                                                 }
857
858                                                 /* Track newest xmin on page. */
859                                                 if (TransactionIdFollows(xmin, visibility_cutoff_xid))
860                                                         visibility_cutoff_xid = xmin;
861                                         }
862                                         break;
863                                 case HEAPTUPLE_RECENTLY_DEAD:
864
865                                         /*
866                                          * If tuple is recently deleted then we must not remove it
867                                          * from relation.
868                                          */
869                                         nkeep += 1;
870                                         all_visible = false;
871                                         break;
872                                 case HEAPTUPLE_INSERT_IN_PROGRESS:
873                                         /* This is an expected case during concurrent vacuum */
874                                         all_visible = false;
875                                         break;
876                                 case HEAPTUPLE_DELETE_IN_PROGRESS:
877                                         /* This is an expected case during concurrent vacuum */
878                                         all_visible = false;
879                                         break;
880                                 default:
881                                         elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
882                                         break;
883                         }
884
885                         if (tupgone)
886                         {
887                                 lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
888                                 HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data,
889                                                                                          &vacrelstats->latestRemovedXid);
890                                 tups_vacuumed += 1;
891                                 has_dead_tuples = true;
892                         }
893                         else
894                         {
895                                 num_tuples += 1;
896                                 hastup = true;
897
898                                 /*
899                                  * Each non-removable tuple must be checked to see if it needs
900                                  * freezing.  Note we already have exclusive buffer lock.
901                                  */
902                                 if (heap_prepare_freeze_tuple(tuple.t_data, FreezeLimit,
903                                                                                   MultiXactCutoff, &frozen[nfrozen]))
904                                         frozen[nfrozen++].offset = offnum;
905                         }
906                 }                                               /* scan along page */
907
908                 /*
909                  * If we froze any tuples, mark the buffer dirty, and write a WAL
910                  * record recording the changes.  We must log the changes to be
911                  * crash-safe against future truncation of CLOG.
912                  */
913                 if (nfrozen > 0)
914                 {
915                         START_CRIT_SECTION();
916
917                         MarkBufferDirty(buf);
918
919                         /* execute collected freezes */
920                         for (i = 0; i < nfrozen; i++)
921                         {
922                                 ItemId          itemid;
923                                 HeapTupleHeader htup;
924
925                                 itemid = PageGetItemId(page, frozen[i].offset);
926                                 htup = (HeapTupleHeader) PageGetItem(page, itemid);
927
928                                 heap_execute_freeze_tuple(htup, &frozen[i]);
929                         }
930
 931                         /* Now WAL-log freezing if necessary */
932                         if (RelationNeedsWAL(onerel))
933                         {
934                                 XLogRecPtr      recptr;
935
936                                 recptr = log_heap_freeze(onerel, buf, FreezeLimit,
937                                                                                  frozen, nfrozen);
938                                 PageSetLSN(page, recptr);
939                         }
940
941                         END_CRIT_SECTION();
942                 }
943
944                 /*
945                  * If there are no indexes then we can vacuum the page right now
946                  * instead of doing a second scan.
947                  */
948                 if (nindexes == 0 &&
949                         vacrelstats->num_dead_tuples > 0)
950                 {
951                         /* Remove tuples from heap */
952                         lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats, &vmbuffer);
953                         has_dead_tuples = false;
954
955                         /*
956                          * Forget the now-vacuumed tuples, and press on, but be careful
957                          * not to reset latestRemovedXid since we want that value to be
958                          * valid.
959                          */
960                         vacrelstats->num_dead_tuples = 0;
961                         vacuumed_pages++;
962                 }
963
964                 freespace = PageGetHeapFreeSpace(page);
965
966                 /* mark page all-visible, if appropriate */
967                 if (all_visible && !all_visible_according_to_vm)
968                 {
969                         /*
970                          * It should never be the case that the visibility map page is set
971                          * while the page-level bit is clear, but the reverse is allowed
 972                          * (if checksums are not enabled).  Regardless, set both bits
973                          * so that we get back in sync.
974                          *
975                          * NB: If the heap page is all-visible but the VM bit is not set,
976                          * we don't need to dirty the heap page.  However, if checksums
977                          * are enabled, we do need to make sure that the heap page is
978                          * dirtied before passing it to visibilitymap_set(), because it
979                          * may be logged.  Given that this situation should only happen in
980                          * rare cases after a crash, it is not worth optimizing.
981                          */
982                         PageSetAllVisible(page);
983                         MarkBufferDirty(buf);
984                         visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
985                                                           vmbuffer, visibility_cutoff_xid);
986                 }
987
988                 /*
989                  * As of PostgreSQL 9.2, the visibility map bit should never be set if
990                  * the page-level bit is clear.  However, it's possible that the bit
991                  * got cleared after we checked it and before we took the buffer
992                  * content lock, so we must recheck before jumping to the conclusion
993                  * that something bad has happened.
994                  */
995                 else if (all_visible_according_to_vm && !PageIsAllVisible(page)
996                                  && visibilitymap_test(onerel, blkno, &vmbuffer))
997                 {
998                         elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
999                                  relname, blkno);
1000                         visibilitymap_clear(onerel, blkno, vmbuffer);
1001                 }
1002
1003                 /*
1004                  * It's possible for the value returned by GetOldestXmin() to move
1005                  * backwards, so it's not wrong for us to see tuples that appear to
1006                  * not be visible to everyone yet, while PD_ALL_VISIBLE is already
1007                  * set. The real safe xmin value never moves backwards, but
1008                  * GetOldestXmin() is conservative and sometimes returns a value
1009                  * that's unnecessarily small, so if we see that contradiction it just
1010                  * means that the tuples that we think are not visible to everyone yet
1011                  * actually are, and the PD_ALL_VISIBLE flag is correct.
1012                  *
1013                  * There should never be dead tuples on a page with PD_ALL_VISIBLE
1014                  * set, however.
1015                  */
1016                 else if (PageIsAllVisible(page) && has_dead_tuples)
1017                 {
1018                         elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u",
1019                                  relname, blkno);
1020                         PageClearAllVisible(page);
1021                         MarkBufferDirty(buf);
1022                         visibilitymap_clear(onerel, blkno, vmbuffer);
1023                 }
1024
1025                 UnlockReleaseBuffer(buf);
1026
1027                 /* Remember the location of the last page with nonremovable tuples */
1028                 if (hastup)
1029                         vacrelstats->nonempty_pages = blkno + 1;
1030
1031                 /*
1032                  * If we remembered any tuples for deletion, then the page will be
1033                  * visited again by lazy_vacuum_heap, which will compute and record
 1034                  * its post-compaction free space.  If not, then we're done with this
 1035                  * page, so remember its free space as-is.  (This path will always be
1036                  * taken if there are no indexes.)
1037                  */
1038                 if (vacrelstats->num_dead_tuples == prev_dead_count)
1039                         RecordPageWithFreeSpace(onerel, blkno, freespace);
1040         }
1041
1042         pfree(frozen);
1043
1044         /* save stats for use later */
1045         vacrelstats->scanned_tuples = num_tuples;
1046         vacrelstats->tuples_deleted = tups_vacuumed;
1047         vacrelstats->new_dead_tuples = nkeep;
1048
1049         /* now we can compute the new value for pg_class.reltuples */
1050         vacrelstats->new_rel_tuples = vac_estimate_reltuples(onerel, false,
1051                                                                                                                  nblocks,
1052                                                                                                   vacrelstats->scanned_pages,
1053                                                                                                                  num_tuples);
1054
1055         /*
1056          * Release any remaining pin on visibility map page.
1057          */
1058         if (BufferIsValid(vmbuffer))
1059         {
1060                 ReleaseBuffer(vmbuffer);
1061                 vmbuffer = InvalidBuffer;
1062         }
1063
1064         /* If any tuples need to be deleted, perform final vacuum cycle */
1065         /* XXX put a threshold on min number of tuples here? */
1066         if (vacrelstats->num_dead_tuples > 0)
1067         {
1068                 /* Log cleanup info before we touch indexes */
1069                 vacuum_log_cleanup_info(onerel, vacrelstats);
1070
1071                 /* Remove index entries */
1072                 for (i = 0; i < nindexes; i++)
1073                         lazy_vacuum_index(Irel[i],
1074                                                           &indstats[i],
1075                                                           vacrelstats);
1076                 /* Remove tuples from heap */
1077                 lazy_vacuum_heap(onerel, vacrelstats);
1078                 vacrelstats->num_index_scans++;
1079         }
1080
1081         /* Do post-vacuum cleanup and statistics update for each index */
1082         for (i = 0; i < nindexes; i++)
1083                 lazy_cleanup_index(Irel[i], indstats[i], vacrelstats);
1084
1085         /* If no indexes, make log report that lazy_vacuum_heap would've made */
1086         if (vacuumed_pages)
1087                 ereport(elevel,
1088                                 (errmsg("\"%s\": removed %.0f row versions in %u pages",
1089                                                 RelationGetRelationName(onerel),
1090                                                 tups_vacuumed, vacuumed_pages)));
1091
1092         ereport(elevel,
1093                         (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages",
1094                                         RelationGetRelationName(onerel),
1095                                         tups_vacuumed, num_tuples,
1096                                         vacrelstats->scanned_pages, nblocks),
1097                          errdetail("%.0f dead row versions cannot be removed yet.\n"
1098                                            "There were %.0f unused item pointers.\n"
1099                                            "%u pages are entirely empty.\n"
1100                                            "%s.",
1101                                            nkeep,
1102                                            nunused,
1103                                            empty_pages,
1104                                            pg_rusage_show(&ru0))));
1105 }
1106
1107
1108 /*
1109  *      lazy_vacuum_heap() -- second pass over the heap
1110  *
1111  *              This routine marks dead tuples as unused and compacts out free
1112  *              space on their pages.  Pages not having dead tuples recorded from
1113  *              lazy_scan_heap are not visited at all.
1114  *
1115  * Note: the reason for doing this as a second pass is we cannot remove
1116  * the tuples until we've removed their index entries, and we want to
1117  * process index entry removal in batches as large as possible.
1118  */
1119 static void
1120 lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
1121 {
1122         int                     tupindex;
1123         int                     npages;
1124         PGRUsage        ru0;
1125         Buffer          vmbuffer = InvalidBuffer;
1126
1127         pg_rusage_init(&ru0);
1128         npages = 0;
1129
1130         tupindex = 0;
1131         while (tupindex < vacrelstats->num_dead_tuples)
1132         {
1133                 BlockNumber tblk;
1134                 Buffer          buf;
1135                 Page            page;
1136                 Size            freespace;
1137
1138                 vacuum_delay_point();
1139
1140                 tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
1141                 buf = ReadBufferExtended(onerel, MAIN_FORKNUM, tblk, RBM_NORMAL,
1142                                                                  vac_strategy);
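                /*
                 * If we can't get the cleanup lock without waiting, skip this TID
                 * rather than block.  The page's dead line pointers simply remain
                 * until a future vacuum revisits them, which is harmless since
                 * their index entries have already been removed.
                 */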
1143                 if (!ConditionalLockBufferForCleanup(buf))
1144                 {
1145                         ReleaseBuffer(buf);
1146                         ++tupindex;
1147                         continue;
1148                 }
1149                 tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats,
1150                                                                         &vmbuffer);
1151
1152                 /* Now that we've compacted the page, record its available space */
1153                 page = BufferGetPage(buf);
1154                 freespace = PageGetHeapFreeSpace(page);
1155
1156                 UnlockReleaseBuffer(buf);
1157                 RecordPageWithFreeSpace(onerel, tblk, freespace);
1158                 npages++;
1159         }
1160
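        /* Release any visibility map page we may still have pinned. */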
1161         if (BufferIsValid(vmbuffer))
1162         {
1163                 ReleaseBuffer(vmbuffer);
1164                 vmbuffer = InvalidBuffer;
1165         }
1166
1167         ereport(elevel,
1168                         (errmsg("\"%s\": removed %d row versions in %d pages",
1169                                         RelationGetRelationName(onerel),
1170                                         tupindex, npages),
1171                          errdetail("%s.",
1172                                            pg_rusage_show(&ru0))));
1173 }
1174
1175 /*
1176  *      lazy_vacuum_page() -- free dead tuples on a page
1177  *                                       and repair its fragmentation.
1178  *
1179  * Caller must hold pin and buffer cleanup lock on the buffer.
1180  *
1181  * tupindex is the index in vacrelstats->dead_tuples of the first dead
1182  * tuple for this page.  We assume the rest follow sequentially.
1183  * The return value is the first tupindex after the tuples of this page.
1184  */
1185 static int
1186 lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
1187                                  int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer)
1188 {
1189         Page            page = BufferGetPage(buffer);
1190         OffsetNumber unused[MaxOffsetNumber];
1191         int                     uncnt = 0;
1192         TransactionId visibility_cutoff_xid;
1193
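        /*
         * The page modifications and the WAL record describing them must be
         * applied atomically, so do all of it inside a critical section.
         */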
1194         START_CRIT_SECTION();
1195
1196         for (; tupindex < vacrelstats->num_dead_tuples; tupindex++)
1197         {
1198                 BlockNumber tblk;
1199                 OffsetNumber toff;
1200                 ItemId          itemid;
1201
1202                 tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
1203                 if (tblk != blkno)
1204                         break;                          /* past end of tuples for this block */
1205                 toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]);
1206                 itemid = PageGetItemId(page, toff);
1207                 ItemIdSetUnused(itemid);
1208                 unused[uncnt++] = toff;
1209         }
1210
1211         PageRepairFragmentation(page);
1212
1213         /*
1214          * Mark buffer dirty before we write WAL.
1215          */
1216         MarkBufferDirty(buffer);
1217
1218         /* XLOG stuff */
1219         if (RelationNeedsWAL(onerel))
1220         {
1221                 XLogRecPtr      recptr;
1222
1223                 recptr = log_heap_clean(onerel, buffer,
1224                                                                 NULL, 0, NULL, 0,
1225                                                                 unused, uncnt,
1226                                                                 vacrelstats->latestRemovedXid);
1227                 PageSetLSN(page, recptr);
1228         }
1229
1230         /*
1231          * Now that we have removed the dead tuples from the page, once again
1232          * check if the page has become all-visible.
1233          */
1234         if (!visibilitymap_test(onerel, blkno, vmbuffer) &&
1235                 heap_page_is_all_visible(onerel, buffer, &visibility_cutoff_xid))
1236         {
1237                 Assert(BufferIsValid(*vmbuffer));
1238                 PageSetAllVisible(page);
1239                 visibilitymap_set(onerel, blkno, buffer, InvalidXLogRecPtr, *vmbuffer,
1240                                                   visibility_cutoff_xid);
1241         }
1242
1243         END_CRIT_SECTION();
1244
1245         return tupindex;
1246 }
1247
1248 /*
1249  *      lazy_check_needs_freeze() -- scan page to see if any tuples
1250  *                                       need to be cleaned to avoid wraparound
1251  *
1252  * Returns true if the page needs to be vacuumed using cleanup lock.
1253  */
1254 static bool
1255 lazy_check_needs_freeze(Buffer buf)
1256 {
1257         Page            page;
1258         OffsetNumber offnum,
1259                                 maxoff;
1260         HeapTupleHeader tupleheader;
1261
1262         page = BufferGetPage(buf);
1263
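        /* New or empty pages contain no tuples, so nothing can need freezing. */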
1264         if (PageIsNew(page) || PageIsEmpty(page))
1265         {
1266                 /* PageIsNew probably shouldn't happen... */
1267                 return false;
1268         }
1269
1270         maxoff = PageGetMaxOffsetNumber(page);
1271         for (offnum = FirstOffsetNumber;
1272                  offnum <= maxoff;
1273                  offnum = OffsetNumberNext(offnum))
1274         {
1275                 ItemId          itemid;
1276
1277                 itemid = PageGetItemId(page, offnum);
1278
1279                 if (!ItemIdIsNormal(itemid))
1280                         continue;
1281
1282                 tupleheader = (HeapTupleHeader) PageGetItem(page, itemid);
1283
1284                 if (heap_tuple_needs_freeze(tupleheader, FreezeLimit,
1285                                                                         MultiXactCutoff, buf))
1286                         return true;
1287         }                                                       /* scan along page */
1288
1289         return false;
1290 }
1291
1292
1293 /*
1294  *      lazy_vacuum_index() -- vacuum one index relation.
1295  *
1296  *              Delete all the index entries pointing to tuples listed in
1297  *              vacrelstats->dead_tuples, and update running statistics.
1298  */
1299 static void
1300 lazy_vacuum_index(Relation indrel,
1301                                   IndexBulkDeleteResult **stats,
1302                                   LVRelStats *vacrelstats)
1303 {
1304         IndexVacuumInfo ivinfo;
1305         PGRUsage        ru0;
1306
1307         pg_rusage_init(&ru0);
1308
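        /*
         * Set up the bulk-delete request.  At this point we only have the
         * pre-vacuum reltuples figure, so tell the index AM that
         * num_heap_tuples is just an estimate.
         */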
1309         ivinfo.index = indrel;
1310         ivinfo.analyze_only = false;
1311         ivinfo.estimated_count = true;
1312         ivinfo.message_level = elevel;
1313         ivinfo.num_heap_tuples = vacrelstats->old_rel_tuples;
1314         ivinfo.strategy = vac_strategy;
1315
1316         /* Do bulk deletion */
1317         *stats = index_bulk_delete(&ivinfo, *stats,
1318                                                            lazy_tid_reaped, (void *) vacrelstats);
1319
1320         ereport(elevel,
1321                         (errmsg("scanned index \"%s\" to remove %d row versions",
1322                                         RelationGetRelationName(indrel),
1323                                         vacrelstats->num_dead_tuples),
1324                          errdetail("%s.", pg_rusage_show(&ru0))));
1325 }
1326
1327 /*
1328  *      lazy_cleanup_index() -- do post-vacuum cleanup for one index relation.
1329  */
1330 static void
1331 lazy_cleanup_index(Relation indrel,
1332                                    IndexBulkDeleteResult *stats,
1333                                    LVRelStats *vacrelstats)
1334 {
1335         IndexVacuumInfo ivinfo;
1336         PGRUsage        ru0;
1337
1338         pg_rusage_init(&ru0);
1339
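        /*
         * The new reltuples figure is exact only if we scanned every page;
         * otherwise tell the index AM that it is an estimate.
         */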
1340         ivinfo.index = indrel;
1341         ivinfo.analyze_only = false;
1342         ivinfo.estimated_count = (vacrelstats->scanned_pages < vacrelstats->rel_pages);
1343         ivinfo.message_level = elevel;
1344         ivinfo.num_heap_tuples = vacrelstats->new_rel_tuples;
1345         ivinfo.strategy = vac_strategy;
1346
1347         stats = index_vacuum_cleanup(&ivinfo, stats);
1348
1349         if (!stats)
1350                 return;
1351
1352         /*
1353          * Now update statistics in pg_class, but only if the index says the count
1354          * is accurate.
1355          */
1356         if (!stats->estimated_count)
1357                 vac_update_relstats(indrel,
1358                                                         stats->num_pages,
1359                                                         stats->num_index_tuples,
1360                                                         0,
1361                                                         false,
1362                                                         InvalidTransactionId,
1363                                                         InvalidMultiXactId);
1364
1365         ereport(elevel,
1366                         (errmsg("index \"%s\" now contains %.0f row versions in %u pages",
1367                                         RelationGetRelationName(indrel),
1368                                         stats->num_index_tuples,
1369                                         stats->num_pages),
1370                          errdetail("%.0f index row versions were removed.\n"
1371                          "%u index pages have been deleted, %u are currently reusable.\n"
1372                                            "%s.",
1373                                            stats->tuples_removed,
1374                                            stats->pages_deleted, stats->pages_free,
1375                                            pg_rusage_show(&ru0))));
1376
1377         pfree(stats);
1378 }
1379
1380 /*
1381  * lazy_truncate_heap - try to truncate off any empty pages at the end
1382  */
1383 static void
1384 lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats)
1385 {
1386         BlockNumber old_rel_pages = vacrelstats->rel_pages;
1387         BlockNumber new_rel_pages;
1388         PGRUsage        ru0;
1389         int                     lock_retry;
1390
1391         pg_rusage_init(&ru0);
1392
1393         /*
1394          * Loop until no more truncating can be done.
1395          */
1396         do
1397         {
1398                 /*
1399                  * We need full exclusive lock on the relation in order to do
1400                  * truncation. If we can't get it, give up rather than waiting --- we
1401                  * don't want to block other backends, and we don't want to deadlock
1402                  * (which is quite possible considering we already hold a lower-grade
1403                  * lock).
1404                  */
1405                 vacrelstats->lock_waiter_detected = false;
1406                 lock_retry = 0;
1407                 while (true)
1408                 {
1409                         if (ConditionalLockRelation(onerel, AccessExclusiveLock))
1410                                 break;
1411
1412                         /*
1413                          * Check for interrupts while trying to (re-)acquire the exclusive
1414                          * lock.
1415                          */
1416                         CHECK_FOR_INTERRUPTS();
1417
1418                         if (++lock_retry > (VACUUM_TRUNCATE_LOCK_TIMEOUT /
1419                                                                 VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL))
1420                         {
1421                                 /*
1422                                  * We failed to establish the lock in the specified number of
1423                                  * retries, so give up on truncating.
1424                                  */
1425                                 vacrelstats->lock_waiter_detected = true;
1426                                 ereport(elevel,
1427                                                 (errmsg("\"%s\": stopping truncate due to conflicting lock request",
1428                                                                 RelationGetRelationName(onerel))));
1429                                 return;
1430                         }
1431
1432                         pg_usleep(VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL * 1000L);         /* constant is in ms */
1433                 }
1434
1435                 /*
1436                  * Now that we have exclusive lock, look to see if the rel has grown
1437                  * whilst we were vacuuming with non-exclusive lock.  If so, give up;
1438                  * the newly added pages presumably contain non-deletable tuples.
1439                  */
1440                 new_rel_pages = RelationGetNumberOfBlocks(onerel);
1441                 if (new_rel_pages != old_rel_pages)
1442                 {
1443                         /*
1444                          * Note: we intentionally don't update vacrelstats->rel_pages with
1445                          * the new rel size here.  If we did, it would amount to assuming
1446                          * that the new pages are empty, which is unlikely. Leaving the
1447                          * numbers alone amounts to assuming that the new pages have the
1448                          * same tuple density as existing ones, which is less unlikely.
1449                          */
1450                         UnlockRelation(onerel, AccessExclusiveLock);
1451                         return;
1452                 }
1453
1454                 /*
1455                  * Scan backwards from the end to verify that the end pages actually
1456                  * contain no tuples.  This is *necessary*, not optional, because
1457                  * other backends could have added tuples to these pages whilst we
1458                  * were vacuuming.
1459                  */
1460                 new_rel_pages = count_nondeletable_pages(onerel, vacrelstats);
1461
1462                 if (new_rel_pages >= old_rel_pages)
1463                 {
1464                         /* can't do anything after all */
1465                         UnlockRelation(onerel, AccessExclusiveLock);
1466                         return;
1467                 }
1468
1469                 /*
1470                  * Okay to truncate.
1471                  */
1472                 RelationTruncate(onerel, new_rel_pages);
1473
1474                 /*
1475                  * We can release the exclusive lock as soon as we have truncated.
1476                  * Other backends can't safely access the relation until they have
1477                  * processed the smgr invalidation that smgrtruncate sent out ... but
1478                  * that should happen as part of standard invalidation processing once
1479                  * they acquire lock on the relation.
1480                  */
1481                 UnlockRelation(onerel, AccessExclusiveLock);
1482
1483                 /*
1484                  * Update statistics.  Here, it *is* correct to adjust rel_pages
1485                  * without also touching reltuples, since the tuple count wasn't
1486                  * changed by the truncation.
1487                  */
1488                 vacrelstats->pages_removed += old_rel_pages - new_rel_pages;
1489                 vacrelstats->rel_pages = new_rel_pages;
1490
1491                 ereport(elevel,
1492                                 (errmsg("\"%s\": truncated %u to %u pages",
1493                                                 RelationGetRelationName(onerel),
1494                                                 old_rel_pages, new_rel_pages),
1495                                  errdetail("%s.",
1496                                                    pg_rusage_show(&ru0))));
1497                 old_rel_pages = new_rel_pages;
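                /*
                 * Repeat only if the backwards scan stopped early because of a
                 * waiting lock request and there may still be truncatable pages
                 * beyond the last known-nonempty one.
                 */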
1498         } while (new_rel_pages > vacrelstats->nonempty_pages &&
1499                          vacrelstats->lock_waiter_detected);
1500 }
1501
1502 /*
1503  * Rescan end pages to verify that they are (still) empty of tuples.
1504  *
1505  * Returns number of nondeletable pages (last nonempty page + 1).
1506  */
1507 static BlockNumber
1508 count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats)
1509 {
1510         BlockNumber blkno;
1511         instr_time      starttime;
1512
1513         /* Initialize the start time for the conflicting-lock-request checks below */
1514         INSTR_TIME_SET_CURRENT(starttime);
1515
1516         /* Strange coding of loop control is needed because blkno is unsigned */
1517         blkno = vacrelstats->rel_pages;
1518         while (blkno > vacrelstats->nonempty_pages)
1519         {
1520                 Buffer          buf;
1521                 Page            page;
1522                 OffsetNumber offnum,
1523                                         maxoff;
1524                 bool            hastup;
1525
1526                 /*
1527                  * Check if another process requests a lock on our relation. We are
1528                  * holding an AccessExclusiveLock here, so they will be waiting. We
1529                  * only do this once per VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL, and we
1530                  * only check if that interval has elapsed once every 32 blocks to
1531                  * keep the number of system calls and actual shared lock table
1532                  * lookups to a minimum.
1533                  */
1534                 if ((blkno % 32) == 0)
1535                 {
1536                         instr_time      currenttime;
1537                         instr_time      elapsed;
1538
1539                         INSTR_TIME_SET_CURRENT(currenttime);
1540                         elapsed = currenttime;
1541                         INSTR_TIME_SUBTRACT(elapsed, starttime);
1542                         if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000)
1543                                 >= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
1544                         {
1545                                 if (LockHasWaitersRelation(onerel, AccessExclusiveLock))
1546                                 {
1547                                         ereport(elevel,
1548                                                         (errmsg("\"%s\": suspending truncate due to conflicting lock request",
1549                                                                         RelationGetRelationName(onerel))));
1550
1551                                         vacrelstats->lock_waiter_detected = true;
1552                                         return blkno;
1553                                 }
1554                                 starttime = currenttime;
1555                         }
1556                 }
1557
1558                 /*
1559                  * We don't insert a vacuum delay point here, because we have an
1560                  * exclusive lock on the table which we want to hold for as short a
1561                  * time as possible.  We still need to check for interrupts however.
1562                  */
1563                 CHECK_FOR_INTERRUPTS();
1564
1565                 blkno--;
1566
1567                 buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno,
1568                                                                  RBM_NORMAL, vac_strategy);
1569
1570                 /* In this phase we only need shared access to the buffer */
1571                 LockBuffer(buf, BUFFER_LOCK_SHARE);
1572
1573                 page = BufferGetPage(buf);
1574
1575                 if (PageIsNew(page) || PageIsEmpty(page))
1576                 {
1577                         /* PageIsNew probably shouldn't happen... */
1578                         UnlockReleaseBuffer(buf);
1579                         continue;
1580                 }
1581
1582                 hastup = false;
1583                 maxoff = PageGetMaxOffsetNumber(page);
1584                 for (offnum = FirstOffsetNumber;
1585                          offnum <= maxoff;
1586                          offnum = OffsetNumberNext(offnum))
1587                 {
1588                         ItemId          itemid;
1589
1590                         itemid = PageGetItemId(page, offnum);
1591
1592                         /*
1593                          * Note: any non-unused item should be taken as a reason to keep
1594                          * this page.  We formerly thought that DEAD tuples could be
1595                          * thrown away, but that's not so, because we'd not have cleaned
1596                          * out their index entries.
1597                          */
1598                         if (ItemIdIsUsed(itemid))
1599                         {
1600                                 hastup = true;
1601                                 break;                  /* can stop scanning */
1602                         }
1603                 }                                               /* scan along page */
1604
1605                 UnlockReleaseBuffer(buf);
1606
1607                 /* Done scanning if we found a tuple here */
1608                 if (hastup)
1609                         return blkno + 1;
1610         }
1611
1612         /*
1613          * If we fall out of the loop, all the previously-thought-to-be-empty
1614          * pages still are; we need not bother to look at the last known-nonempty
1615          * page.
1616          */
1617         return vacrelstats->nonempty_pages;
1618 }
1619
1620 /*
1621  * lazy_space_alloc - space allocation decisions for lazy vacuum
1622  *
1623  * See the comments at the head of this file for rationale.
1624  */
1625 static void
1626 lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks)
1627 {
1628         long            maxtuples;
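        /*
         * Autovacuum workers use autovacuum_work_mem for the dead-tuple array
         * when it is set; everyone else uses maintenance_work_mem.
         */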
1629         int                     vac_work_mem = IsAutoVacuumWorkerProcess() &&
1630                                                                         autovacuum_work_mem != -1 ?
1631                                                                 autovacuum_work_mem : maintenance_work_mem;
1632
1633         if (vacrelstats->hasindex)
1634         {
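                /*
                 * Convert the memory budget (in kilobytes) into a number of TIDs,
                 * capped so that the array index fits in an int and the allocation
                 * stays within a single palloc chunk.
                 */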
1635                 maxtuples = (vac_work_mem * 1024L) / sizeof(ItemPointerData);
1636                 maxtuples = Min(maxtuples, INT_MAX);
1637                 maxtuples = Min(maxtuples, MaxAllocSize / sizeof(ItemPointerData));
1638
1639                 /* curious coding here to ensure the multiplication can't overflow */
1640                 if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks)
1641                         maxtuples = relblocks * LAZY_ALLOC_TUPLES;
1642
1643                 /* stay sane if small maintenance_work_mem */
1644                 maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);
1645         }
1646         else
1647         {
1648                 maxtuples = MaxHeapTuplesPerPage;
1649         }
1650
1651         vacrelstats->num_dead_tuples = 0;
1652         vacrelstats->max_dead_tuples = (int) maxtuples;
1653         vacrelstats->dead_tuples = (ItemPointer)
1654                 palloc(maxtuples * sizeof(ItemPointerData));
1655 }
1656
1657 /*
1658  * lazy_record_dead_tuple - remember one deletable tuple
1659  */
1660 static void
1661 lazy_record_dead_tuple(LVRelStats *vacrelstats,
1662                                            ItemPointer itemptr)
1663 {
1664         /*
1665          * The array shouldn't overflow under normal behavior, but perhaps it
1666          * could if we are given a really small maintenance_work_mem. In that
1667          * case, just forget the last few tuples (we'll get 'em next time).
1668          */
1669         if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples)
1670         {
1671                 vacrelstats->dead_tuples[vacrelstats->num_dead_tuples] = *itemptr;
1672                 vacrelstats->num_dead_tuples++;
1673         }
1674 }
1675
1676 /*
1677  *      lazy_tid_reaped() -- is a particular tid deletable?
1678  *
1679  *              This has the right signature to be an IndexBulkDeleteCallback.
1680  *
1681  *              Assumes dead_tuples array is in sorted order.
1682  */
1683 static bool
1684 lazy_tid_reaped(ItemPointer itemptr, void *state)
1685 {
1686         LVRelStats *vacrelstats = (LVRelStats *) state;
1687         ItemPointer res;
1688
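        /*
         * dead_tuples is kept in TID order (see above), so a binary search
         * suffices.
         */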
1689         res = (ItemPointer) bsearch((void *) itemptr,
1690                                                                 (void *) vacrelstats->dead_tuples,
1691                                                                 vacrelstats->num_dead_tuples,
1692                                                                 sizeof(ItemPointerData),
1693                                                                 vac_cmp_itemptr);
1694
1695         return (res != NULL);
1696 }
1697
1698 /*
1699  * Comparator for bsearch() on dead_tuples: block number, then offset number.
1700  */
1701 static int
1702 vac_cmp_itemptr(const void *left, const void *right)
1703 {
1704         BlockNumber lblk,
1705                                 rblk;
1706         OffsetNumber loff,
1707                                 roff;
1708
1709         lblk = ItemPointerGetBlockNumber((ItemPointer) left);
1710         rblk = ItemPointerGetBlockNumber((ItemPointer) right);
1711
1712         if (lblk < rblk)
1713                 return -1;
1714         if (lblk > rblk)
1715                 return 1;
1716
1717         loff = ItemPointerGetOffsetNumber((ItemPointer) left);
1718         roff = ItemPointerGetOffsetNumber((ItemPointer) right);
1719
1720         if (loff < roff)
1721                 return -1;
1722         if (loff > roff)
1723                 return 1;
1724
1725         return 0;
1726 }
1727
1728 /*
1729  * Check if every tuple in the given page is visible to all current and future
1730  * transactions. Also return the visibility_cutoff_xid which is the highest
1731  * xmin amongst the visible tuples.
1732  */
1733 static bool
1734 heap_page_is_all_visible(Relation rel, Buffer buf, TransactionId *visibility_cutoff_xid)
1735 {
1736         Page            page = BufferGetPage(buf);
1737         OffsetNumber offnum,
1738                                 maxoff;
1739         bool            all_visible = true;
1740
1741         *visibility_cutoff_xid = InvalidTransactionId;
1742
1743         /*
1744          * This is a stripped down version of the line pointer scan in
1745          * lazy_scan_heap(). So if you change anything here, also check that code.
1746          */
1747         maxoff = PageGetMaxOffsetNumber(page);
1748         for (offnum = FirstOffsetNumber;
1749                  offnum <= maxoff && all_visible;
1750                  offnum = OffsetNumberNext(offnum))
1751         {
1752                 ItemId          itemid;
1753                 HeapTupleData tuple;
1754
1755                 itemid = PageGetItemId(page, offnum);
1756
1757                 /* Unused or redirect line pointers are of no interest */
1758                 if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
1759                         continue;
1760
1761                 ItemPointerSet(&(tuple.t_self), BufferGetBlockNumber(buf), offnum);
1762
1763                 /*
1764                  * Dead line pointers can still have index entries pointing at them,
1765                  * so the page can't be treated as all-visible.
1766                  */
1767                 if (ItemIdIsDead(itemid))
1768                 {
1769                         all_visible = false;
1770                         break;
1771                 }
1772
1773                 Assert(ItemIdIsNormal(itemid));
1774
1775                 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
1776                 tuple.t_len = ItemIdGetLength(itemid);
1777                 tuple.t_tableOid = RelationGetRelid(rel);
1778
1779                 switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf))
1780                 {
1781                         case HEAPTUPLE_LIVE:
1782                                 {
1783                                         TransactionId xmin;
1784
1785                                         /* Check comments in lazy_scan_heap. */
1786                                         if (!HeapTupleHeaderXminCommitted(tuple.t_data))
1787                                         {
1788                                                 all_visible = false;
1789                                                 break;
1790                                         }
1791
1792                                         /*
1793                                          * The inserter definitely committed. But is it old enough
1794                                          * that everyone sees it as committed?
1795                                          */
1796                                         xmin = HeapTupleHeaderGetXmin(tuple.t_data);
1797                                         if (!TransactionIdPrecedes(xmin, OldestXmin))
1798                                         {
1799                                                 all_visible = false;
1800                                                 break;
1801                                         }
1802
1803                                         /* Track newest xmin on page. */
1804                                         if (TransactionIdFollows(xmin, *visibility_cutoff_xid))
1805                                                 *visibility_cutoff_xid = xmin;
1806                                 }
1807                                 break;
1808
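                        /*
                         * Dead, recently-dead, or in-progress tuples prevent the page
                         * from being considered all-visible.
                         */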
1809                         case HEAPTUPLE_DEAD:
1810                         case HEAPTUPLE_RECENTLY_DEAD:
1811                         case HEAPTUPLE_INSERT_IN_PROGRESS:
1812                         case HEAPTUPLE_DELETE_IN_PROGRESS:
1813                                 all_visible = false;
1814                                 break;
1815
1816                         default:
1817                                 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1818                                 break;
1819                 }
1820         }                                                       /* scan along page */
1821
1822         return all_visible;
1823 }